#!/usr/bin/R # ----------------------------------- # Simple cross-cohort analysis # Updated: 01/26/22 # ----------------------------------- library(cbrbase) set_proj('DEVTRAJ') # Libraries: library(tidyr) library(ggplot2) library(ggpubr) library(RColorBrewer) # Load in and process data (saves to matrices): # project = 'ALZ_repl' project = 'ALZ' commandArgs <- function(){ c(project)} source(paste0(bindir, 'mosaicism/load_metadata.R')) # Make directories for analysis: analysis = 'fusion' imgdir = paste0(img, project, '/', analysis , '/') imgpref = paste0(imgdir, analysis, "_mspred_", project, '_') cmd = paste0('mkdir -p ', imgdir) system(cmd) # Load in the two cohorts of candidate gene fusion counts: # -------------------------------------------------------- fs1dir = paste0(dbdir,'variants/', 'ALZ', '/fusion/') fs2dir = paste0(dbdir,'variants/', 'ALZ_repl', '/fusion/') filesuff = 'maxsensitivity_prediction_numbers_nochrM' o1file = paste0(fs1dir, 'ALZ', '_', filesuff, '_wmeta.tsv') o2file = paste0(fs2dir, 'ALZ_repl', '_', filesuff, '_wmeta.tsv') ct1df = read.delim(o1file, header=T) ct2df = read.delim(o2file, header=T) ct1df$cohort = 'ALZ' ct2df$cohort = 'ALZ_repl' ctdf = rbind(ct1df, ct2df) # NOTE: Signal is there on 10k + full, but PMI effect is larger sub.ctdf = ctdf[ctdf$n_counts >= 25000,] # Run regressions to estimate the effect of AD / non-AD: # ------------------------------------------------------ processLmFit = function(fit){ cfit = coefficients(summary(fit)) cfit = data.frame(cfit) names(cfit) = c('Est','SE','t','p') cfit$coef = rownames(cfit) return(cfit) } advars = c('niareagansc','nrad','braaksc','bd', 'bd2','cogdx.ad') covars = c('age_death','msex','pmi', 'Apoe_e4') test.vars= c('cogdx.ad','bd','bd2','nrad', covars) resdf = c() ext.covar = '+ pmi + msex + Apoe_e4 + age_death + cohort' only.exc = TRUE # only.exc = FALSE for (var in test.vars){ # n_counts better models than log(n_counts) form = asform(c('count ~ ', var, '+ n_counts + relabel', ext.covar)) form.ct = asform(c('count ~ ', var, '+ n_counts ', ext.covar)) if (only.exc){ fit = lm(form.ct, sub.ctdf[sub.ctdf$relabel == 'Excitatory',]) } else { fit = lm(form, sub.ctdf) } cfit = processLmFit(fit) cfit$var = var resdf = rbind(resdf, cfit[2,]) } resdf = resdf[order(resdf$p),] resdf$col = with(resdf, ifelse(p < 0.05, ifelse(Est < 0, 1, 2), 0)) resdf = resdf[order(resdf$Est),] resdf$coef = factor(resdf$coef, levels=resdf$coef) pcols = brewer.pal(12, 'Paired') devals = c('0'='grey80', '1'=pcols[2], '2'=pcols[6]) gp = ggplot(resdf, aes(coef, Est, label=p, fill=factor(col))) + geom_bar(stat='identity') + scale_fill_manual(values=devals) + theme_pubr() gp = ggplot(resdf, aes(coef, Est, ymin=Est-2*SE, ymax=Est+2*SE, label=p, color=factor(col))) + geom_point() + geom_errorbar() + geom_hline(yintercept=0, lty='dotted') + scale_color_manual(values=devals) + theme_pubr() + coord_flip() + theme(legend.position='none') # Plots of candidate counts: # -------------------------- # TODO: plots split by cohort # Plot the number of candidates vs. the number of reads: gp = ggplot(sub.ctdf, aes(n_counts, count, color=cogdx.ad)) + facet_wrap(~relabel, scale='free') + geom_point() + geom_smooth(method='lm') + stat_cor(color='black', label.y.npc=1) + stat_cor(label.y.npc=0.8) + scale_x_log10() + scale_color_manual(values=colvals[['cogdx.ad']], name='AD Cognition') + theme_pubr() # Plot the number of candidates vs. the number of reads: gp = ggplot(sub.ctdf, aes(n_counts, count, color=relabel)) + geom_point() + geom_smooth(method='lm') + stat_cor(color='black', label.y.npc=1) + stat_cor(label.y.npc=0.8) + scale_x_log10() + theme_pubr() # Plot the number of candidates vs. the number of reads (with BRAAK: gp = ggplot(sub.ctdf, aes(n_counts, count, color=bd)) + facet_wrap(~relabel, scale='free') + geom_point() + geom_smooth(method='lm') + stat_cor(color='black', label.y.npc=1) + stat_cor(label.y.npc=0.8) + scale_x_log10() + scale_color_manual(values=c('royalblue','indianred'), name='Braak Stage') + theme_pubr() # Plot the number of candidates vs. the read count bins: gp = ggplot(ctdf, aes(lng.bin, count, fill=cogdx.ad)) + facet_wrap(~relabel, scale='free') + geom_boxplot() + stat_compare_means() + scale_fill_manual(values=colvals[['cogdx.ad']], name='AD Cognition') + theme_pubr() # Also plot with normalized count / n_counts: gp = ggplot(ctdf, aes(lng.bin, count / n_counts, fill=cogdx.ad)) + facet_wrap(~relabel, scale='free') + geom_boxplot() + stat_compare_means() + scale_fill_manual(values=colvals[['cogdx.ad']], name='AD Cognition') + theme_pubr() # NOTE: the high effect size of PMI for these may be due to increased MT / RP fusions (artifactual, with paralogs/homologs) --> these are filtered out in the filtering steps. --> YES. FIXED.