#!/usr/bin/R # ---------------------------------------------- # Join the fusions with the metadata: # Updated: 01/26/22 # ---------------------------------------------- library(cbrbase) set_proj('DEVTRAJ') # Libraries: library(tidyr) library(ggplot2) library(ggpubr) # Load in and process data (saves to matrices): project = 'ALZ' # project = 'ALZ_repl' commandArgs <- function(){ c(project)} source(paste0(bindir, 'mosaicism/load_metadata.R')) use.prelim = FALSE # Make directories for analysis: analysis = 'fusion' imgdir = paste0(img, project, '/', analysis , '/') fsdir = paste0(dbdir,'variants/', project, '/fusion/') if (use.prelim){ imgpref = paste0(imgdir, analysis, "_prelimcts_", project, '_') } else { imgpref = paste0(imgdir, analysis, "_mspred_", project, '_') } cmd = paste0('mkdir -p ', imgdir) system(cmd) # Update the metadata for merging with the fusions: # ------------------------------------------------- meta = merge(meta, ext.meta[,c('projid','niareagansc', 'pmi')]) meta$bd = '1-4' meta$bd[meta$braaksc %in% c('5','6')] = '5-6' meta$bd2 = NA meta$bd2[meta$braaksc %in% c('0','1','2')] = 'CTRL' meta$bd2[meta$braaksc %in% c('5','6')] = 'AD' meta$nrad = 'CTRL' meta$nrad[meta$niareagansc %in% c('1','2')] = 'AD' meta$Apoe_e4 = ifelse(meta$apoe_genotype %in% c('24','44','34'), 'Yes','No') if (project == 'ALZ') { meta$plate = meta$projid } # For plot against quality, bin the number of genes: lng = log(meta$n_counts) bks = seq(min(lng), max(lng), length.out=10) meta$lng.bin = cut(lng, bks, include.lowest=T) # Variable sets: advars = c('niareagansc','nrad','braaksc','bd', 'bd2','cogdx.ad') covars = c('age_death','msex','pmi', 'Apoe_e4') cellvars = c('percent_mito','percent_ribo','count_per_gene', 'n_counts','n_genes', 'lng.bin', 'relabel') # Read in data of preliminary or predicted fusion counts: # ------------------------------------------------------- if (use.prelim){ filesuff = 'prelim_fusion_cts' ctdf = read.delim(paste0(fsdir,project, '_', filesuff, '.tsv'), header=F) names(ctdf) = c('count','plate','sample') } else { filesuff = 'maxsensitivity_prediction_numbers' fssuff = 'maxsensitivity_abridged_fusions_filtered' # Load list of all measured cells: ctdf = read.delim(paste0(fsdir,project, '_', filesuff, '.tsv'), header=F) names(ctdf) = c('count','plate','sample') ctdf$count = NULL # Reduce the filtered fragments and join fsdf = read.delim(paste0(fsdir,project, '_', fssuff, '.tsv'), header=T) colnames(fsdf)[colnames(fsdf) == 'projid'] = 'plate' # Aggregate raw counts: fsctdf = agg.rename(annots ~ plate + sample, fsdf, length, 'count') ctdf = merge(ctdf, fsctdf, all.x=TRUE) ctdf$count[is.na(ctdf$count)] = 0 } # Merge the fusion counts with the metadata: # ------------------------------------------ mcols = c('plate','sample', advars, covars, cellvars) ctdf = merge(ctdf, unique(meta[,mcols])) ctdf$count1 = 1 * (ctdf$count > 0) ctdf$ind = 1 outfile = paste0(fsdir, project, '_', filesuff, '_wmeta.tsv') write.table(ctdf, outfile, quote=F, sep='\t', row.names=F) # NOTE: Not as reliable by counts under this threshold, regr may be ok sub.ctdf = ctdf[ctdf$n_counts >= 25000,] # Simple counts: # -------------- twide = aggregate(cbind(count, ind) ~ cogdx.ad, ctdf, sum) # twide = aggregate(cbind(count, ind) ~ bd, ctdf, sum) # twide = aggregate(cbind(count, ind) ~ bd2, ctdf, sum) # twide = aggregate(cbind(count, ind) ~ nrad, ctdf, sum) tab = as.matrix(twide[,-1]) fisher.test(tab) # Plot the number of candidates vs. the number of reads: gp = ggplot(sub.ctdf, aes(n_counts, count, color=cogdx.ad)) + facet_wrap(~relabel, scale='free') + geom_point() + geom_smooth(method='lm') + stat_cor(color='black', label.y.npc=1) + stat_cor(label.y.npc=0.8) + scale_x_log10() + scale_color_manual(values=colvals[['cogdx.ad']], name='AD Cognition') + theme_pubr() # Plot the number of candidates vs. the number of reads: gp = ggplot(sub.ctdf, aes(n_counts, count, color=relabel)) + geom_point() + geom_smooth(method='lm') + stat_cor(color='black', label.y.npc=1) + stat_cor(label.y.npc=0.8) + scale_x_log10() + theme_pubr() # Plot the number of candidates vs. the number of reads (with BRAAK: gp = ggplot(sub.ctdf, aes(n_counts, count, color=bd)) + facet_wrap(~relabel, scale='free') + geom_point() + geom_smooth(method='lm') + stat_cor(color='black', label.y.npc=1) + stat_cor(label.y.npc=0.8) + scale_x_log10() + scale_color_manual(values=c('royalblue','indianred'), name='Braak Stage') + theme_pubr() # Plot the number of candidates vs. the read count bins: gp = ggplot(ctdf, aes(lng.bin, count, fill=cogdx.ad)) + facet_wrap(~relabel, scale='free') + geom_boxplot() + stat_compare_means() + scale_fill_manual(values=colvals[['cogdx.ad']], name='AD Cognition') + theme_pubr() # Also plot with normalized count / n_counts: gp = ggplot(ctdf, aes(lng.bin, count / n_counts, fill=cogdx.ad)) + facet_wrap(~relabel, scale='free') + geom_boxplot() + stat_compare_means() + scale_fill_manual(values=colvals[['cogdx.ad']], name='AD Cognition') + theme_pubr() # Run regressions to estimate the effect of AD / non-AD: # ------------------------------------------------------ processLmFit = function(fit){ cfit = coefficients(summary(fit)) cfit = data.frame(cfit) names(cfit) = c('Est','SE','t','p') cfit$coef = rownames(cfit) return(cfit) } test.vars= c('cogdx.ad','bd','bd2','nrad', covars, cellvars) resdf = c() ext.covar = '+ pmi + msex + Apoe_e4 + age_death + percent_mito + percent_ribo + count_per_gene' for (var in test.vars){ # n_counts better models than log(n_counts) form = asform(c('count ~ ', var, '+ n_counts + relabel', ext.covar)) form.ct = asform(c('count ~ ', var, '+ n_counts ', ext.covar)) fit = lm(form, sub.ctdf) # fit = lm(form.ct, sub.ctdf[sub.ctdf$relabel == 'Excitatory',]) cfit = processLmFit(fit) cfit$var = var resdf = rbind(resdf, cfit[2,]) } resdf = resdf[order(resdf$p),] # NOTE: the high effect size of PMI for these may be due to increased MT / RP fusions (artifactual, with paralogs/homologs) --> these are filtered out in the filtering steps. --> YES. FIXED.