#!/usr/bin/R # ------------------------------------------------ # Estimate effect sizes for gene candidate fusions # Updated: 01/26/22 # ------------------------------------------------ library(cbrbase) set_proj('DEVTRAJ') # Libraries: library(tidyr) library(ggplot2) library(MASS) library(ggpubr) library(RColorBrewer) options(width=170) # Load in and process data (saves to matrices): # project = 'ALZ_repl' project = 'ALZ' commandArgs <- function(){ c(project)} source(paste0(bindir, 'mosaicism/load_metadata.R')) # Make directories for analysis: analysis = 'fusion' imgdir = paste0(img, project, '/', analysis , '/') imgpref = paste0(imgdir, analysis, "_mspred_", project, '_') cmd = paste0('mkdir -p ', imgdir) system(cmd) # Load in the two cohorts of candidate gene fusion counts: # -------------------------------------------------------- source(paste0(bindir, 'mosaicism/genefusions/load_fusion_data.R')) source(paste0(bindir, 'mosaicism/genefusions/auxiliary_fusion_plotting.R')) # Save the ALZ cohort: outpref = paste0(dbdir,'variants/human_smart-seq2_fusions.') filesuff = 'maxsensitivity_prediction_numbers' outfile = paste0(outpref, filesuff, '.meta.counts.tsv') if (!file.exists(outfile)){ coh.ctdf = ctdf[ctdf$cohort == 'ALZ',] # Remove extraneous variables rmcols = c('count1', 'ind', 'capped_count', 'capped_count', 'rate', 'lng.bin') for (rmcol in rmcols){ coh.ctdf[[rmcol]] = NULL } write.table(coh.ctdf, outfile, quote=F, sep="\t", row.names=F) } # Both cohorts together --> second is removed due to sequencing issues. fulldf = c() reg.var = 'count' for (only.exc in c(TRUE, FALSE)){ if (only.exc){ df = sub.ctdf[sub.ctdf$relabel == 'Excitatory',] } else { df = sub.ctdf } resdf = c() for (var in test.vars){ # Original model, with + n_counts + n_genes (kept here for record) # form.vec = c(reg.var, ' ~ ', var, count.covar, ext.covar) # Final model, with offset(log(n_counts)) as well as log(n_genes) count.covar = '+ log(n_genes) + count_per_gene' form.vec = c(reg.var, ' ~ offset(log(n_counts)) + ', var, count.covar, ext.covar) if (!only.exc){ form.vec = c(form.vec, ' + relabel') } form = asform(form.vec) if (reg.var %in% c('capped_count','count')){ # fit = glm(form, df, family='poisson') fit = glm.nb(form, df) } else if (reg.var %in% c('has_fusion')){ fit = glm(form, df, family='binomial') } else { fit = glm(form, df, family='gaussian') } cfit = processLmFit(fit) cfit$var = var resdf = rbind(resdf, cfit[2,]) } resdf = resdf[order(resdf$p),] resdf$padj = p.adjust(resdf$p, method='fdr') resdf$col = with(resdf, ifelse(padj < 0.05, ifelse(Est < 0, 'Down', 'Up'), 'NS')) resdf = resdf[order(resdf$Est),] resdf$coef = varmap[resdf$coef] resdf$coef = factor(resdf$coef, levels=resdf$coef) # Plot prefixes: regpref = paste0(imgpref, 'reg_effectsizes_core_') title = paste('Principal variables') if (only.exc){ regpref = paste0(regpref, 'exconly_') title = paste(title, '(Exc. only)') } # Plot effect sizes as error bars gp = coefErrorbarPlot(resdf, title=title) saveGGplot(gp, paste0(regpref, 'errorbar'), w=5, h=3) resdf$cohort = 'Joint' resdf$set = ifelse(only.exc, 'Exc. neurons', 'All cells') fulldf = rbind(fulldf, resdf) } # Evaluate regressions in each cohort separately: # ----------------------------------------------- coh.covar = '+ pmi_rescaled + msex + age_rescaled' for (cellset in c('All', 'Exc', 'noUD')){ if (cellset == 'Exc'){ only.exc = TRUE } else { only.exc = FALSE } for (cohort in c('ALZ','ALZ_repl')){ coh.ctdf = sub.ctdf[sub.ctdf$cohort == cohort,] if (cohort == 'ALZ'){ count.covar = '+ n_counts + n_genes + count_per_gene' } else { count.covar = '+ platform * n_counts + platform * n_genes + platform * count_per_gene' } if (only.exc){ df = coh.ctdf[coh.ctdf$relabel == 'Excitatory',] } else if (cellset == 'noUD'){ df = coh.ctdf[coh.ctdf$relabel != 'Undefined/Dying',] } else { df = coh.ctdf } resdf = c() for (var in test.vars){ # Original model, with + n_counts + n_genes (kept here for record) # count.covar = '+ n_counts + n_genes + count_per_gene' # form.vec = c(reg.var, ' ~ ', var, count.covar, coh.covar) # Final model, with offset(log(n_counts)) as well as log(n_genes) count.covar = '+ log(n_genes) + count_per_gene' form.vec = c(reg.var, ' ~ offset(log(n_counts)) + ', var, count.covar, coh.covar) if (!only.exc){ form.vec = c(form.vec, ' + relabel') } form = asform(c(form.vec)) if (reg.var %in% c('capped_count','count')){ # fit = glm(form, df, family='poisson') # Original model fit = glm.nb(form, df) } else if (reg.var %in% c('has_fusion')){ fit = glm(form, df, family='binomial') } else { fit = glm(form, df, family='gaussian') } cfit = processLmFit(fit) cfit$var = var resdf = rbind(resdf, cfit[2,]) } resdf = resdf[order(resdf$p),] resdf$padj = p.adjust(resdf$p, method='fdr') resdf$col = with(resdf, ifelse(padj < 0.05, ifelse(Est < 0, 'Down', 'Up'), 'NS')) resdf = resdf[order(resdf$Est),] resdf$coef = varmap[resdf$coef] resdf$coef = factor(resdf$coef, levels=resdf$coef) sigdf = resdf[resdf$padj < 0.05,] regpref = paste0(imgpref, 'reg_effectsizes_cohort_', cohort, '_') title = paste('Cohort:', cohort) if (only.exc){ regpref = paste0(regpref, 'exconly_') title = paste(title, '(Exc. only)') } # Plot effect sizes as error bars gp = coefErrorbarPlot(resdf, title=title) saveGGplot(gp, paste0(regpref, 'errorbar'), w=5, h=3) resdf$cohort = cohort resdf$set = ifelse(cellset == 'Exc', 'Exc. neurons', ifelse(cellset == 'All', 'All cells', 'No Undefined cells')) fulldf = rbind(fulldf, resdf) } } # Plot all of these effects jointly: # ---------------------------------- labdf = fulldf[fulldf$padj < 0.05,] labdf$lab = paste0('p=', sprintf('%0.1e', labdf$padj)) gp = ggplot(fulldf, aes(coef, Est, ymin=Est-2*SE, ymax=Est+2*SE, label=p, color=factor(col))) + facet_grid(set ~ cohort) + geom_point() + geom_errorbar(width=.2) + geom_hline(yintercept=0, lty='dotted') + geom_text(data=labdf, aes(coef, y=0.4, label=lab), color='black') + scale_color_manual(values=devals, name='Significance:') + theme_pubr() + coord_flip() + labs(x='Condition', y='Estimated effect size') regpref = paste0(imgpref, 'reg_effectsizes_bycohort_results_') saveGGplot(gp, paste0(regpref, 'errorbar'), w=10, h=4.5) gp = ggplot(fulldf[fulldf$cohort == 'ALZ',], aes(coef, Est, ymin=Est-2*SE, ymax=Est+2*SE, label=p, color=factor(col))) + facet_grid(. ~ set) + geom_point() + geom_errorbar(width=.2) + geom_hline(yintercept=0, lty='dotted') + geom_text(data=labdf[labdf$cohort == 'ALZ',], aes(coef, y=0, label=lab), color='black') + scale_color_manual(values=devals, name='Significance:') + theme_pubr() + coord_flip() + labs(x='Condition', y='Estimated effect size') regpref = paste0(imgpref, 'reg_effectsizes_originalcohort_results_') saveGGplot(gp, paste0(regpref, 'errorbar'), w=6, h=2.5) gp = ggplot(fulldf[fulldf$cohort == 'Joint',], aes(coef, Est, ymin=Est-2*SE, ymax=Est+2*SE, label=p, color=factor(col))) + facet_grid(. ~ set) + geom_point() + geom_errorbar(width=.2) + geom_hline(yintercept=0, lty='dotted') + scale_color_manual(values=devals, name='Significance:') + theme_pubr() + coord_flip() + labs(x='Condition', y='Estimated effect size') regpref = paste0(imgpref, 'reg_effectsizes_joint_results_') saveGGplot(gp, paste0(regpref, 'errorbar'), w=5, h=2.5) # Regression estimates for everything in the metadata: # ---------------------------------------------------- # Load in the full metadata: ext.meta = read.delim('Annotation/metadata_PFC_all_individuals_092520.tsv', header=T) cn = colnames(ext.meta) cn = cn[!(cn %in% c('projid','scaled_to', 'agelast','agefirst'))] rownames(ext.meta) = as.character(ext.meta$projid) ext.ctdf = merge(ctdf[,c('projid', 'relabel','n_counts', 'percent_mito','percent_ribo','count_per_gene', 'n_genes','count','Apoe_e4','cohort','platform')], ext.meta) # Standardize variables: for (var in cn){ if (is.numeric(ext.ctdf[[var]])){ varz = paste0(var, '_z') ext.ctdf[[varz]] = scale(ext.ctdf[[var]]) } else { varz = var } } sub.ext.ctdf = ext.ctdf[ext.ctdf$n_counts >= countcutoff,] sub.ext.ctdf = sub.ext.ctdf[sub.ext.ctdf$cohort == 'ALZ',] # Restrict to used cohort # count.covar = '+ n_counts + n_genes' # count.covar = '+ platform * n_counts + platform* n_genes' # orig count.covar = '+ log(n_genes) + count_per_gene' # improved model # Run each of the regressions: only.exc = FALSE use.z = TRUE for (only.exc in c(TRUE, FALSE)){ for (use.z in c(TRUE, FALSE)){ if (only.exc){ df = sub.ext.ctdf[sub.ext.ctdf$relabel == 'Excitatory',] } else { df = sub.ext.ctdf } resdf = c() for (var in cn){ # Standardized variable: x = df[[var]] x = x[!is.na(x)] if (length(unique(x)) > 1){ if (is.numeric(ext.ctdf[[var]])){ varz = paste0(var, '_z') } else { varz = var } # n_counts better models than log(n_counts) if (use.z){ form.vec = c(reg.var, ' ~',varz) } else { form.vec = c(reg.var, ' ~',var) } # form.vec = c(form.vec, count.covar, ext.covar) form.vec = c(form.vec, ' + offset(log(n_counts)) ', count.covar, ext.covar) if (!only.exc){ form.vec = c(form.vec, '+ relabel') } rm(fit) form = asform(c(form.vec)) if (reg.var %in% c('capped_count','count')){ # fit = glm(form, df, family='poisson') # Original model fit = glm.nb(form, df) } else if (reg.var %in% c('has_fusion')){ fit = glm(form, df, family='binomial') } else { fit = glm(form, df, family='gaussian') } # fit = try(lm(asform(form.vec), df)) if (class(fit)[1] != 'try-error'){ cfit = processLmFit(fit) cfit$var = var cat(var, '\t', rownames(cfit)[2], '\n') resdf = rbind(resdf, cfit[2,]) } } } resdf = resdf[order(resdf$p),] resdf$padj = p.adjust(resdf$p, method='fdr') resdf$col = with(resdf, ifelse(padj < 0.05, ifelse(Est < 0, 'Down', 'Up'), 'NS')) resdf = resdf[order(resdf$Est),] resdf$coef = factor(resdf$coef, levels=resdf$coef) sigdf = resdf[resdf$padj < 0.05,] regpref = paste0(imgpref, 'reg_effectsizes_full_') title = paste('All variables') if (only.exc){ regpref = paste0(regpref, 'exconly_') title = paste(title, '(Exc. only)') } if (use.z){ regpref = paste0(regpref, 'zscoredvars_') title = paste(title, 'z-scored') } # Plot effect sizes as error bars gp = coefErrorbarPlot(resdf, title=title) saveGGplot(gp, paste0(regpref, 'errorbar'), w=8, h=15) # Plot effect sizes as error bars (significant only) gp = coefErrorbarPlot(sigdf, title=title) saveGGplot(gp, paste0(regpref, 'errorbar_sig'), w=8, h=10) } } # Test in each subset of fusion types separately: # ----------------------------------------------- # fs.subset = 'CLOSE' fulldf = c() for (fs.subset in c('INTRA','INTER','CLOSE')){ only.exc = FALSE if (fs.subset == 'CLOSE'){ df = fsdf[(fsdf$fs.set == 'INTRA') & (fsdf$Distance < 5e6),] } else { df = fsdf[fsdf$fs.set == fs.subset,] } nevents = nrow(df) fs.ctdf = agg.rename(annots ~ plate + sample, df, length, 'subset.count') fs.ctdf = merge(sub.ctdf, fs.ctdf, all.x=TRUE) fs.ctdf$subset.count[is.na(fs.ctdf$subset.count)] = 0 fs.ctdf = fs.ctdf[fs.ctdf$cohort == 'ALZ',] # Restrict to used cohort resdf = c() if (only.exc){ df = fs.ctdf[fs.ctdf$relabel == 'Excitatory',] } else { df = fs.ctdf } # count.covar = '+ platform * n_counts + platform* n_genes + platform * count_per_gene' # Orig count.covar = ' + log(n_genes) + count_per_gene' for (var in test.vars){ # form.vec = c('subset.count ~ ', var, count.covar, ext.covar) form.vec = c('subset.count ~ offset(log(n_counts)) + ', var, count.covar, ext.covar) if (!only.exc){ form.vec = c(form.vec, ' + relabel') } form = asform(c(form.vec)) if (reg.var %in% c('capped_count','count')){ # fit = glm(form, df, family='poisson') # Original model fit = glm.nb(form, df) } else if (reg.var %in% c('has_fusion')){ fit = glm(form, df, family='binomial') } else { fit = glm(form, df, family='gaussian') } cfit = processLmFit(fit) cfit$var = var resdf = rbind(resdf, cfit[2,]) } resdf = resdf[order(resdf$p),] resdf$padj = p.adjust(resdf$p, method='fdr') resdf$col = with(resdf, ifelse(padj < 0.05, ifelse(Est < 0, 'Down', 'Up'), 'NS')) resdf$coef = varmap[resdf$coef] resdf$coef = factor(resdf$coef, levels=resdf$coef) resdf$nevents = nevents # Plot prefixes: regpref = paste0(imgpref, 'reg_effectsizes_core_type_', fs.subset, '_') title = paste(fs.subset, 'fusions') if (only.exc){ regpref = paste0(regpref, 'exconly_') title = paste(title, '(Exc. only)') } # Plot effect sizes as error bars gp = coefErrorbarPlot(resdf, title=title) saveGGplot(gp, paste0(regpref, 'errorbar'), w=5, h=2.5) resdf$subset = fs.subset fulldf = rbind(fulldf, resdf) } # Plot these fusion statistics by subset, jointly: # ------------------------------------------------ fulldf$set = paste0(fulldf$subset, ' (', fulldf$nevents, ' hits)') labdf = fulldf[fulldf$padj < 0.05,] labdf$lab = paste0('p=', sprintf('%0.1e', labdf$padj)) gp = ggplot(fulldf, aes(coef, Est, ymin=Est-2*SE, ymax=Est+2*SE, label=p, color=factor(col))) + facet_grid(. ~ set) + geom_point() + geom_errorbar(width=.2) + geom_hline(yintercept=0, lty='dotted') + geom_text(data=labdf, aes(coef, y=0, label=lab), color='black') + scale_color_manual(values=devals, name='Significance:') + theme_pubr() + coord_flip() + labs(x='Condition', y='Estimated effect size') regpref = paste0(imgpref, 'reg_effectsizes_bysubtype_results_') saveGGplot(gp, paste0(regpref, 'errorbar'), w=9, h=2.5) gp = ggplot(fulldf[fulldf$cohort == 'Joint',], aes(coef, Est, ymin=Est-2*SE, ymax=Est+2*SE, label=p, color=factor(col))) + facet_grid(. ~ set) + geom_point() + geom_errorbar(width=.2) + geom_hline(yintercept=0, lty='dotted') + scale_color_manual(values=devals, name='Significance:') + theme_pubr() + coord_flip() + labs(x='Condition', y='Estimated effect size') regpref = paste0(imgpref, 'reg_effectsizes_joint_results_') saveGGplot(gp, paste0(regpref, 'errorbar'), w=5, h=2.5)