#!/usr/bin/R # ------------------------------------------------ # Estimate effect sizes for gene candidate fusions, # stratifying by specific GO terms + cell types # TODO: update to use NB # Updated: 08/07/22 # ------------------------------------------------ library(cbrbase) set_proj('DEVTRAJ') # Libraries: library(tidyr) library(MASS) library(ggplot2) library(ggpubr) library(ggrepel) library(ggbeeswarm) library(scales) library(RColorBrewer) library(Matrix) options(width=170) # Load in the two cohorts of candidate gene fusion counts: # -------------------------------------------------------- gbdir = paste0(bindir, 'mosaicism/genefusions/') source(paste0(gbdir, 'load_fusion_data.R')) source(paste0(gbdir, 'auxiliary_fusion_plotting.R')) source(paste0(gbdir, 'load_go_signatures.R')) source(paste0(gbdir, 'load_joint_expression.R')) source(paste0(bindir, 'multiRegion/auxiliary_plotting_settings.R')) # Make directories for analysis: # ------------------------------ analysis = 'fusion' project = 'ALZ' imgdir = paste0(img, project, '/', analysis , '/') imgpref = paste0(imgdir, analysis, "_mspred_strat_", project, '_') cmd = paste0('mkdir -p ', imgdir) system(cmd) # Pull out relevant annotations: # ------------------------------ filepref = paste0(dbdir, 'joint_cohort_annotations_scores') enr.rds = paste0(filepref, '_broad.Rds') fullenr.rds = paste0(filepref, '_specific.Rds') enrdf = readRDS(enr.rds) full.enrdf = readRDS(fullenr.rds) # Regressions taking into account GO pathway status (esp. cohesin). # ----------------------------------------------------------------- kwds = names(siglist) kwdstrs = gsub("[- +&]", "_", kwds) for (kwd in kwds){ kwdstr = gsub("[- +&]", "_", kwd) subdf = enrdf[enrdf$tag == kwd, ] enrmap = subdf$enr names(enrmap) = subdf$sample enr = scale(enrmap[sub.ctdf$sample]) sub.ctdf[[kwdstr]] = ifelse(enr > 1, 1, 0) } names(kwds) = kwdstrs varmap = c(varmap, kwds) fulldf = c() reg.var = 'count' celltypes = unique(sub.ctdf$relabel) for (label in celltypes){ for (cohort in c('ALZ','ALZ_repl', 'Joint')){ if (cohort == 'Joint'){ coh.ctdf = sub.ctdf } else { coh.ctdf = sub.ctdf[sub.ctdf$cohort == cohort,] } if (length(unique(coh.ctdf$platform)) > 1){ count.covar = '+ platform * (log(n_genes) + count_per_gene)' } else { count.covar = '+ log(n_genes) + count_per_gene' } df = coh.ctdf[coh.ctdf$relabel == label,] resdf = c() for (var in c(test.vars, kwdstrs)){ form.vec = c(reg.var, '~ offset(log(n_counts)) + ', var) form.vec = c(form.vec, count.covar, ext.covar) form = asform(form.vec) if (reg.var %in% c('capped_count','count')){ # fit = glm(form, df, family='poisson') fit = glm.nb(form, df) } else if (reg.var %in% c('has_fusion')){ fit = glm(form, df, family='binomial') } else { fit = glm(form, df, family='gaussian') } cfit = processLmFit(fit) cfit$var = var resdf = rbind(resdf, cfit[grep(var, cfit$coef),]) } resdf = resdf[order(resdf$p),] resdf$padj = p.adjust(resdf$p, method='fdr') resdf$col = with(resdf, ifelse(padj < 0.05, ifelse(Est < 0, 'Down', 'Up'), 'NS')) resdf = resdf[order(resdf$Est),] resdf$coef1 = resdf$coef resdf$coef = varmap[resdf$coef] resdf$coef = factor(resdf$coef, levels=resdf$coef) resdf$cohort = cohort resdf$relabel = label fulldf = rbind(fulldf, resdf) } } resdf = resdf[order(resdf$p),] fulldf = fulldf[order(fulldf$p),] # Plot effects for the original cohort only: # ------------------------------------------ subkwdstrs = names(kwds[kwds %in% subkwds]) kwddf = fulldf[fulldf$coef1 %in% subkwdstrs,] aggdf = aggregate(Est ~ coef1, kwddf, mean) aggdf = aggdf[order(aggdf$Est),] kwddf$coef1 = factor(kwddf$coef1, levels=aggdf$coef1) sigkwds = subkwdstrs[5:length(subkwdstrs)] # Keep only GO sigs sigkwddf = kwddf[(kwddf$cohort == 'ALZ') & (kwddf$coef1 %in% sigkwds),] labdf = sigkwddf[sigkwddf$padj < 0.05,] labdf$lab = paste0('p=', sprintf('%0.1e', labdf$padj)) gp = ggplot(sigkwddf, aes(coef1, Est, ymin=Est-2*SE, ymax=Est+2*SE, label=p, color=factor(col))) + facet_grid(. ~ relabel) + geom_hline(yintercept=0, lty='dotted') + geom_point() + geom_errorbar(width=.2) + scale_color_manual(values=devals, name='Significance:') + geom_text(data=labdf, aes(coef1, y=0.4, label=lab), color='black') + theme_pubr() + coord_flip() + labs(x='Condition', y='Estimated effect size') regpref = paste0(imgpref, 'reg_effectsizes_expression_sigs_celltypes_only_originalcohort_') saveGGplot(gp, paste0(regpref, 'errorbar'), w=10, h=2.5) # Plot as a heatmap: # ------------------ # Save as heatmap: cmat = pivot.tomatrix(sigkwddf[,c('Est','coef','relabel')], 'relabel','Est') pmat = pivot.tomatrix(sigkwddf[,c('padj','coef','relabel')], 'relabel','padj') pmat[is.na(pmat)] = 1 mx = 1 mat.col_fun = colorRamp2(c(-mx, 0, mx), c("blue", "white", "red")) ux = 1.5 ht = Heatmap(cmat, name='Est', col=mat.col_fun, width = ncol(cmat)*unit(ux, "mm"), height = nrow(cmat)*unit(ux, "mm"), border_gp=gpar(color='black', lwd=.5), cell_fun = function(j, i, x, y, w, h, col){ if (pmat[i,j] < 0.05){ grid.text('*', x, y, gp=gpar(fontsize=gridtxt.fs)) } } ) h = 2 + 1 / 15 * nrow(cmat) w = 2 + 1 / 15 * ncol(cmat) pltpref = paste0(imgpref, 'reg_effectsizes_expression_sigs_CellType_heatmap') saveHeatmap(ht, pltpref, h=h, w=w)