#!/usr/bin/R # ------------------------------------------------ # Estimate effect sizes for gene candidate fusions, # stratifying by specific GO terms # Updated: 02/21/23 to update regression model # ------------------------------------------------ library(cbrbase) set_proj('DEVTRAJ') # Libraries: library(tidyr) library(MASS) library(ggplot2) library(ggpubr) library(ggrepel) library(ggbeeswarm) library(scales) library(RColorBrewer) library(Matrix) options(width=170) # Load in the two cohorts of candidate gene fusion counts: # -------------------------------------------------------- gbdir = paste0(bindir, 'mosaicism/genefusions/') source(paste0(gbdir, 'load_fusion_data.R')) source(paste0(gbdir, 'auxiliary_fusion_plotting.R')) source(paste0(gbdir, 'load_go_signatures.R')) source(paste0(gbdir, 'load_joint_expression.R')) source(paste0(bindir, 'multiRegion/auxiliary_plotting_settings.R')) f2df = merge(fsdf, unique(meta[,c('sample','relabel')]), all.x=TRUE) f2df = f2df[f2df$cohort == 'ALZ',] write.table(f2df, 'ALZ_only_celltype_fusions_maxsensitivity.tsv', quote=F, sep="\t", row.names=TRUE) # Make directories for analysis: # ------------------------------ analysis = 'fusion' project = 'ALZ' imgdir = paste0(img, project, '/', analysis , '/') imgpref = paste0(imgdir, analysis, "_mspred_strat_", project, '_') cmd = paste0('mkdir -p ', imgdir) system(cmd) # Pull out relevant annotations: # ------------------------------ # TODO: fullmat vs. fullmati (intronic counts) marg = colSums(fullmat) margi = colSums(fullmati) rel.pct = margi / (margi + marg) meta$rel.int = rel.pct[as.character(meta$sample)] # Relative intronic kwds = names(siglist) filepref = paste0(dbdir, 'joint_cohort_annotations_scores') enr.rds = paste0(filepref, '_broad.Rds') enr.tsv = paste0(filepref, '_broad.tsv.gz') fullenr.rds = paste0(filepref, '_specific.Rds') if (!file.exists(enr.rds)){ enrdf = c() full.enrdf = c() for (kwd in kwds){ genes = siglist[[kwd]] ensgs = anno$ENSG[anno$symbol %in% genes] ensgs = ensgs[ensgs %in% rownames(fullmat)] # Overall score for all genes involved: ngenes = length(ensgs) cs = log1p(10000.0 * colSums(fullmat[ensgs,]) / marg) enrdf = rbind(enrdf, data.frame(sample=names(cs), enr=cs, tag=kwd, ng=ngenes)) if (kwd == 'double-strand break repair'){ lset = full.siglist[[kwd]] for (term in names(lset)){ ensgs = anno$ENSG[anno$symbol %in% lset[[term]]] ensgs = ensgs[ensgs %in% rownames(fullmat)] ngenes = length(ensgs) cs = log1p(10000.0 * colSums(fullmat[ensgs,]) / marg) full.enrdf = rbind(full.enrdf, data.frame(sample=names(cs), enr=cs, tag=kwd, term=term)) } } } enrdf = merge(enrdf, meta) full.enrdf = merge(full.enrdf, meta) # Save terms: saveRDS(enrdf, file=enr.rds) write.table(enrdf, gzfile(enr.tsv), quote=F, row.names=F, sep="\t") saveRDS(full.enrdf, file=fullenr.rds) } else { enrdf = readRDS(enr.rds) full.enrdf = readRDS(fullenr.rds) } # Plot relative enrichment of different subtypes: # ----------------------------------------------- gplot = ggplot(enrdf, aes(relabel, enr, color=relabel, fill=relabel)) + facet_wrap(~tag, nrow=2, scale='free_y') + scale_color_manual(values=relabel.col) + scale_fill_manual(values=relabel.col) + geom_violin(fill=NA) + geom_quasirandom(varwidth=TRUE, size=.3) + labs(x=paste('Sub-celltype'), y='Norm. signature expression, log(X+1)') + theme_pubr() + theme(legend.position='none') + theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust=.5)) pltpref = paste0(imgpref, 'signatures_terms_expressionlevels_violin') ggsave(paste0(pltpref, '.png'), gplot, dpi=450, units='in', width=11, height=6) ggsave(paste0(pltpref, '.pdf'), gplot, dpi=450, units='in', width=11, height=6) # Merge at the individual level and plot individual-level boxplots: # ----------------------------------------------------------------- enrdf$short.redsc.coh = paste0(enrdf$cohort, '-', enrdf$short.redsc) aggdf = aggregate(enr ~ projid + short.redsc + short.redsc.coh + relabel + tag, enrdf, mean) gplot = ggplot(aggdf, aes(short.redsc.coh, enr, fill=relabel)) + facet_wrap(~tag, nrow=3, scale='free_y') + scale_fill_manual(values=relabel.col) + geom_boxplot(outlier.shape=NA, alpha=1, lwd=.5) + geom_jitter(position=position_jitterdodge(jitter.width=.15, dodge.width=.75), cex=.25) + labs(x=paste('Sub-celltype'), y='Norm. signature expression, log(X+1)') + theme_pubr() + theme(legend.position='none') + theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust=.5)) pltpref = paste0(imgpref, 'signatures_terms_expressionlevels_indiv_violin') ggsave(paste0(pltpref, '.png'), gplot, dpi=450, units='in', width=16, height=10) ggsave(paste0(pltpref, '.pdf'), gplot, dpi=450, units='in', width=16, height=10) # Regressions taking into account GO pathway status (esp. cohesin). # ----------------------------------------------------------------- kwdstrs = gsub("[- +&]", "_", kwds) for (kwd in kwds){ kwdstr = gsub("[- +&]", "_", kwd) subdf = enrdf[enrdf$tag == kwd, ] enrmap = subdf$enr names(enrmap) = subdf$sample enr = scale(enrmap[sub.ctdf$sample]) sub.ctdf[[kwdstr]] = ifelse(enr > 1, 1, 0) } sub.ctdf$rel.int = rel.pct[sub.ctdf$sample] names(kwds) = kwdstrs varmap = c(varmap, kwds) fulldf = c() reg.var = 'count' for (only.exc in c(TRUE, FALSE)){ for (cohort in c('ALZ','ALZ_repl', 'Joint')){ if (cohort == 'Joint'){ coh.ctdf = sub.ctdf } else { coh.ctdf = sub.ctdf[sub.ctdf$cohort == cohort,] } if (length(unique(coh.ctdf$platform)) > 1){ # count.covar = '+ platform * (n_counts + n_genes + count_per_gene)' count.covar = '+ platform * (log(n_genes) + count_per_gene)' } else { # count.covar = '+ n_counts + n_genes + count_per_gene' count.covar = '+ log(n_genes) + count_per_gene' } if (only.exc){ df = coh.ctdf[coh.ctdf$relabel == 'Excitatory',] } else { df = coh.ctdf } resdf = c() for (var in kwdstrs){ # form.vec = c(reg.var, '~ ', var, count.covar, ext.covar) # Original coh.covar = '+ pmi_rescaled + msex + age_rescaled' form.vec = c(reg.var, '~ offset(log(n_counts)) +', var, count.covar, ext.covar) if (!only.exc){ form.vec = c(form.vec, ' + relabel') } form = asform(form.vec) if (reg.var %in% c('capped_count','count')){ # fit = glm(form, df, family='poisson') # Original model fit = glm.nb(form, df) } else if (reg.var %in% c('has_fusion')){ fit = glm(form, df, family='binomial') } else { fit = glm(form, df, family='gaussian') } cfit = processLmFit(fit) cfit$var = var resdf = rbind(resdf, cfit[grep(var, cfit$coef),]) } resdf = resdf[order(resdf$p),] resdf$padj = p.adjust(resdf$p, method='fdr') resdf$col = with(resdf, ifelse(padj < 0.05, ifelse(Est < 0, 'Down', 'Up'), 'NS')) resdf = resdf[order(resdf$Est),] resdf$coef1 = resdf$coef resdf$coef = varmap[resdf$coef] resdf$coef = factor(resdf$coef, levels=resdf$coef) resdf$cohort = cohort resdf$set = ifelse(only.exc, 'Exc. neurons', 'All cells') fulldf = rbind(fulldf, resdf) } } resdf = resdf[order(resdf$p),] fulldf = fulldf[order(fulldf$p),] # Plot all of these effects jointly: # ---------------------------------- subkwdstrs = names(kwds[kwds %in% c('cohesin', subkwds)]) kwddf = fulldf[fulldf$coef1 %in% subkwdstrs,] aggdf = aggregate(Est ~ coef1, kwddf, mean) aggdf = aggdf[order(aggdf$Est),] kwddf$coef1 = factor(kwddf$coef1, levels=aggdf$coef1) gp = ggplot(kwddf, aes(coef1, Est, ymin=Est-2*SE, ymax=Est+2*SE, label=p, color=factor(col))) + facet_grid(set ~ cohort) + geom_point() + geom_errorbar(width=.2) + geom_hline(yintercept=0, lty='dotted') + scale_color_manual(values=devals, name='Significance:') + theme_pubr() + coord_flip() + labs(x='Condition', y='Estimated effect size') regpref = paste0(imgpref, 'reg_effectsizes_expression_sigs_only_') saveGGplot(gp, paste0(regpref, 'errorbar'), w=9, h=4.5) # Plot effects for the original cohort only: # ------------------------------------------ sigkwds = subkwdstrs[5:length(subkwdstrs)] # Keep only GO sigs sigkwddf = kwddf[(kwddf$cohort == 'ALZ') & (kwddf$coef1 %in% sigkwds),] labdf = sigkwddf[sigkwddf$padj < 0.05,] labdf$lab = paste0('p=', sprintf('%0.1e', labdf$padj)) gp = ggplot(sigkwddf, aes(coef1, Est, ymin=Est-2*SE, ymax=Est+2*SE, label=p, color=factor(col))) + facet_grid(. ~ set ) + geom_hline(yintercept=0, lty='dotted') + geom_point() + geom_errorbar(width=.2) + scale_color_manual(values=devals, name='Significance:') + geom_text(data=labdf, aes(coef1, y=0.4, label=lab), color='black') + theme_pubr() + coord_flip() + labs(x='Condition', y='Estimated effect size') regpref = paste0(imgpref, 'reg_effectsizes_expression_sigs_only_originalcohort_') saveGGplot(gp, paste0(regpref, 'errorbar'), w=5, h=2.5) # Joint only: gp = ggplot(kwddf[kwddf$cohort == 'Joint',], aes(coef1, Est, ymin=Est-2*SE, ymax=Est+2*SE, label=p, color=factor(col))) + facet_grid(. ~ set ) + geom_point() + geom_errorbar(width=.2) + geom_hline(yintercept=0, lty='dotted') + scale_color_manual(values=devals, name='Significance:') + theme_pubr() + coord_flip() + labs(x='Condition', y='Estimated effect size') regpref = paste0(imgpref, 'reg_effectsizes_expression_sigs_only_joint_') saveGGplot(gp, paste0(regpref, 'errorbar'), w=5, h=2.5) # Scatterplots, not very clear to read / need covar correction. ctlong = gather(sub.ctdf[,c('count', 'n_genes', kwdstrs, 'relabel', 'platform')], type, value, -count, -relabel, -platform, -n_genes) gplot = ggplot(ctlong, aes(count, value, color=platform)) + facet_grid(relabel ~ type) + scale_fill_manual(values=c('grey80', 'grey50'), name='Has event') + geom_point(cex=.5) + geom_smooth(method='lm') + stat_cor() + labs(x=paste('Type of putative fusion event'), y='Norm. signature expression, log(X+1)', title='GO signatures in Excitatory/Stage 2 cells, stratified by presence of fusion event') + theme_pubr() + theme() # Plot terms vs. rate: # -------------------- kwd = 'double-strand break repair' # kwd = 'senescence' kwdstr = gsub("[- +&]", "_", kwd) subdf = ctlong[ctlong$type == kwdstr,] subdf$count2 = subdf$count subdf$count2[subdf$count2 > 10] = 10 subdf$rate = subdf$count / subdf$n_genes subdf$capped_rate = subdf$count2 / subdf$n_genes subdf = subdf[subdf$platform %in% c('40SE'),] gp = ggplot(subdf, aes(relabel, capped_rate, fill=factor(value))) + facet_wrap(. ~ platform) + geom_boxplot(outlier.size=.5) + scale_fill_manual(values=c('grey80', 'slateblue3'), name=paste0('z(', kwdstr, ') > 1')) + stat_compare_means(label='p.signif') + labs(x='Cell type', y='Rate (# fusions / # genes)') + coord_flip() + theme_pubr() regpref = paste0(imgpref, 'boxplot_z1_', kwdstr) saveGGplot(gp, regpref, w=8, h=3.75) # Plot quantitative score vs. rate: # --------------------------------- subdf = enrdf[enrdf$tag == kwd, c('enr','sample')] subdf = merge(subdf, sub.ctdf) subdf = subdf[subdf$relabel == 'Excitatory',] subdf$count2 = subdf$count subdf$count2[subdf$count2 > 10] = 10 subdf$rate = subdf$count / subdf$n_genes subdf$capped_rate = subdf$count2 / subdf$n_genes subdf = subdf[subdf$platform %in% c('150PE','40SE'),] subdf$binned = cut(subdf$score, quantile(subdf$score, c(0, .25, .5, .75, 1)), include.lowest=TRUE) gp = ggplot(subdf, aes(score, capped_rate)) + facet_wrap(. ~ platform) + geom_point() + geom_smooth(method='lm') + stat_cor() + labs(x=paste0('Expr. (', kwdstr, ')'), y='Rate (# fusions / # genes)') + theme_pubr() gp = ggplot(subdf, aes(binned, capped_rate)) + facet_wrap(. ~ platform) + geom_boxplot() + # geom_point() + geom_smooth(method='lm') + stat_cor() + labs(x=paste0('Expr. (', kwdstr, ')'), y='Rate (# fusions / # genes)') + theme_pubr() gp = ggplot(subdf, aes_string('score', 'capped_rate', color=kwdstr)) + facet_wrap(. ~ relabel) + geom_point() + geom_smooth(method='lm') + stat_cor() + labs(x=paste0('Expr. (', kwdstr, ')'), y='Rate (# fusions / # genes)') + theme_pubr() # Genes differentially expressed by fusion stratification: # -------------------------------------------------------- pct.expr = rowSums(fullmat > 0) / ncol(fullmat) ensgs = names(pct.expr)[pct.expr > 0.05] # Genes expr above 5% only nmat = sweep(fullmat[ensgs,], 2, marg[colnames(fullmat)] / 10000, '/') nmat = log1p(nmat) fusion.degs.rds = paste0(dbdir, 'fusion_degs_joint_cohort.Rds') fusion.degs.tsv = paste0(dbdir, 'fusion_degs_joint_cohort.tsv') if (!file.exists(fusion.degs.rds)){ fulldf = c() for (only.exc in c(TRUE, FALSE)){ if (only.exc){ df = sub.ctdf[sub.ctdf$relabel == 'Excitatory',] } else { df = sub.ctdf } resdf = c() for (i in 1:length(ensgs)){ gene = ensgs[i] cat(i, gene, '\n') if (i %% 500 == 0){ print(i) } df$val = nmat[gene, df$sample] # Model number of candidate fusions by expression value form.vec = c('count ~ val', count.covar, ext.covar) if (!only.exc){ form.vec = c(form.vec, ' + relabel') } form = asform(form.vec) fit = lm(form, df) cfit = processLmFit(fit) cfit$ENSG = gene resdf = rbind(resdf, cfit['val',]) } resdf = resdf[order(resdf$p),] resdf$padj = p.adjust(resdf$p, method='fdr') resdf$col = with(resdf, ifelse(padj < 0.05, ifelse(Est < 0, 'Down', 'Up'), 'NS')) resdf = resdf[order(resdf$Est),] resdf$coef1 = resdf$coef resdf$coef = varmap[resdf$coef] resdf$coef = factor(resdf$coef, levels=resdf$coef) resdf$cohort = 'Joint' resdf$set = ifelse(only.exc, 'Exc. neurons', 'All cells') fulldf = rbind(fulldf, resdf) } fulldf = merge(fulldf, anno[,c('ENSG','symbol')]) fulldf = fulldf[order(fulldf$p),] saveRDS(fulldf, fusion.degs.rds) write.table(fulldf, fusion.degs.tsv, quote=F, sep="\t", row.names=F) } else { fulldf = readRDS(fusion.degs.rds) } # Plot results as a volcano plot: # -------------------------------- # TODO: Add errorbars? setmap = c('all'='All cells', 'exc'='Exc. neurons') for (set in c('all','exc')){ subdf = fulldf[fulldf$set == setmap[set],] subdf = merge(subdf, anno[anno$type == 'protein_coding',]) labdf = subdf[subdf$col != 'NS',] gplot = ggplot(subdf, aes(Est, -log10(p), color=col)) + scale_color_manual(values=devals, name='Signif.') + geom_vline(xintercept=0, lty='dashed') + geom_point(cex=.5) + geom_text_repel(data=labdf, aes(Est, -log10(p), label=symbol, color=col), max.overlaps=20) + scale_y_continuous(expand=c(0,0)) + labs(x='Estimated effect', y='-log10 p-value', title=paste0('Genes predictive of increased candidate fusions (', setmap[set],')')) + theme_pubr() + theme() pltpref = paste0(imgpref, 'volcano_degs_fusions_', set) ggsave(paste0(pltpref, '.png'), gplot, dpi=450, units='in', width=7, height=6) ggsave(paste0(pltpref, '.pdf'), gplot, dpi=450, units='in', width=7, height=6) }