#!/usr/bin/R # ------------------------------------------------ # Estimate effect sizes for gene candidate fusions # --follows P301S, updated for human # Updated: 03/13/23 # ------------------------------------------------ library(cbrbase) set_proj('DEVTRAJ', 'mosaicism') library(tidyr) library(MASS) library(ggplot2) library(ggpubr) library(scales) library(RColorBrewer) library(Matrix) options(width=170) source(paste0(bindir, 'mosaicism/fusions_AD430/load_metadata.R')) source(paste0(bindir, 'multiRegion/auxiliary_plotting_settings.R')) # Directories: plotdir = paste0(imgdir, 'fusions/') imgpref = paste0(plotdir, 'AD430_fusions_') cmd = paste('mkdir -p', plotdir) system(cmd) # Load in the gene fusion data: # ----------------------------- source(paste0(bindir, 'mosaicism/fusions_AD430/load_fusioncalls.R')) source(paste0(bindir, 'mosaicism/genefusions/auxiliary_fusion_plotting.R')) sub.ctdf = sub.ctdf[grep("^SM_171013", sub.ctdf$dataset, invert=TRUE),] # Remove batch (v2) # Load the pre-computed signature scores: # --------------------------------------- scorepref = paste0(dbdir, 'fusions_AD430/fusions_AD430_signature_scores') score.tsv = paste0(scorepref, '.tsv.gz') proc.tsv = paste0(scorepref, '.processed.tsv.gz') proc.rds = paste0(scorepref, '.processed.Rds') if (!file.exists(proc.rds)){ scoredf = read.delim(gzfile(scorefile), header=TRUE, check.names=FALSE) kwds = names(scoredf)[4:ncol(scoredf)] marg = scoredf$marg # Convert the barcode format scoredf$tail = sub('.*-', '', scoredf$obsnames) scoredf$core = sub('SM_', '', sub("-.*","", scoredf$obsnames)) scoredf$cb = sub(".*_","", scoredf$core) scoredf$batch = paste0("SM_", sub("_.*","", scoredf$core)) scoredf$barcode = paste0(scoredf$batch, '-', scoredf$tail, ':', scoredf$cb) # mean(sub.ctdf$barcode %in% scoredf$barcode) # simple check that all barcodes have scores # Make and normalize matrix: scoremat = scoredf[,kwds] rownames(scoremat) = scoredf$barcode # Normalize scores in same way as for smart-seq data: norm = log1p(sweep(scoremat, 1, marg / 10000, '/')) # Aggregate as long dataframe: ndf = data.frame(norm, check.names=FALSE) ndf$barcode = rownames(ndf) enrdf = gather(ndf, tag, enr, -barcode) # Save these processed scores: saveRDS(enrdf, file=proc.rds) write.table(enrdf, gzfile(proc.tsv), quote=F, sep="\t", row.names=F) } else { enrdf = readRDS(proc.rds) } kwds = unique(enrdf$tag) # Regressions taking into account GO pathway status (esp. cohesin). # ----------------------------------------------------------------- # Signatures to use evaluate in regression: kwdstrs = gsub("[- +&]", "_", kwds) names(kwds) = kwdstrs varmap = c(varmap, kwds) # Get the zscore.cutoff = 1 for (kwd in kwds){ kwdstr = gsub("[- +&]", "_", kwd) subdf = enrdf[enrdf$tag == kwd, ] enrmap = subdf$enr names(enrmap) = subdf$barcode enr = scale(enrmap[sub.ctdf$barcode]) sub.ctdf[[kwdstr]] = ifelse(enr > zscore.cutoff, 1, 0) } regrpref = paste0(dbdir, 'fusions_AD430/fusions_AD430_signature_results') regr.tsv = paste0(regrpref, '.tsv') regr.rds = paste0(regrpref, '.Rds') if (!file.exists(regr.rds)){ fulldf = c() reg.var = 'count' for (only.exc in c(TRUE, FALSE)){ df = sub.ctdf if (only.exc){ df = df[df$celltype == 'Exc',] } resdf = c() for (var in kwdstrs){ form.vec = c(reg.var, '~ offset(log(n_counts)) +', var) form.vec = c(form.vec, count.covar, ext.covar) if (!only.exc){ form.vec = c(form.vec, ' + celltype') } form = asform(form.vec) if (reg.var %in% c('capped_count','count')){ fit = glm.nb(form, df) } else if (reg.var %in% c('has_fusion')){ fit = glm(form, df, family='binomial') } else { fit = glm(form, df, family='gaussian') } cfit = processLmFit(fit) cfit$var = var resdf = rbind(resdf, cfit[grep(var, cfit$coef),]) } resdf = resdf[order(resdf$p),] resdf$padj = p.adjust(resdf$p, method='fdr') resdf$col = with(resdf, ifelse(padj < 0.05, ifelse(Est < 0, 'Down', 'Up'), 'NS')) resdf = resdf[order(resdf$Est),] resdf$coef1 = resdf$coef resdf$coef = varmap[resdf$coef] resdf$coef = factor(resdf$coef, levels=resdf$coef) resdf$set = ifelse(only.exc, 'Exc. neurons', 'All cells') resdf$reg.var = reg.var fulldf = rbind(fulldf, resdf) } saveRDS(fulldf, file=regr.rds) write.table(fulldf, regr.tsv, quote=F, row.names=F, sep="\t") } else { fulldf = readRDS(regr.rds) } fulldf = fulldf[order(fulldf$p),] # Plot all of these effects jointly: # ---------------------------------- subkwds = kwds[grep("Stage", kwds, invert=TRUE)] subkwds = subkwds[grep("immune_signature", subkwds, invert=TRUE)] # subkwds subkwdstrs = names(kwds[kwds %in% subkwds]) kwddf = fulldf[fulldf$coef1 %in% subkwdstrs,] aggdf = aggregate(Est ~ coef1, kwddf, mean) aggdf = aggdf[order(aggdf$Est),] kwddf$coef1 = factor(kwddf$coef1, levels=aggdf$coef1) kwddf$lab = paste0('p=', sprintf('%0.1e', kwddf$padj)) # kwddf = kwddf[order(kwddf$Est),] labdf = kwddf[kwddf$padj < 0.05,] ypos = 0.2 gp = ggplot(kwddf, aes(coef, Est, ymin=Est-2*SE, ymax=Est+2*SE, color=factor(col))) + facet_grid(. ~ set) + geom_point() + geom_errorbar(width=.2) + geom_hline(yintercept=0, lty='dotted') + geom_text(data=labdf, aes(coef, y=ypos, label=lab), color='black') + scale_color_manual(values=devals, name='Significance:') + theme_pubr() + coord_flip() + labs(x='Condition', y='Estimated effect size') regpref = paste0(imgpref, 'reg_effectsizes_expression_sigs_only_') saveGGplot(gp, paste0(regpref, 'errorbar'), w=9, h=4.5) # # Scatterplots, not very clear to read / need covar correction. ctlong = gather(sub.ctdf[,c('count', 'n_genes', 'n_counts', kwdstrs, 'celltype')], type, value, -count, -n_genes, -n_counts, -celltype) # gplot = ggplot(ctlong, aes(factor(value), count)) + # , color=platform)) + # facet_grid(celltype ~ type) + # # scale_fill_manual(values=c('grey80', 'grey50'), name='Has event') + # geom_boxplot(cex=.5) + # stat_compare_means() + # labs(x=paste('Type of putative fusion event'), # y='Norm. signature expression, log(X+1)', # title='GO signatures in Excitatory/Stage 2 cells, stratified by presence of fusion event') + # theme_pubr() + theme() # Plot terms vs. rate: # -------------------- kwd = 'double-strand break repair' # kwd = 'senescence' kwdstr = gsub("[- +&]", "_", kwd) subdf = ctlong[ctlong$type == kwdstr,] subdf$count2 = subdf$count subdf$count2[subdf$count2 > 10] = 10 subdf$rate = subdf$count / subdf$n_counts subdf$capped_rate = subdf$count2 / subdf$n_counts gp = ggplot(subdf, aes(celltype, capped_rate, fill=factor(value))) + # facet_wrap(. ~ platform) + geom_boxplot(outlier.size=.5) + scale_fill_manual(values=c('grey80', 'slateblue3'), name=paste0('z(', kwdstr, ') > 1')) + stat_compare_means(label='p.signif') + labs(x='Cell type', y='Rate (# fusions / # genes)') + coord_flip() + theme_pubr() regpref = paste0(imgpref, 'boxplot_z1_', kwdstr) saveGGplot(gp, regpref, w=8, h=3.75) # Plot quantitative score vs. rate: # --------------------------------- # subdf = enrdf[enrdf$tag == kwd, c('enr','barcode')] # subdf = merge(subdf, sub.ctdf) subdf = enrdf[enrdf$tag == kwd, ] enrmap = subdf$enr names(enrmap) = subdf$barcode subdf = sub.ctdf subdf$score = enrmap[subdf$barcode] subdf = subdf[subdf$celltype == 'Exc',] subdf$count2 = subdf$count subdf$count2[subdf$count2 > 10] = 10 subdf$rate = subdf$count / subdf$n_counts subdf$capped_rate = subdf$count2 / subdf$n_counts subdf$binned = cut(subdf$score, quantile(subdf$score, c(0, .25, .5, .75, 1)), include.lowest=TRUE) gp = ggplot(subdf, aes(score, capped_rate)) + geom_point() + geom_smooth(method='lm') + stat_cor() + labs(x=paste0('Expr. (', kwdstr, ')'), y='Rate (# fusions / # genes)') + theme_pubr() gp = ggplot(subdf, aes(binned, capped_rate)) + facet_wrap(. ~ platform) + geom_boxplot() + # geom_point() + geom_smooth(method='lm') + stat_cor() + labs(x=paste0('Expr. (', kwdstr, ')'), y='Rate (# fusions / # genes)') + theme_pubr() gp = ggplot(subdf, aes_string('score', 'capped_rate', color=kwdstr)) + facet_wrap(. ~ relabel) + geom_point() + geom_smooth(method='lm') + stat_cor() + labs(x=paste0('Expr. (', kwdstr, ')'), y='Rate (# fusions / # genes)') + theme_pubr() # Genes differentially expressed by fusion stratification: # -------------------------------------------------------- pct.expr = rowSums(fullmat > 0) / ncol(fullmat) ensgs = names(pct.expr)[pct.expr > 0.05] # Genes expr above 5% only nmat = sweep(fullmat[ensgs,], 2, marg[colnames(fullmat)] / 10000, '/') nmat = log1p(nmat) fusion.degs.rds = paste0(dbdir, 'fusion_degs_joint_cohort.Rds') fusion.degs.tsv = paste0(dbdir, 'fusion_degs_joint_cohort.tsv') if (!file.exists(fusion.degs.rds)){ fulldf = c() for (only.exc in c(TRUE, FALSE)){ if (only.exc){ df = sub.ctdf[sub.ctdf$relabel == 'Excitatory',] } else { df = sub.ctdf } resdf = c() for (i in 1:length(ensgs)){ gene = ensgs[i] cat(i, gene, '\n') if (i %% 500 == 0){ print(i) } df$val = nmat[gene, df$barcode] # Model number of candidate fusions by expression value form.vec = c('count ~ val', count.covar, ext.covar) if (!only.exc){ form.vec = c(form.vec, ' + relabel') } form = asform(form.vec) fit = lm(form, df) cfit = processLmFit(fit) cfit$ENSG = gene resdf = rbind(resdf, cfit['val',]) } resdf = resdf[order(resdf$p),] resdf$padj = p.adjust(resdf$p, method='fdr') resdf$col = with(resdf, ifelse(padj < 0.05, ifelse(Est < 0, 'Down', 'Up'), 'NS')) resdf = resdf[order(resdf$Est),] resdf$coef1 = resdf$coef resdf$coef = varmap[resdf$coef] resdf$coef = factor(resdf$coef, levels=resdf$coef) resdf$cohort = 'Joint' resdf$set = ifelse(only.exc, 'Exc. neurons', 'All cells') fulldf = rbind(fulldf, resdf) } fulldf = merge(fulldf, anno[,c('ENSG','symbol')]) fulldf = fulldf[order(fulldf$p),] saveRDS(fulldf, fusion.degs.rds) write.table(fulldf, fusion.degs.tsv, quote=F, sep="\t", row.names=F) } else { fulldf = readRDS(fusion.degs.rds) } # Plot results as a volcano plot: # -------------------------------- # TODO: Add errorbars? setmap = c('all'='All cells', 'exc'='Exc. neurons') for (set in c('all','exc')){ subdf = fulldf[fulldf$set == setmap[set],] subdf = merge(subdf, anno[anno$type == 'protein_coding',]) labdf = subdf[subdf$col != 'NS',] gplot = ggplot(subdf, aes(Est, -log10(p), color=col)) + scale_color_manual(values=devals, name='Signif.') + geom_vline(xintercept=0, lty='dashed') + geom_point(cex=.5) + geom_text_repel(data=labdf, aes(Est, -log10(p), label=symbol, color=col), max.overlaps=20) + scale_y_continuous(expand=c(0,0)) + labs(x='Estimated effect', y='-log10 p-value', title=paste0('Genes predictive of increased candidate fusions (', setmap[set],')')) + theme_pubr() + theme() pltpref = paste0(imgpref, 'volcano_degs_fusions_', set) ggsave(paste0(pltpref, '.png'), gplot, dpi=450, units='in', width=7, height=6) ggsave(paste0(pltpref, '.pdf'), gplot, dpi=450, units='in', width=7, height=6) }