#!/usr/bin/R # ------------------------------------------------ # Estimate effect sizes for gene candidate fusions # Updated: 02/21/23 to update regression model # ------------------------------------------------ library(cbrbase) set_proj('DEVTRAJ') # Libraries: library(tidyr) library(MASS) library(ggplot2) library(ggpubr) library(RColorBrewer) library(GenomicRanges) options(width=170) # Load in and process data (saves to matrices): # project = 'ALZ_repl' project = 'ALZ' commandArgs <- function(){ c(project)} source(paste0(bindir, 'mosaicism/load_metadata.R')) # Make directories for analysis: analysis = 'fusion' imgdir = paste0(img, project, '/', analysis , '/') imgpref = paste0(imgdir, analysis, "_mspred_", project, '_') cmd = paste0('mkdir -p ', imgdir) system(cmd) source(paste0(bindir, 'multiRegion/auxiliary_plotting_settings.R')) # Load in the two cohorts of candidate gene fusion counts: # -------------------------------------------------------- source(paste0(bindir, 'mosaicism/genefusions/load_fusion_data.R')) source(paste0(bindir, 'mosaicism/genefusions/auxiliary_fusion_plotting.R')) # Use only single cohort: imgpref = paste0(imgpref, "originalcohort_") sub.ctdf = sub.ctdf[sub.ctdf$cohort == 'ALZ',] # Regression estimates for a number of selected covariates: # --------------------------------------------------------- merge_fsdf = function(df, sub.ctdf){ fs.ctdf = agg.rename(annots ~ plate + sample, df, length, 'subset.count') fs.ctdf = merge(sub.ctdf, fs.ctdf, all.x=TRUE) fs.ctdf$subset.count[is.na(fs.ctdf$subset.count)] = 0 return(fs.ctdf) } run_regr = function(ctdf, only.exc, use.cc=TRUE, use.ec=TRUE){ resdf = c() if (only.exc){ df = ctdf[ctdf$relabel == 'Excitatory',] } else { df = ctdf } if (length(unique(df$platform)) > 1){ # count.covar = '+ platform * (n_counts + n_genes + count_per_gene)' count.covar = '+ platform * (log(n_genes) + count_per_gene)' } else { # count.covar = '+ n_counts + n_genes + count_per_gene' # original count.covar = '+ log(n_genes) + count_per_gene' } for (var in test.vars){ # form.vec = c('subset.count', ' ~ ', var) # original form.vec = c('subset.count ~ offset(log(n_counts)) + ', var) if (use.cc){ form.vec = c(form.vec, count.covar) } if (use.ec){ form.vec = c(form.vec, ext.covar) } if ((!only.exc) & (length(unique(ctdf$relabel)) > 1)){ form.vec = c(form.vec, ' + relabel') } # Fit model + process fit: form = asform(c(form.vec)) fit = try(glm.nb(form, df)) # fit = try(glm(form, df, family='poisson')) if ((class(fit)[1] != 'try-error')[1]){ cfit = processLmFit(fit) cfit$var = var resdf = rbind(resdf, cfit[2,]) } } resdf = resdf[order(resdf$p),] resdf$padj = p.adjust(resdf$p, method='fdr') resdf$col = with(resdf, ifelse(padj < 0.05, ifelse(Est < 0, 'Down', 'Up'), 'NS')) resdf = resdf[order(resdf$Est),] resdf$coef = varmap[resdf$coef] resdf$coef = factor(resdf$coef, levels=resdf$coef) return(resdf) } # Test in each subset of fusion types separately: # ----------------------------------------------- fulldf = c() for (only.exc in c(TRUE, FALSE)){ for (fs.subset in c('INTRA','INTER','CLOSE')){ if (fs.subset == 'CLOSE'){ df = fsdf[(fsdf$fs.set == 'INTRA') & (fsdf$Distance < 5e6),] } else { df = fsdf[fsdf$fs.set == fs.subset,] } fs.ctdf = merge_fsdf(df, sub.ctdf) # Regression results: resdf = run_regr(fs.ctdf, only.exc) resdf$set = ifelse(only.exc, 'Exc. neurons', 'All cells') resdf$type = fs.subset fulldf = rbind(fulldf, resdf) } } gp = coefErrorbarPlot(fulldf, facet='set ~ type') regpref = paste0(imgpref, 'reg_effectsizes_core_type_joint_results_') saveGGplot(gp, paste0(regpref, 'errorbar'), w=10, h=4.5) # Test with other cutoffs: # ------------------------ # W/ LDAS (582) # W/ REF SPLICE # By cell type # Lower ENTROPY cutoff (max) fulldf = c() only.exc = FALSE for (fs.subset in c('YES_LDAS','NO_LDAS')){ df = fsdf[fsdf$LargeAnchorSupport == fs.subset,] fs.ctdf = merge_fsdf(df, sub.ctdf) # Regression results: resdf = run_regr(fs.ctdf, only.exc) resdf$set = ifelse(only.exc, 'Exc. neurons', 'All cells') resdf$type = fs.subset fulldf = rbind(fulldf, resdf) } gp = coefErrorbarPlot(fulldf, facet='set ~ type') regpref = paste0(imgpref, 'reg_effectsizes_LDAS_') saveGGplot(gp, paste0(regpref, 'errorbar'), w=7.5, h=3) # W/ REF SPLICE (612) fulldf = c() only.exc = FALSE for (fs.subset in c('INCL_NON_REF_SPLICE', 'ONLY_REF_SPLICE')){ df = fsdf[fsdf$SpliceType == fs.subset,] fs.ctdf = merge_fsdf(df, sub.ctdf) # Regression results: resdf = run_regr(fs.ctdf, only.exc) resdf$set = ifelse(only.exc, 'Exc. neurons', 'All cells') resdf$type = fs.subset fulldf = rbind(fulldf, resdf) } gp = coefErrorbarPlot(fulldf, facet='set ~ type') regpref = paste0(imgpref, 'reg_effectsizes_SpliceType_') saveGGplot(gp, paste0(regpref, 'errorbar'), w=10, h=3) # Lower ENTROPY cutoff (max) fulldf = c() only.exc = FALSE for (fs.subset in c(1.5, 1.75, 2)){ df = fsdf[fsdf$MaxEntropy < fs.subset,] fs.ctdf = merge_fsdf(df, sub.ctdf) # Regression results: resdf = run_regr(fs.ctdf, only.exc) resdf$set = ifelse(only.exc, 'Exc. neurons', 'All cells') resdf$type = paste0('Max Ent < ', fs.subset) fulldf = rbind(fulldf, resdf) } gp = coefErrorbarPlot(fulldf, facet='set ~ type') regpref = paste0(imgpref, 'reg_effectsizes_MaxEntropy_') saveGGplot(gp, paste0(regpref, 'errorbar'), w=10, h=3) # Lower ENTROPY cutoff (min) fsdf$MinEntropy = apply(fsdf[,c('LeftBreakEntropy','RightBreakEntropy')], 1, min) fulldf = c() only.exc = FALSE for (fs.subset in c(1.25, 1.5, 1.75)){ df = fsdf[fsdf$MinEntropy > fs.subset,] fs.ctdf = merge_fsdf(df, sub.ctdf) # Regression results: resdf = run_regr(fs.ctdf, only.exc) resdf$set = ifelse(only.exc, 'Exc. neurons', 'All cells') resdf$type = paste0('Min Ent > ', fs.subset) fulldf = rbind(fulldf, resdf) } gp = coefErrorbarPlot(fulldf, facet='set ~ type') regpref = paste0(imgpref, 'reg_effectsizes_MinEntropy_') saveGGplot(gp, paste0(regpref, 'errorbar'), w=10, h=3) # Stratify by cell type and plot heatmap as well as intervals: # ------------------------------------------------------------ fulldf = c() cts = unique(sub.ctdf$relabel) for (fs.subset in cts){ df = fsdf fs.ctdf = merge_fsdf(df, sub.ctdf) fs.ctdf = fs.ctdf[fs.ctdf$relabel == fs.subset,] # Regression results: resdf = run_regr(fs.ctdf, only.exc=FALSE) resdf$set = ifelse(only.exc, 'Exc. neurons', 'All cells') resdf$type = fs.subset fulldf = rbind(fulldf, resdf) } gp = coefErrorbarPlot(fulldf, facet='. ~ type') regpref = paste0(imgpref, 'reg_effectsizes_CellType_') saveGGplot(gp, paste0(regpref, 'errorbar'), w=10, h=3) # Plot the cell type results as a heatmap: # ----------------------------------------- cmat = pivot.tomatrix(fulldf[,c('Est','coef','type')], 'type','Est') pmat = pivot.tomatrix(fulldf[,c('padj','coef','type')], 'type','padj') mx = 0.5 mat.col_fun = colorRamp2(c(-mx, 0, mx), c("blue", "white", "red")) ux = 1.5 ht = Heatmap(cmat, name='Est', col=mat.col_fun, width = ncol(cmat)*unit(ux, "mm"), height = nrow(cmat)*unit(ux, "mm"), border_gp=gpar(color='black', lwd=.5), cell_fun = function(j, i, x, y, w, h, col){ if (pmat[i,j] < 0.05){ grid.text('*', x, y, gp=gpar(fontsize=gridtxt.fs)) } } ) h = 2 + 1 / 15 * nrow(cmat) w = 2 + 1 / 15 * ncol(cmat) pltpref = paste0(imgpref, 'reg_effectsizes_CellType_heatmap') saveHeatmap(ht, pltpref, h=h, w=w) # Regressions by TAD distance: # ---------------------------- # Read in TAD boundaries from 3dgenomebrowser tdf = read.delim(paste0(dbdir, 'Annotation/Cortex_DLPFC_Donor-CO-raw_TADs.txt'), header=F) tadf = tdf[,c(1:2)] tbdf = tdf[,c(1,3)] names(tadf) = c('chr','pos') names(tbdf) = c('chr','pos') tdf = rbind(tadf, tbdf) tdf = tdf[order(tdf$pos),] tdf = tdf[order(tdf$chr),] tdf = unique(tdf) rownames(tdf) = NULL tgr = GRanges(seqnames=tdf$chr, IRanges(start=tdf$pos, end=tdf$pos)) # Make right/left objects: fsdf$LeftChr = sub(":.*","", fsdf$LeftBreakpoint) fsdf$RightChr = sub(":.*","", fsdf$RightBreakpoint) lgr = with(fsdf, GRanges(seqnames=LeftChr, IRanges(start=LeftPosition, end=LeftPosition))) rgr = with(fsdf, GRanges(seqnames=RightChr, IRanges(start=RightPosition, end=RightPosition))) # Get the closest left + right tads for each rnear = nearest(rgr, tgr) fsdf$RightNearPos = tdf$pos[rnear] fsdf$RightTADDist = abs(fsdf$RightNearPos - fsdf$RightPosition) lnear = nearest(lgr, tgr) fsdf$LeftNearPos = tdf$pos[lnear] fsdf$LeftTADDist = abs(fsdf$LeftNearPos - fsdf$LeftPosition) fsdf$MinTADDist = apply(fsdf[,c('RightTADDist','LeftTADDist')], 1, min) fsdf$MaxTADDist = apply(fsdf[,c('RightTADDist','LeftTADDist')], 1, max) gp = ggplot(fsdf, aes(LeftTADDist, RightTADDist)) + geom_density_2d_filled() + # geom_point(alpha=0.1, cex=.5) + scale_x_continuous(expand=c(0,0), labels=scales::comma) + scale_y_continuous(expand=c(0,0), labels=scales::comma) + theme_pubr() saveGGplot(gp, paste0(imgpref, 'TAD_distance_2d'), w=6, h=6) # Cut TAD distance evenly: qtTAD = quantile(fsdf$MinTADDist, seq(0,1, .25), na.rm=T) fsdf$TAD.bin = cut(fsdf$MinTADDist, breaks=qtTAD) gp = ggplot(fsdf, aes(MinTADDist, fill=TAD.bin)) + # geom_histogram(fill='grey80', color='white') + geom_histogram(color='white') + geom_density(color='black') + scale_x_continuous(expand=c(0,0), labels=scales::comma) + scale_y_continuous(expand=c(0,0)) + theme_pubr() saveGGplot(gp, paste0(imgpref, 'TAD_mindistance'), w=5, h=4) fulldf = c() only.exc = FALSE fsdf$TAD.bin = as.character(fsdf$TAD.bin) fsdf$TAD.bin[is.na(fsdf$TAD.bin)] = 'Undefined' sets = unique(fsdf$TAD.bin[order(fsdf$MinTADDist)]) sets = sets[sets != 'Undefined'] for (fs.subset in sets){ df = fsdf[fsdf$TAD.bin == fs.subset,] fs.ctdf = merge_fsdf(df, sub.ctdf) # Regression results: resdf = run_regr(fs.ctdf, only.exc) resdf$set = ifelse(only.exc, 'Exc. neurons', 'All cells') resdf$type = fs.subset fulldf = rbind(fulldf, resdf) } fulldf$type = factor(fulldf$type, levels=sets) gp = coefErrorbarPlot(fulldf, facet='set ~ type') regpref = paste0(imgpref, 'reg_effectsizes_MinTADDist_') saveGGplot(gp, paste0(regpref, 'errorbar'), w=10, h=3) # By Max TAD bin instead: # ----------------------- qtTAD = quantile(fsdf$MaxTADDist, seq(0,1, .25), na.rm=T) fsdf$TAD.bin = cut(fsdf$MaxTADDist, breaks=qtTAD) gp = ggplot(fsdf, aes(MaxTADDist, fill=TAD.bin)) + geom_histogram(color='white') + geom_density(color='black') + scale_x_continuous(expand=c(0,0), labels=scales::comma) + scale_y_continuous(expand=c(0,0)) + theme_pubr() saveGGplot(gp, paste0(imgpref, 'TAD_maxdistance'), w=5, h=4) fulldf = c() only.exc = FALSE fsdf$TAD.bin = as.character(fsdf$TAD.bin) fsdf$TAD.bin[is.na(fsdf$TAD.bin)] = 'Undefined' sets = unique(fsdf$TAD.bin[order(fsdf$MaxTADDist)]) sets = sets[sets != 'Undefined'] for (fs.subset in sets){ df = fsdf[fsdf$TAD.bin == fs.subset,] fs.ctdf = merge_fsdf(df, sub.ctdf) # Regression results: resdf = run_regr(fs.ctdf, only.exc) resdf$set = ifelse(only.exc, 'Exc. neurons', 'All cells') resdf$type = fs.subset fulldf = rbind(fulldf, resdf) } fulldf$type = factor(fulldf$type, levels=sets) gp = coefErrorbarPlot(fulldf, facet='set ~ type') regpref = paste0(imgpref, 'reg_effectsizes_MaxTADDist_') saveGGplot(gp, paste0(regpref, 'errorbar'), w=10, h=3)