#!/usr/bin/R # ------------------------------------------------ # Estimate effect sizes for gene candidate fusions # --for different sets of fusions # --follows P301S, updated for human # Updated: 03/28/23 # ------------------------------------------------ library(cbrbase) set_proj('DEVTRAJ', 'mosaicism') library(MASS) library(ggplot2) library(ggpubr) source(paste0(bindir, 'mosaicism/fusions_AD430/load_metadata.R')) source(paste0(bindir, 'multiRegion/auxiliary_plotting_settings.R')) # Directories: plotdir = paste0(imgdir, 'fusions/') imgpref = paste0(plotdir, 'AD430_fusions_') cmd = paste('mkdir -p', plotdir) system(cmd) # Load in the gene fusion data: # ----------------------------- source(paste0(bindir, 'mosaicism/fusions_AD430/load_fusioncalls.R')) source(paste0(bindir, 'mosaicism/genefusions/auxiliary_fusion_plotting.R')) sub.ctdf = sub.ctdf[grep("^SM_171013", sub.ctdf$dataset, invert=TRUE),] # Remove batch (v2) # Load the filtered fusions: # -------------------------- filtfile = paste0(fsdir, prefix, '.filtered.nonmerge.tsv') filt.fsdf = read.delim(filtfile, header=T) cellmeta = merge(cellmeta, unique(fus.meta[,c('dataset','projid')])) # For merging into filtered fusions # Merge the m.fsdf = aggregate(JunctionReadCount ~ LeftBreakpoint + RightBreakpoint + LeftSymbol + RightSymbol + SpliceType + projid + sample + CB + annots, filt.fsdf, sum) dim(m.fsdf) # merged.fsdf = merge(merged.fsdf, unique(fus.meta[,c('sample','dataset')]), all.x=TRUE) # merged.fsdf$barcode = with(merged.fsdf, paste0(dataset, ':', CB)) # Regression estimates for a number of selected covariates: # --------------------------------------------------------- run_regr = function(ctdf, only.exc, test.vars=test.vars, use.cc=TRUE, use.ec=TRUE, reg.var='subset.count'){ resdf = c() if (only.exc){ df = ctdf[ctdf$celltype == 'Excitatory',] } else { df = ctdf } for (var in test.vars){ cat('...', var) form.vec = c(reg.var, '~ offset(log(n_counts)) + ', var) if (use.cc){ form.vec = c(form.vec, count.covar) } if (use.ec){ form.vec = c(form.vec, ext.covar) } if (length(unique(ctdf$celltype)) > 1){ form.vec = c(form.vec, ' + celltype') } # Fit model + process fit: form = asform(c(form.vec)) fit = try(glm.nb(form, df)) if ((class(fit)[1] != 'try-error')[1]){ cfit = processLmFit(fit) cfit$var = var resdf = rbind(resdf, cfit[2,]) cat(' -', sprintf("%0.2e",cfit$p[2]), '\n') } } resdf = resdf[order(resdf$p),] resdf$padj = p.adjust(resdf$p, method='fdr') resdf$col = with(resdf, ifelse(padj < 0.05, ifelse(Est < 0, 'Down', 'Up'), 'NS')) resdf = resdf[order(resdf$Est),] resdf$coef = varmap[resdf$coef] resdf$coef = factor(resdf$coef, levels=resdf$coef) return(resdf) } # Functions to create tables for each split: # ------------------------------------------ # Distance: getPos = function(x){ x = strsplit(x,":")[[1]][2] as.numeric(x) } aggregate.filt.fusions = function(sub.filt.fsdf){ # Aggregate fusions to sample level, merging lanes: merged.fusions.df = aggregate(JunctionReadCount ~ LeftBreakpoint + RightBreakpoint + fs.set + LeftSymbol + RightSymbol + SpliceType + projid + sample + CB + annots, sub.filt.fsdf, sum) merged.fusions.df = merged.fusions.df[merged.fusions.df$fs.set %in% c('INTER','INTRA'),] dim(merged.fusions.df) return(merged.fusions.df) # Further process fusions by getting distance, filtering STAR-flagged: # -------------------------------------------------------------------- # Process into subcategories: # merged.fusions.df$fs.set = substr(merged.fusions.df$annots, 2, 6) # merged.fusions.df = merged.fusions.df[merged.fusions.df$fs.set %in% c('INTER','INTRA'),] # merged.fusions.df$LeftPosition = sapply(merged.fusions.df$LeftBreakpoint, getPos) # merged.fusions.df$RightPosition = sapply(merged.fusions.df$RightBreakpoint, getPos) # merged.fusions.df$Distance = NA # intra.ind = merged.fusions.df$fs.set == 'INTRA' # merged.fusions.df$Distance[intra.ind] = abs(merged.fusions.df$RightPosition - merged.fusions.df$LeftPosition)[intra.ind] } # Merge into counts per barcode: merge.tocounts = function(merged.fusions.df, cellmeta){ new.ctdf = agg.rename(annots ~ barcode + dataset, merged.fusions.df, length, 'count') new.ctdf = merge(cellmeta, new.ctdf, all.x=TRUE) new.ctdf$count[is.na(new.ctdf$count)] = 0 new.ctdf = new.ctdf[grep("^SM_171013", new.ctdf$dataset, invert=TRUE),] # Remove batch (v2) # Add metadata covariates: advars = c('niareagansc','nrad','braaksc','bd', 'bd2','cogdx.ad') covars = c('age_rescaled','msex','pmi_rescaled', 'Apoe_e4') mcols = c('projid', advars, covars) new.ctdf = merge(new.ctdf, unique(metadata[,mcols])) new.ctdf$count_per_gene = with(new.ctdf, n_counts / n_genes) return(new.ctdf) } # Annotate additional variables, run splits on merged.fsdf: # --------------------------------------------------------- merged.fsdf$JRC = ifelse(merged.fsdf$JunctionReadCount == 1, 'Single Read', 'Multiple Reads') merged.fusion.vars = c('JRC', 'SpliceType', 'fs.set') jrdf = c() fulldf = c() for (fusion.var in merged.fusion.vars){ fs.types = unique(merged.fsdf[[fusion.var]]) for (fs.type in fs.types){ cat(fusion.var, '--', fs.type, '\n') sub.mfsdf = merged.fsdf[merged.fsdf[[fusion.var]] == fs.type,] sub.mfsdf = merge(sub.mfsdf, unique(fus.meta[,c('sample','dataset')]), all.x=TRUE) sub.mfsdf$barcode = with(sub.mfsdf, paste0(dataset, ':', CB)) sub.mfsdf = sub.mfsdf[!is.na(sub.mfsdf$dataset),] new.ctdf = merge.tocounts(sub.mfsdf, cellmeta=cellmeta) nfusion = sum(new.ctdf$count) jrdf = rbind(jrdf, data.frame(nfusion=nfusion, set=fs.type, fusion.split=fusion.var)) # Run the regression resdf = run_regr(new.ctdf, test.vars=c('bd','nrad'), only.exc=FALSE, reg.var='count') resdf$set = fs.type resdf$fusion.split = fusion.var fulldf = rbind(fulldf, resdf) cat('\n') } } # Run splits that are in filt.fsdf instead: # ----------------------------------------- filt.fusion.vars = c('LargeAnchorSupport', 'LeftBreakDinuc', 'RightBreakDinuc') for (fusion.var in filt.fusion.vars){ fs.types = unique(filt.fsdf[[fusion.var]]) for (fs.type in fs.types){ cat(fusion.var, '--', fs.type, '\n') # Get subset of filtered fusions: sub.filt.fsdf = filt.fsdf[filt.fsdf[[fusion.var]] == fs.type,] sub.mfsdf = aggregate.filt.fusions(sub.filt.fsdf) # Annotate + merge: sub.mfsdf = merge(sub.mfsdf, unique(fus.meta[,c('sample','dataset')]), all.x=TRUE) sub.mfsdf$barcode = with(sub.mfsdf, paste0(dataset, ':', CB)) sub.mfsdf = sub.mfsdf[!is.na(sub.mfsdf$dataset),] new.ctdf = merge.tocounts(sub.mfsdf, cellmeta=cellmeta) nfusion = sum(new.ctdf$count) jrdf = rbind(jrdf, data.frame(nfusion=nfusion, set=fs.type, fusion.split=fusion.var)) # Run the regression resdf = run_regr(new.ctdf, test.vars=c('bd', 'nrad'), only.exc=FALSE, reg.var='count') resdf$set = fs.type resdf$fusion.split = fusion.var fulldf = rbind(fulldf, resdf) cat('\n') } } # Count up how many events in each set (should do sep) # ------------------------------------- jrdf = c() for (fusion.var in merged.fusion.vars){ fs.types = unique(merged.fsdf[[fusion.var]]) for (fs.type in fs.types){ # Get subset of filtered fusions: sub.mfsdf = merged.fsdf[merged.fsdf[[fusion.var]] == fs.type,] sub.mfsdf = merge(sub.mfsdf, unique(fus.meta[,c('sample','dataset')]), all.x=TRUE) sub.mfsdf = merge(sub.mfsdf, unique(fus.meta[,c('sample','dataset')]), all.x=TRUE) sub.mfsdf$barcode = with(sub.mfsdf, paste0(dataset, ':', CB)) sub.mfsdf = sub.mfsdf[!is.na(sub.mfsdf$dataset),] new.ctdf = merge.tocounts(sub.mfsdf, cellmeta=cellmeta) nfusion = sum(new.ctdf$count) jrdf = rbind(jrdf, data.frame(nfusion=nfusion, set=fs.type, fusion.split=fusion.var)) } } for (fusion.var in filt.fusion.vars){ fs.types = unique(filt.fsdf[[fusion.var]]) for (fs.type in fs.types){ cat(fusion.var, '--', fs.type, '\n') # Get subset of filtered fusions: sub.filt.fsdf = filt.fsdf[filt.fsdf[[fusion.var]] == fs.type,] sub.mfsdf = aggregate.filt.fusions(sub.filt.fsdf) # Annotate + merge: sub.mfsdf = merge(sub.mfsdf, unique(fus.meta[,c('sample','dataset')]), all.x=TRUE) sub.mfsdf$barcode = with(sub.mfsdf, paste0(dataset, ':', CB)) sub.mfsdf = sub.mfsdf[!is.na(sub.mfsdf$dataset),] new.ctdf = merge.tocounts(sub.mfsdf, cellmeta=cellmeta) nfusion = sum(new.ctdf$count) jrdf = rbind(jrdf, data.frame(nfusion=nfusion, set=fs.type, fusion.split=fusion.var)) } } # Plot all of these different variables: # -------------------------------------- pltdf = merge(fulldf, jrdf) pltdf$full.set = with(pltdf, paste0(fusion.split, ' - ', set, ' (', nfusion, ')')) # TODO: Add the counts labdf = pltdf[pltdf$padj < 0.05,] labdf$lab = paste0('p=', sprintf('%0.1e', labdf$padj)) ypos = 0.2 gp = ggplot(pltdf, aes(full.set, Est, ymin=Est-2*SE, ymax=Est+2*SE, label=p, color=factor(col))) + facet_grid(. ~ coef) + geom_point() + geom_errorbar(width=.2) + geom_hline(yintercept=0, lty='dotted') + geom_text(data=labdf, aes(full.set, y=ypos, label=lab), color='black') + scale_color_manual(values=devals, name='Significance:') + theme_pubr() + coord_flip() + labs(x='Condition', y='Estimated effect size') regpref = paste0(imgpref, 'reg_effectsizes_results_bytype_full') saveGGplot(gp, paste0(regpref, 'errorbar'), w=9, h=3.5) for (fusion.var in unique(fulldf$fusion.split)){ print(fusion.var) resdf = fulldf[fulldf$fusion.split == fusion.var,] labdf = resdf[resdf$padj < 0.05,] labdf$lab = paste0('p=', sprintf('%0.1e', labdf$padj)) ypos = 0.2 gp = ggplot(resdf, aes(coef, Est, ymin=Est-2*SE, ymax=Est+2*SE, label=p, color=factor(col))) + facet_grid(. ~ set) + geom_point() + geom_errorbar(width=.2) + geom_hline(yintercept=0, lty='dotted') + geom_text(data=labdf, aes(coef, y=ypos, label=lab), color='black') + scale_color_manual(values=devals, name='Significance:') + theme_pubr() + coord_flip() + labs(x='Condition', y='Estimated effect size') regpref = paste0(imgpref, 'reg_effectsizes_results_bytype_', fusion.var) saveGGplot(gp, paste0(regpref, 'errorbar'), w=8, h=4) }