#!/usr/bin/R # ------------------------------------------------ # Calculate the overlaps between fusion genes # and other gene sets # - hypergeometric # - regression with covars for expr, length # Updated: 03/29/23 # ------------------------------------------------ library(cbrbase) set_proj('DEVTRAJ') # Libraries: library(tidyr) library(ggplot2) library(MASS) library(ggpubr) library(RColorBrewer) options(width=170) # Load in and process data (saves to matrices): # project = 'ALZ_repl' project = 'ALZ' commandArgs <- function(){ c(project)} source(paste0(bindir, 'mosaicism/load_metadata.R')) # Make directories for analysis: analysis = 'fusion' imgdir = paste0(img, project, '/', analysis , '/') imgpref = paste0(imgdir, analysis, "_mspred_", project, '_') cmd = paste0('mkdir -p ', imgdir) system(cmd) # Load in the two cohorts of candidate gene fusion counts: # -------------------------------------------------------- source(paste0(bindir, 'mosaicism/genefusions/load_fusion_data.R')) source(paste0(bindir, 'mosaicism/genefusions/auxiliary_fusion_plotting.R')) # Statistics on gene distance: # ---------------------------- # Get order of genes, by TSS: gencode_version = 'v28.primary_assembly' geneanno = read.delim(paste0('Annotation/gene_info.', gencode_version, '.annotation.gtf'), header=F, stringsAsFactors=F, sep=" ") names(geneanno) = c('chr', 'start', 'end', 'strand', 'symbol') geneanno$length = with(geneanno, end - start) # Expression statistics: # NOTE: ONLY FOR EXCITATORY NEURONS: # ---------------------- # Read in number of counts from the snPFC dataset: count.df = read.delim(paste0(dbdir, 'snPFC_exc_nCounts.csv'), header=T, sep=",") count.df = count.df[,c('index','n_counts')] names(count.df) = c('symbol', 'n_counts') nexc = nrow(count.df) expr.genes = unique(count.df$symbol) count.df = merge(count.df, geneanno[,c('symbol', 'length')], all.x=TRUE, all.y=TRUE) # Read gene sets in: # ------------------ # Signatures sigfile = 'Annotation/fusion_signature_lists.tsv' sgdf = read.delim(sigfile, header=T) kwds = unique(sgdf$kwd) genelist = lapply(kwds, function(x){ sgdf$gene[sgdf$kwd == x] }) names(genelist) = kwds # GWAS: gwdf = read.delim(paste0('Annotation/20210915_ADGENES_CHROM_Tanzi.tsv'), header=T) genelist[['GWAS']] = unique(gwdf$gene[gwdf$evidence == 'GWAS']) # RDC: genelist[['RDC']] = read.delim(paste0(dbdir, '10xONT/rdc_genes.tsv'))[,2] # Alternative GWAS list from Vishnu gwas = read.delim("Annotation/efotraits_MONDO_0004975-associations-2023-03-14.txt", header=T) gwas = unique(gwas$Mapped.gene) ##1516## gwas = unique(unlist(strsplit(gwas, split=", "))) ##1218 gwas = intersect(gwas, expr.genes) genelist[['GWAS2']] = gwas # Filter to excitatory, make sure coincides with expr.genes # --------------------------------------------------------- fsdf = fsdf[fsdf$cohort == 'ALZ',] fsdf = merge(fsdf, meta[,c('sample','relabel')]) exc.fsdf = fsdf[fsdf$relabel == 'Excitatory',] exc.fsdf = exc.fsdf[(exc.fsdf$LeftSymbol %in% expr.genes) & (exc.fsdf$RightSymbol %in% expr.genes),] exc.genes = unique(c(exc.fsdf$LeftSymbol, exc.fsdf$RightSymbol)) hdf = as.data.frame(table(c(exc.fsdf$LeftSymbol, exc.fsdf$RightSymbol))) names(hdf) = c('symbol', 'nfusion') # Run hypergeometric test for all sets: # ------------------------------------- sets = names(genelist) smat = sapply(sets, function(x){ genes = intersect(genelist[[x]], expr.genes) c(length(intersect(genes, exc.genes)), length(genes)) }) cdf = data.frame(nint=smat[1,], nset=smat[2,], nfusion=length(exc.genes), ntot=length(expr.genes)) cdf$p = apply(cdf[,c('nint','nset','nfusion','ntot')], 1, run.hyper) cdf$log2FC = log2((cdf$nint / cdf$nset) / (cdf$nfusion / cdf$ntot)) cdf$set = rownames(cdf) cdf$p.adj = p.adjust(cdf$p, 'BH') cdf = cdf[order(cdf$log2FC),] cdf$set = factor(cdf$set, levels=cdf$set) gp = ggplot(cdf, aes(log2FC, set, size=-log10(p.adj))) + geom_point() + geom_segment(data=cdf, aes(x=0, y=set, xend=log2FC, yend=set), size=.25, lty='dashed') + geom_vline(xintercept=0, lty='dotted') + scale_x_continuous(labels=scales::comma, expand=expansion(mult=c(0, .1))) + theme_pubr() pltprefix = paste0(imgpref, 'hypergeom_bubbleplot') saveGGplot(gp, pltprefix, w=4.5, h=4) # Instead by regression, accounting for length + expression # --------------------------------------------------------- df = count.df[count.df$symbol %in% expr.genes,] df = merge(df, hdf, all.x=TRUE) df$nfusion[is.na(df$nfusion)] = 0 df$hasfusion = (df$nfusion > 0) * 1 for (set in c('RDC', 'GWAS', 'GWAS2')){ df[[set]] = df$symbol %in% genelist[[set]] } # Save these for sharing genefile = 'ALZ_smartseq2_exprgenes_withfusions.tsv' write.table(df, genefile, quote=F, sep="\t", row.names=F) gwdf = df[df$GWAS,] gwdf = gwdf[order(gwdf$nfusion, decreasing=T),] # Run regressions (same as analysis for 10x fusions) reg.var = 'hasfusion' reg.var = 'nfusion' fulldf = c() for (reg.var in c('hasfusion', 'nfusion')){ for (use.covars in c(TRUE, FALSE)){ # Run all sets for this condition: resdf = c() for (set in sets){ print(set) df$in.set = df$symbol %in% genelist[[set]] form.vec = c(reg.var, '~ in.set') if (use.covars){ form.vec = c(form.vec, ' + log(n_counts) + log(length)') } form = asform(form.vec) if (reg.var == 'hasfusion'){ # Logistic regression: fit = glm(form, df, family=binomial('logit')) } else { # NB regression: fit = glm.nb(form, df) } cfit = as.data.frame(coefficients(summary(fit))) names(cfit) = c('Est', 'SE', 'z', 'p') cfit$set = set cfit$reg.var = reg.var cfit$covars = ifelse(use.covars, 'AdjCovar', 'NoCovar') resdf = rbind(resdf, cfit[2,, drop=F]) } resdf = resdf[order(resdf$p),] resdf$p.adj = p.adjust(resdf$p, 'BH') resdf$col = ifelse(resdf$p.adj < 0.05, ifelse(resdf$Est > 0, 'Up', 'Down'), 'NS') fulldf = rbind(fulldf, resdf) } } # Plot these as bubble plots: # --------------------------- resdf = fulldf[(fulldf$covars == 'AdjCovar') & (fulldf$reg.var == 'nfusion'),] resdf = resdf[order(resdf$Est),] resdf$set = factor(resdf$set, levels=resdf$set) gp = ggplot(resdf, aes(Est, set, size=-log10(p), color=col)) + geom_point() + geom_segment(data=resdf, aes(x=0, y=set, xend=Est, yend=set), size=.25, lty='dashed') + geom_vline(xintercept=0, lty='dotted') + scale_x_continuous(labels=scales::comma, expand=expansion(mult=c(.1, .1))) + scale_color_manual(values=devals) + labs(x='Estimated effect of set on # fusions\n(NB reg. controlling for length, expr.)') + theme_pubr() pltprefix = paste0(imgpref, 'geneset_overlap_regression_adjNFusion_bubbleplot') saveGGplot(gp, pltprefix, w=5, h=4) resdf = fulldf[(fulldf$covars == 'NoCovar') & (fulldf$reg.var == 'nfusion'),] resdf = resdf[order(resdf$Est),] resdf$set = factor(resdf$set, levels=resdf$set) gp = ggplot(resdf, aes(Est, set, size=-log10(p), color=col)) + geom_point() + geom_segment(data=resdf, aes(x=0, y=set, xend=Est, yend=set), size=.25, lty='dashed') + geom_vline(xintercept=0, lty='dotted') + scale_x_continuous(labels=scales::comma, expand=expansion(mult=c(.1, .1))) + scale_color_manual(values=devals) + labs(x='Estimated effect of set on # fusions\n(NB regression)') + theme_pubr() pltprefix = paste0(imgpref, 'geneset_overlap_regression_NoadjNFusion_bubbleplot') saveGGplot(gp, pltprefix, w=5, h=4)