#!/usr/bin/R # --------------------------------------------- # Calculate the overlaps between fusion genes # and other gene sets # - hypergeometric # - regression with covars for expr, length # - other analyses (interaction terms, reviews) # Updated: 03/29/23 # --------------------------------------------- library(cbrbase) set_proj('DEVTRAJ', 'mosaicism') library(MASS) library(ggplot2) library(ggpubr) library(ggrepel) library(ComplexHeatmap) library(circlize) source(paste0(bindir, 'mosaicism/fusions_AD430/load_metadata.R')) source(paste0(bindir, 'multiRegion/auxiliary_plotting_settings.R')) # Directories: plotdir = paste0(imgdir, 'fusions/') imgpref = paste0(plotdir, 'AD430_fusions_') cmd = paste('mkdir -p', plotdir) system(cmd) # Load in the gene fusion data: # ----------------------------- source(paste0(bindir, 'mosaicism/fusions_AD430/load_fusioncalls.R')) source(paste0(bindir, 'mosaicism/genefusions/auxiliary_fusion_plotting.R')) pcols = brewer.pal(12, 'Paired') devals = c('NS'='grey80', 'Down'=pcols[2], 'Up'=pcols[6]) # Statistics on gene distance: # ---------------------------- # Get order of genes, by TSS: gencode_version = 'v28.primary_assembly' geneanno = read.delim(paste0('Annotation/gene_info.', gencode_version, '.annotation.gtf'), header=F, stringsAsFactors=F, sep=" ") names(geneanno) = c('chr', 'start', 'end', 'strand', 'symbol') geneanno$length = with(geneanno, end - start) # Expression statistics: # NOTE: ONLY FOR EXCITATORY NEURONS: # ---------------------- # Read in number of counts from the snPFC dataset: count.df = read.delim(paste0(dbdir, 'snPFC_exc_nCounts.csv'), header=T, sep=",") count.df = count.df[,c('index','n_counts')] names(count.df) = c('symbol', 'n_counts') nexc = nrow(count.df) expr.genes = unique(count.df$symbol) count.df = merge(count.df, geneanno[,c('symbol', 'length')], all.x=TRUE, all.y=TRUE) # Read gene sets in: # ------------------ # Signatures sigfile = 'Annotation/fusion_signature_lists.tsv' sgdf = read.delim(sigfile, header=T) kwds = unique(sgdf$kwd) genelist = lapply(kwds, function(x){ sgdf$gene[sgdf$kwd == x] }) names(genelist) = kwds # Recurrent DSB cluster (RDC) genes: genelist[['RDC']] = read.delim(paste0(dbdir, '10xONT/rdc_genes.tsv'))[,2] # GWAS lists from Vishnu: # Short list: gw1 = read.delim('Annotation/studies_GCST90027158-associations-2023-01-3.csv', header=T, sep=",") gw1 = unique(gw1$Mapped.gene) gw1 = unique(unlist(strsplit(gw1, split=", "))) gw1 = intersect(gw1, expr.genes) genelist[['GWAS']] = gw1 # 91 expr. # Long list: gw2 = read.delim("Annotation/efotraits_MONDO_0004975-associations-2023-03-14.txt", header=T) gw2 = unique(gw2$Mapped.gene) ##1516## gw2 = unique(unlist(strsplit(gw2, split=", "))) ##1218 gw2 = intersect(gw2, expr.genes) genelist[['GWAS2']] = gw2 # 1329 expr # NOTE: Could add these (but won't for now to keep it simple) # AD-DEGs # Housekeeping genes # cell type-specific markers # Get filtered calls, annotate with cell type: # -------------------------------------------- # Get filtered calls: filtfile = paste0(fsdir, prefix, '.filtered.nonmerge.tsv') filt.fsdf = read.delim(filtfile, header=T) # Merge with sample-level metadata: filt.fsdf = merge(filt.fsdf, unique(fus.meta[,c('sample','dataset')]), all.x=TRUE) filt.fsdf$barcode = with(filt.fsdf, paste0(dataset, ':', CB)) filt.fsdf = merge(cellmeta[,c('barcode', 'celltype')], filt.fsdf) # Filter to excitatory, make sure coincides with expr.genes # --------------------------------------------------------- exc.fsdf = filt.fsdf[filt.fsdf$celltype == 'Exc',] exc.fsdf = exc.fsdf[(exc.fsdf$LeftSymbol %in% expr.genes) & (exc.fsdf$RightSymbol %in% expr.genes),] exc.genes = unique(c(exc.fsdf$LeftSymbol, exc.fsdf$RightSymbol)) hdf = as.data.frame(table(c(exc.fsdf$LeftSymbol, exc.fsdf$RightSymbol))) names(hdf) = c('symbol', 'nfusion') # Run hypergeometric test for all sets: # ------------------------------------- sets = names(genelist) smat = sapply(sets, function(x){ genes = intersect(genelist[[x]], expr.genes) c(length(intersect(genes, exc.genes)), length(genes)) }) cdf = data.frame(nint=smat[1,], nset=smat[2,], nfusion=length(exc.genes), ntot=length(expr.genes)) cdf$p = apply(cdf[,c('nint','nset','nfusion','ntot')], 1, run.hyper) cdf$log2FC = log2((cdf$nint / cdf$nset) / (cdf$nfusion / cdf$ntot)) cdf$set = rownames(cdf) cdf$p.adj = p.adjust(cdf$p, 'BH') cdf = cdf[order(cdf$log2FC),] cdf$set = factor(cdf$set, levels=cdf$set) gp = ggplot(cdf, aes(log2FC, set, size=-log10(p.adj))) + geom_point() + geom_segment(data=cdf, aes(x=0, y=set, xend=log2FC, yend=set), size=.25, lty='dashed') + geom_vline(xintercept=0, lty='dotted') + scale_x_continuous(labels=scales::comma, expand=expansion(mult=c(0, .1))) + theme_pubr() pltprefix = paste0(imgpref, 'hypergeom_bubbleplot') saveGGplot(gp, pltprefix, w=4.5, h=4) # Instead by regression, accounting for length + expression # --------------------------------------------------------- df = count.df[count.df$symbol %in% expr.genes,] df = merge(df, hdf, all.x=TRUE) df$nfusion[is.na(df$nfusion)] = 0 df$hasfusion = (df$nfusion > 0) * 1 for (set in c('RDC', 'GWAS', 'GWAS2')){ df[[set]] = df$symbol %in% genelist[[set]] } # Save these for sharing genefile = 'fusions_AD430/fusions_AD430_exprgenes_withfusions.tsv' write.table(df, genefile, quote=F, sep="\t", row.names=F) gwdf = df[df$GWAS,] gwdf = gwdf[order(gwdf$nfusion, decreasing=T),] reg.var = 'hasfusion' reg.var = 'nfusion' fulldf = c() for (reg.var in c('hasfusion', 'nfusion')){ for (use.covars in c(TRUE, FALSE)){ # Run all sets for this condition: resdf = c() for (set in sets){ print(set) df$in.set = df$symbol %in% genelist[[set]] form.vec = c(reg.var, '~ in.set') if (use.covars){ form.vec = c(form.vec, ' + log(n_counts) + log(length)') } form = asform(form.vec) if (reg.var == 'hasfusion'){ # Logistic regression: fit = glm(form, df, family=binomial('logit')) } else { # NB regression: fit = glm.nb(form, df) } cfit = as.data.frame(coefficients(summary(fit))) names(cfit) = c('Est', 'SE', 'z', 'p') cfit$set = set cfit$reg.var = reg.var cfit$covars = ifelse(use.covars, 'AdjCovar', 'NoCovar') resdf = rbind(resdf, cfit[2,, drop=F]) } resdf = resdf[order(resdf$p),] resdf$p.adj = p.adjust(resdf$p, 'BH') resdf$col = ifelse(resdf$p.adj < 0.05, ifelse(resdf$Est > 0, 'Up', 'Down'), 'NS') fulldf = rbind(fulldf, resdf) } } gp = ggplot(fulldf, aes(Est, -log10(p), label=set, color=col)) + facet_wrap(reg.var ~ covars, scales='free') + geom_point() + geom_text_repel(box.padding=0.075, cex=4, point.padding=0.1,max.overlaps=20, min.segment.length=0.1) + geom_vline(xintercept=0, lty='dotted') + scale_y_continuous(labels=scales::comma, expand=expansion(mult=c(0, .1))) + scale_color_manual(values=devals) + theme_pubr() pltprefix = paste0(imgpref, 'geneset_overlap_regression_conditions_volcano') saveGGplot(gp, pltprefix, w=8, h=8) resdf = fulldf[(fulldf$covars == 'AdjCovar') & (fulldf$reg.var == 'nfusion'),] gp = ggplot(resdf, aes(Est, -log10(p), label=set, color=col)) + geom_point() + geom_text_repel(box.padding=0.01, cex=4, point.padding=0.25,max.overlaps=50, min.segment.length=0.1) + geom_vline(xintercept=0, lty='dotted') + scale_y_continuous(labels=scales::comma, expand=expansion(mult=c(0, .1))) + scale_color_manual(values=devals) + theme_pubr() pltprefix = paste0(imgpref, 'geneset_overlap_regression_adjNFusion_volcano') saveGGplot(gp, pltprefix, w=4, h=4.25) # Plot these as bubble plots instead: # ----------------------------------- resdf = fulldf[(fulldf$covars == 'AdjCovar') & (fulldf$reg.var == 'nfusion'),] resdf = resdf[order(resdf$Est),] resdf$set = factor(resdf$set, levels=resdf$set) gp = ggplot(resdf, aes(Est, set, size=-log10(p), color=col)) + geom_point() + geom_segment(data=resdf, aes(x=0, y=set, xend=Est, yend=set), size=.25, lty='dashed') + geom_vline(xintercept=0, lty='dotted') + scale_x_continuous(labels=scales::comma, expand=expansion(mult=c(.1, .1))) + scale_color_manual(values=devals) + labs(x='Estimated effect of set on # fusions\n(NB reg. controlling for length, expr.)') + theme_pubr() pltprefix = paste0(imgpref, 'geneset_overlap_regression_adjNFusion_bubbleplot') saveGGplot(gp, pltprefix, w=5, h=4) resdf = fulldf[(fulldf$covars == 'NoCovar') & (fulldf$reg.var == 'nfusion'),] resdf = resdf[order(resdf$Est),] resdf$set = factor(resdf$set, levels=resdf$set) gp = ggplot(resdf, aes(Est, set, size=-log10(p), color=col)) + geom_point() + geom_segment(data=resdf, aes(x=0, y=set, xend=Est, yend=set), size=.25, lty='dashed') + geom_vline(xintercept=0, lty='dotted') + scale_x_continuous(labels=scales::comma, expand=expansion(mult=c(.1, .1))) + scale_color_manual(values=devals) + labs(x='Estimated effect of set on # fusions\n(NB regression)') + theme_pubr() pltprefix = paste0(imgpref, 'geneset_overlap_regression_NoadjNFusion_bubbleplot') saveGGplot(gp, pltprefix, w=5, h=4) # Add statistics covariates for length and expression # --------------------------------------------------- df = count.df[count.df$symbol %in% expr.genes,] df = merge(df, hdf, all.x=TRUE) df$nfusion[is.na(df$nfusion)] = 0 reg.var = 'nfusion' use.covars = FALSE # Run all sets for this condition: resdf = c() covars = c('log(n_counts)', 'log(length)') for (test.var in covars){ print(test.var) form.vec = c(reg.var, '~', test.var) form = asform(form.vec) if (reg.var == 'hasfusion'){ # Logistic regression: fit = glm(form, df, family=binomial('logit')) } else { # NB regression: fit = glm.nb(form, df) } cfit = as.data.frame(coefficients(summary(fit))) names(cfit) = c('Est', 'SE', 'z', 'p') cfit$set = test.var cfit$reg.var = reg.var cfit$covars = ifelse(use.covars, 'AdjCovar', 'NoCovar') resdf = rbind(resdf, cfit[2,, drop=F]) } resdf = resdf[order(resdf$p),] resdf$p.adj = p.adjust(resdf$p, 'BH') resdf$col = ifelse(resdf$p.adj < 0.05, ifelse(resdf$Est > 0, 'Up', 'Down'), 'NS') fulldf = rbind(fulldf, resdf) # Add to the full dataset: # Add interaction model for AD: # ----------------------------- metadata$bd = 'CTRL' metadata$bd[metadata$braaksc %in% c('5','6')] = 'AD' metadata$nrad = 'CTRL' metadata$nrad[metadata$niareagansc %in% c('1','2')] = 'AD' metadata$nrad = factor(metadata$nrad, levels=c('CTRL', 'AD')) metadata$bd = factor(metadata$bd, levels=c('CTRL', 'AD')) exc.fsdf = merge(exc.fsdf, metadata[,c('projid', 'bd', 'nrad')]) df2 = gather(exc.fsdf[,c('LeftSymbol', 'RightSymbol', 'bd', 'nrad')], set, symbol, -nrad, -bd) intdf = c() for (ad.var in c('nrad', 'bd')){ # Aggregate # fusions by condition nr.df = agg.rename(asform(c('set ~ symbol + ', ad.var)), df2, length, 'nfusion') # Merge with expr.genes: df = count.df[count.df$symbol %in% expr.genes,] df = merge(df, unique(nr.df[, ad.var, drop=F])) df = merge(df, nr.df, all.x=TRUE) df$nfusion[is.na(df$nfusion)] = 0 df$hasfusion = (df$nfusion > 0) * 1 reg.var = 'nfusion' for (use.covars in c(TRUE, FALSE)){ # Run all sets for this condition: resdf = c() for (set in sets){ print(set) df$in.set = df$symbol %in% genelist[[set]] form.vec = c(reg.var, '~ in.set * ', ad.var) # Interaction model if (use.covars){ form.vec = c(form.vec, ' + log(n_counts) + log(length)') } form = asform(form.vec) # NB regression: fit = glm.nb(form, df) cfit = as.data.frame(coefficients(summary(fit))) names(cfit) = c('Est', 'SE', 'z', 'p') cfit$set = set cfit$reg.var = reg.var cfit$covars = ifelse(use.covars, 'AdjCovar', 'NoCovar') cfit$fit.var = rownames(cfit) fitvar = grep(paste0(":", ad.var), rownames(cfit)) resdf = rbind(resdf, cfit[fitvar,, drop=F]) } resdf = resdf[order(resdf$p),] resdf$ad.var = ad.var resdf$p.adj = p.adjust(resdf$p, 'BH') resdf$col = ifelse(resdf$p.adj < 0.05, ifelse(resdf$Est > 0, 'Up', 'Down'), 'NS') intdf = rbind(intdf, resdf) } } resdf = intdf[(intdf$covars == 'NoCovar') & (intdf$ad.var == 'nrad'),] # NOTE: Flip Est, for +interaction in AD resdf = resdf[order(-resdf$Est),] resdf$set = factor(resdf$set, levels=resdf$set) gp = ggplot(resdf, aes(-Est, set, size=-log10(p), color=col)) + geom_point() + geom_segment(data=resdf, aes(x=0, y=set, xend=-Est, yend=set), size=.25, lty='dashed') + geom_vline(xintercept=0, lty='dotted') + scale_x_continuous(labels=scales::comma, expand=expansion(mult=c(.1, .1))) + scale_color_manual(values=devals) + labs(x='Estimated effect of set on # fusions\n(NB regression)') + theme_pubr() pltprefix = paste0(imgpref, 'geneset_overlap_regression_interactionNIAReagan_NoCovarNFusion_bubbleplot') saveGGplot(gp, pltprefix, w=5, h=4) # Plot final version, with length + interactions: # ----------------------------------------------- sub.intdf = intdf[(intdf$covars == 'NoCovar') & (intdf$ad.var == 'nrad'),] sub.intdf$fit.var = NULL sub.intdf$ad.var = NULL sub.intdf$type = ' * AD' sub.intdf$Est = -sub.intdf$Est # Flip for AD effect resdf = fulldf[(fulldf$covars == 'NoCovar') & (fulldf$reg.var == 'nfusion'),] resdf$type = '' resdf = rbind(resdf, sub.intdf) resdf = resdf[resdf$set %in% c(covars, 'GWAS2', 'GWAS', 'RDC'),] resdf$set = paste0(resdf$set, resdf$type) # Adjust within this test set: resdf$p.adj = p.adjust(resdf$p, 'BH') resdf$col = ifelse(resdf$p.adj < 0.05, ifelse(resdf$Est > 0, 'Up', 'Down'), 'NS') resdf = resdf[order(resdf$Est),] resdf$set = factor(resdf$set, levels=resdf$set) resdf$lp = -log10(resdf$p) resdf$lp[resdf$lp > 30] = 30 gp = ggplot(resdf, aes(Est, set, size=lp, color=col)) + geom_point() + geom_segment(data=resdf, aes(x=0, y=set, xend=Est, yend=set), size=.25, lty='dashed') + geom_vline(xintercept=0, lty='dotted') + scale_x_continuous(labels=scales::comma, expand=expansion(mult=c(.1, .1))) + scale_color_manual(values=devals) + labs(x='Estimated effect of set on # fusions\n(NB regression)') + theme_pubr() pltprefix = paste0(imgpref, 'geneset_overlap_regression_joint_NoadjNFusion_bubbleplot') saveGGplot(gp, pltprefix, w=5, h=3) # Interaction model of length * AD for reviewers: # ----------------------------------------------- reg.var = 'nfusion' use.covars = FALSE # Run all sets for this condition: cvintdf = c() covars = c('log(n_counts)', 'log(length)') for (ad.var in c('nrad', 'bd')){ # Aggregate # fusions by condition nr.df = agg.rename(asform(c('set ~ symbol + ', ad.var)), df2, length, 'nfusion') # Merge with expr.genes: df = count.df[count.df$symbol %in% expr.genes,] df = merge(df, unique(nr.df[, ad.var, drop=F])) df = merge(df, nr.df, all.x=TRUE) df$nfusion[is.na(df$nfusion)] = 0 df$hasfusion = (df$nfusion > 0) * 1 resdf = c() for (test.var in covars){ print(test.var) form.vec = c(reg.var, '~', test.var, '*', ad.var) form = asform(form.vec) # NB regression: fit = glm.nb(form, df) cfit = as.data.frame(coefficients(summary(fit))) names(cfit) = c('Est', 'SE', 'z', 'p') cfit$set = test.var cfit$reg.var = reg.var cfit$covars = ifelse(use.covars, 'AdjCovar', 'NoCovar') fitvar = grep(paste0(":", ad.var), rownames(cfit)) resdf = rbind(resdf, cfit[fitvar,, drop=F]) } resdf = resdf[order(resdf$p),] resdf$ad.var = ad.var resdf$p.adj = p.adjust(resdf$p, 'BH') resdf$col = ifelse(resdf$p.adj < 0.05, ifelse(resdf$Est > 0, 'Up', 'Down'), 'NS') cvintdf = rbind(resdf, cvintdf) }