#!/usr/bin/R # ------------------------------------------ # Plot the number of fusions, for revisions: # + statistics on distance between genes # Updated: 03/23/23 # ------------------------------------------ library(cbrbase) set_proj('DEVTRAJ', 'mosaicism') library(MASS) library(ggplot2) library(ggpubr) library(ComplexHeatmap) library(circlize) source(paste0(bindir, 'mosaicism/fusions_AD430/load_metadata.R')) source(paste0(bindir, 'multiRegion/auxiliary_plotting_settings.R')) # Directories: plotdir = paste0(imgdir, 'fusions/') imgpref = paste0(plotdir, 'AD430_fusions_') cmd = paste('mkdir -p', plotdir) system(cmd) # Load in the gene fusion data: # ----------------------------- source(paste0(bindir, 'mosaicism/fusions_AD430/load_fusioncalls.R')) source(paste0(bindir, 'mosaicism/genefusions/auxiliary_fusion_plotting.R')) # Plot the number of fusions overall: mx = 20 nfdf = sub.ctdf[grep("^SM_171013", sub.ctdf$dataset, invert=TRUE),] # Remove batch (v2) nfdf$count[nfdf$count > mx] = mx nfdf = aggregate(barcode ~ count, nfdf, length) gp = ggplot(nfdf, aes(count, barcode)) + geom_bar(fill='grey75', stat='identity', width=.90) + geom_text(data=nfdf, aes(count, barcode + 10, label=barcode), adj=-.1, angle=90, cex=3.5) + scale_y_continuous(labels=scales::comma, expand=expansion(mult=c(0, .2))) + scale_x_continuous(expand=c(0, 0)) + theme_pubr() + labs(x='Number of chimeric reads', y='Number of cells') pltprefix = paste0(imgpref, 'nfusions_barplot') saveGGplot(gp, pltprefix, w=2.25, h=2.25) # Statistics on gene distance: # ---------------------------- # Get order of genes, by TSS: gencode_version = 'v28.primary_assembly' anno = read.delim(paste0('Annotation/Tss.', gencode_version, '.tsv'), header=F, stringsAsFactors=F) names(anno) = c('chr', 'TSS', 'gene', 'type', 'symbol') gene.types = unique(anno$type) keep.types = c('protein_coding', 'antisense', 'lincRNA', gene.types[grep('pseudogene', gene.types)]) # Filter down TSS table, get gene ordering: anno = anno[anno$type %in% keep.types,] anno = anno[grep('^chr', anno$chr),] gdf = aggregate(TSS ~ chr + symbol, anno, min) gdf = gdf[order(gdf$TSS),] gdf = gdf[order(gdf$chr),] gdf$gene.ind = 1:nrow(gdf) kept.genes = unique(gdf$symbol) flag.genes = table(gdf$symbol) # Remove the chrY gene for duplicated chrX/chrY flag.genes = names(flag.genes)[flag.genes > 1] gdf = gdf[!((gdf$symbol %in% flag.genes) & (gdf$chr == 'chrY')),] rownames(gdf) = gdf$symbol # Annotate calls with number of intervening genes: # ------------------------------------------------ # Get filtered calls: filtfile = paste0(fsdir, prefix, '.filtered.nonmerge.tsv') full.filt.fsdf = read.delim(filtfile, header=T) # Keep only genes in annotation: filt.fsdf = full.filt.fsdf[(full.filt.fsdf$LeftSymbol %in% kept.genes) & (full.filt.fsdf$RightSymbol %in% kept.genes),] filt.genes = unique(c(filt.fsdf$LeftSymbol, filt.fsdf$RightSymbol)) # Add the gene index: filt.fsdf$LeftInd = gdf[filt.fsdf$LeftSymbol, 'gene.ind'] filt.fsdf$RightInd = gdf[filt.fsdf$RightSymbol, 'gene.ind'] # Aggregate and plot index distances: # ----------------------------------- mx = 20 intra.fsdf = filt.fsdf[filt.fsdf$fs.set == 'INTRA',] intra.fsdf$ind.dist = with(intra.fsdf, abs(RightInd - LeftInd) - 1) intra.fsdf$ind.dist[intra.fsdf$ind.dist > mx] = mx ind.df = aggregate(fsid ~ ind.dist, intra.fsdf, length) gp = ggplot(ind.df, aes(ind.dist, fsid)) + geom_bar(fill='grey75', stat='identity', width=.90) + geom_text(data=ind.df, aes(ind.dist, fsid + 10, label=fsid), adj=-.1, angle=90, cex=3.5) + scale_y_continuous(labels=scales::comma, expand=expansion(mult=c(0, .1))) + scale_x_continuous(expand=c(0, 0)) + theme_pubr() + labs(x='Number of intervening genes', y='Number of fusion events') pltprefix = paste0(imgpref, 'inddist_barplot') saveGGplot(gp, pltprefix, w=4, h=2.25) # Add the SpliceType variable: ind.df = aggregate(fsid ~ ind.dist + SpliceType, intra.fsdf, length) gp = ggplot(ind.df, aes(ind.dist, fsid, fill=SpliceType)) + geom_bar(position='fill', stat='identity', width=.90) + # geom_text(data=ind.df, aes(ind.dist, fsid + 10, label=fsid), adj=-.1, angle=90, cex=3.5) + scale_y_continuous(labels=scales::comma, expand=expansion(mult=c(0, .1))) + scale_x_continuous(expand=c(0, 0)) + theme_pubr() + labs(x='Number of intervening genes', y='Number of fusion events') pltprefix = paste0(imgpref, 'inddist_splice_barplot') saveGGplot(gp, pltprefix, w=4, h=2.25) # Plot the distance distribution itself: # -------------------------------------- gp = ggplot(intra.fsdf, aes(Distance / 1e6)) + geom_histogram(fill='grey75') + scale_y_continuous(labels=scales::comma, expand=expansion(mult=c(0, .1))) + scale_x_continuous(labels=scales::comma, expand=c(0, 0)) + theme_pubr() + labs(x='Distance between breakpoints (Mb)', y='Number of fusion events') pltprefix = paste0(imgpref, 'distance_histogram') saveGGplot(gp, pltprefix, w=4, h=2.25) dist.cap = 5e6 gp = ggplot(intra.fsdf[intra.fsdf$Distance < dist.cap,], aes(Distance / 1e6)) + geom_histogram(bins=50, fill='grey75') + scale_y_continuous(labels=scales::comma, expand=expansion(mult=c(0, .1))) + scale_x_continuous(labels=scales::comma, expand=c(0, 0)) + theme_pubr() + labs(x='Distance between breakpoints (Mb)', y='Number of fusion events') pltprefix = paste0(imgpref, 'distance_capped_histogram') saveGGplot(gp, pltprefix, w=4, h=2.25) intra.fsdf$chr = sub(":.*","", intra.fsdf$RightBreakpoint) intra.fsdf$chr = factor(intra.fsdf$chr, levels = paste0('chr', c(1:22, 'X', 'Y'))) gp = ggplot(intra.fsdf, aes(Distance / 1e6)) + facet_wrap(~chr) + geom_histogram(fill='grey75') + scale_y_continuous(labels=scales::comma, expand=expansion(mult=c(0, .1))) + scale_x_continuous(labels=scales::comma, expand=c(0, 0)) + theme_pubr() + labs(x='Distance between breakpoints (Mb)', y='Number of fusion events') pltprefix = paste0(imgpref, 'distance_histogram_by_chr') saveGGplot(gp, pltprefix, w=10, h=7) sum(intra.fsdf$Distance < 25000) / nrow(intra.fsdf) # Plot additional statistics for revision: # ---------------------------------------- # Number of fusions with only reference splice sites: stdf = aggregate(fsid ~ SpliceType + fs.set, filt.fsdf, length) pltmat = pivot.tomatrix(stdf, 'fs.set', 'fsid') ux = 1.5 ht = Heatmap(pltmat, col=col1, name='# Fusions', use_raster=FALSE, width=ncol(pltmat) * unit(2 * ux, 'mm'), height=nrow(pltmat) * unit(ux, 'mm'), border_gp=gpar(lwd=.5, color='black'), row_dend_width = unit(ux / 2, "mm"), column_dend_height = unit(ux /2 , "mm"), cell_fun = function(j, i, x, y, w, h, col){ grid.text(pltmat[i,j], x, y)}) h = 2 + nrow(pltmat) / 15 w = 2 + 2 * ncol(pltmat) / 15 pltprefix = paste0(imgpref, 'splicetype_vs_fusiontype_heatmap') saveHeatmap(ht, pltprefix, w=w, h=h) fisher.test(pltmat) sweep(pltmat, 2, colSums(pltmat), '/') gp = ggplot(stdf, aes(fs.set, fsid, fill=SpliceType)) + geom_bar(stat='identity', position='fill') + scale_y_continuous(labels=scales::comma, expand=expansion(mult=c(0, .1))) + theme_pubr() + labs(x='Breakpoint types', y='Number of fusion events') pltprefix = paste0(imgpref, 'splicetype_barplot') saveGGplot(gp, pltprefix, w=2.25, h=2.25)