#!/usr/bin/R # ------------------------------------------------ # Plot the number of gene fusions (for revision) # Also plot the number of intervening genes # Updated: 03/24/23 # ------------------------------------------------ library(cbrbase) set_proj('DEVTRAJ') # Libraries: library(tidyr) library(ggplot2) library(MASS) library(ggpubr) library(RColorBrewer) options(width=170) # Load in and process data (saves to matrices): # project = 'ALZ_repl' project = 'ALZ' commandArgs <- function(){ c(project)} source(paste0(bindir, 'mosaicism/load_metadata.R')) # Make directories for analysis: analysis = 'fusion' imgdir = paste0(img, project, '/', analysis , '/') imgpref = paste0(imgdir, analysis, "_mspred_", project, '_') cmd = paste0('mkdir -p ', imgdir) system(cmd) # Load in the two cohorts of candidate gene fusion counts: # -------------------------------------------------------- source(paste0(bindir, 'mosaicism/genefusions/load_fusion_data.R')) source(paste0(bindir, 'mosaicism/genefusions/auxiliary_fusion_plotting.R')) # Plot the number of fusions overall: mx = 20 nfdf = ctdf[ctdf$cohort == 'ALZ',] nfdf$count[nfdf$count > mx] = mx nfdf = aggregate(sample ~ count, nfdf, length) gp = ggplot(nfdf, aes(count, sample)) + geom_bar(fill='grey75', stat='identity', width=.90) + geom_text(data=nfdf, aes(count, sample + 10, label=sample), adj=-.1, angle=90, cex=3.5) + scale_y_continuous(labels=scales::comma, expand=expansion(mult=c(0, .3))) + scale_x_continuous(expand=c(0, 0)) + theme_pubr() + labs(x='Number of chimeric reads', y='Number of cells') pltprefix = paste0(imgpref, 'nfusions_barplot') saveGGplot(gp, pltprefix, w=4, h=2.25) # Statistics on gene distance: # ---------------------------- # Get order of genes, by TSS: gencode_version = 'v28.primary_assembly' anno = read.delim(paste0('Annotation/Tss.', gencode_version, '.tsv'), header=F, stringsAsFactors=F) names(anno) = c('chr', 'TSS', 'gene', 'type', 'symbol') gene.types = unique(anno$type) keep.types = c('protein_coding', 'antisense', 'lincRNA', gene.types[grep('pseudogene', gene.types)]) # Filter down TSS table, get gene ordering: anno = anno[anno$type %in% keep.types,] anno = anno[grep('^chr', anno$chr),] gdf = aggregate(TSS ~ chr + symbol, anno, min) gdf = gdf[order(gdf$TSS),] gdf = gdf[order(gdf$chr),] gdf$gene.ind = 1:nrow(gdf) kept.genes = unique(gdf$symbol) flag.genes = table(gdf$symbol) # Remove the chrY gene for duplicated chrX/chrY flag.genes = names(flag.genes)[flag.genes > 1] gdf = gdf[!((gdf$symbol %in% flag.genes) & (gdf$chr == 'chrY')),] rownames(gdf) = gdf$symbol # Annotate calls with number of intervening genes: # ------------------------------------------------ full.filt.fsdf = fsdf # Save separately # Keep only genes in annotation: filt.fsdf = fsdf[(fsdf$LeftSymbol %in% kept.genes) & (fsdf$RightSymbol %in% kept.genes),] filt.genes = unique(c(filt.fsdf$LeftSymbol, filt.fsdf$RightSymbol)) # Add the gene index: filt.fsdf$LeftInd = gdf[filt.fsdf$LeftSymbol, 'gene.ind'] filt.fsdf$RightInd = gdf[filt.fsdf$RightSymbol, 'gene.ind'] # Aggregate and plot index distances: # ----------------------------------- mx = 20 intra.fsdf = filt.fsdf[filt.fsdf$fs.set == 'INTRA',] intra.fsdf$ind.dist = with(intra.fsdf, abs(RightInd - LeftInd) - 1) intra.fsdf$ind.dist[intra.fsdf$ind.dist > mx] = mx ind.df = aggregate(sample ~ ind.dist, intra.fsdf, length) gp = ggplot(ind.df, aes(ind.dist, sample)) + geom_bar(fill='grey75', stat='identity', width=.90) + geom_text(data=ind.df, aes(ind.dist, sample + 10, label=sample), adj=-.1, angle=90, cex=3.5) + scale_y_continuous(labels=scales::comma, expand=expansion(mult=c(0, .1))) + scale_x_continuous(expand=c(0, 0)) + theme_pubr() + labs(x='Number of intervening genes', y='Number of fusion events') pltprefix = paste0(imgpref, 'inddist_barplot') saveGGplot(gp, pltprefix, w=4, h=2.25) table(intra.fsdf$ind.dist)[1] / nrow(intra.fsdf)