#!/usr/bin/R # ---------------------------------------------- # Plot statistics on gene fusion data # Updated: 01/25/22 # ---------------------------------------------- library(cbrbase) set_proj('DEVTRAJ') # Libraries: library(tidyr) library(ggplot2) library(ggpubr) library(RColorBrewer) # Load in and process data (saves to matrices): # project = 'ALZ_repl' project = 'ALZ' commandArgs <- function(){ c(project)} source(paste0(bindir, 'mosaicism/load_metadata.R')) # Make directories for analysis: analysis = 'fusion' imgdir = paste0(img, project, '/', analysis , '/') imgpref = paste0(imgdir, analysis, "_mspred_", project, '_') cmd = paste0('mkdir -p ', imgdir) system(cmd) source(paste0(bindir, 'mosaicism/genefusions/auxiliary_fusion_plotting.R')) # Load in the two cohorts of candidate gene fusion counts: # -------------------------------------------------------- outpref = paste0(dbdir,'variants/ALZ_joint_fusion_datasets_') filesuff = 'maxsensitivity_prediction_numbers_wmeta' fssuff = 'maxsensitivity_abridged_fusions_filtered' count.tsv = paste0(outpref, filesuff, '.tsv') fusion.tsv = paste0(outpref, fssuff, '.tsv') ctdf = read.delim(count.tsv, header=T) fsdf = read.delim(fusion.tsv, header=T) # Plot hits distribution: # ----------------------- # Distribution by cohort: gp = ggplot(ctdf, aes(count, fill=cohort)) + geom_histogram() + scale_fill_manual(values=cohvals, name='Cohort:') + scale_y_continuous(labels=scales::comma, expand=c(0,0)) + labs(x='Number of fusion-candidate reads', y='Number of cells') + theme_pubr() + theme(legend.position=c(.8, .8)) pltprefix = paste0(imgpref, 'stats_distribution_cohort') w = 6; h = 3 ggsave(paste0(pltprefix,'.png'), gp, dpi=400, units='in', width=w, height=h) ggsave(paste0(pltprefix, '.pdf'), gp, dpi=400, units='in', width=w, height=h) # Distribution by platform: gp = ggplot(ctdf, aes(count, fill=platform)) + geom_histogram() + scale_y_continuous(labels=scales::comma, expand=c(0,0)) + labs(x='Number of fusion-candidate reads', y='Number of cells') + theme_pubr() + theme(legend.position=c(.8, .8)) pltprefix = paste0(imgpref, 'stats_distribution_platform') w = 6; h = 3 ggsave(paste0(pltprefix,'.png'), gp, dpi=400, units='in', width=w, height=h) ggsave(paste0(pltprefix, '.pdf'), gp, dpi=400, units='in', width=w, height=h) # Distribution by AD status: gp = ggplot(ctdf, aes(count, color=cogdx.ad)) + facet_wrap(~cohort) + geom_density() + scale_color_manual(values=colvals[['cogdx.ad']], name='AD cognition:') + scale_y_continuous(labels=scales::comma, expand=c(0,0)) + labs(x='Number of fusion-candidate reads', y='Density of cells') + theme_pubr() + theme(legend.position=c(.8, .6)) pltprefix = paste0(imgpref, 'stats_distribution_cogdx') w = 6; h = 3 ggsave(paste0(pltprefix,'.png'), gp, dpi=400, units='in', width=w, height=h) ggsave(paste0(pltprefix, '.pdf'), gp, dpi=400, units='in', width=w, height=h) # Cell type violins: gp = ggplot(ctdf[ctdf$n_counts > 10000,], aes(relabel, count, fill=relabel)) + facet_wrap(~cohort) + stat_compare_means() + geom_violin(scale='width') + geom_boxplot(width=.25) + scale_fill_manual(values=relabel.col, name='Cell type:') + labs(x='Cell type', y='Number of fusion-candidate reads (cells > 10k reads)') + theme_pubr() + theme(legend.position='none') + theme(axis.text.x=element_text(angle=90, hjust=1, vjust=.5)) pltprefix = paste0(imgpref, 'stats_distribution_celltypes') w = 7; h = 6 ggsave(paste0(pltprefix,'.png'), gp, dpi=400, units='in', width=w, height=h) ggsave(paste0(pltprefix, '.pdf'), gp, dpi=400, units='in', width=w, height=h) # Plot raw counts + % of cells w/ at the individual level: # -------------------------------------------------------- sub.ctdf = ctdf[ctdf$n_counts > 25000,] aggdf = aggregate(count1 ~ projid + cogdx.ad + cohort + platform, sub.ctdf, mean) gp = ggplot(aggdf, aes(platform, count1, color=cogdx.ad)) + geom_boxplot(outlier.shape=NA) + geom_jitter(position=position_jitterdodge(jitter.width = .25)) + stat_compare_means(label='p.format') + labs(x='Platform', y='% of individual\'s cells with a candidate fusion\n(of cells > 25k reads)') + scale_color_manual(values=colvals[['cogdx.ad']], name='AD cognition:') + theme_pubr() saveGGplot(gp, paste0(imgpref, 'stats_percent_fusion_ind_boxplot'), w=4, h=5) # Average number of counts: aggdf = aggregate(count ~ projid + cogdx.ad + cohort + platform, sub.ctdf, mean) gp = ggplot(aggdf, aes(platform, count, color=cogdx.ad)) + geom_boxplot(outlier.shape=NA) + geom_jitter(position=position_jitterdodge(jitter.width = .25)) + stat_compare_means(label='p.format') + labs(x='Platform', y='Average number of candidate fusions in\nindividual\'s cells (of cells > 25k reads)') + scale_color_manual(values=colvals[['cogdx.ad']], name='AD cognition:') + theme_pubr() saveGGplot(gp, paste0(imgpref, 'stats_averagenum_fusion_ind_boxplot'), w=4, h=5) # Plot by distance: # ----------------- # Compare to chromosome size - are they all equally spread? fsdf$RightChr = sapply(fsdf$RightBreakpoint, function(x){ strsplit(x, ':')[[1]][1]}) intra.fsdf = fsdf[fsdf$fs.set == 'INTRA',] gp = ggplot(intra.fsdf, aes(Distance, color=platform)) + geom_density() + labs(x='Distance for candidate intra-chromosomal fusion', y='Density') + scale_x_continuous(expand=c(0,0), labels=scales::comma) + scale_y_continuous(expand=c(0,0)) + theme_pubr() saveGGplot(gp, paste0(imgpref, 'stats_intradistance_density'), w=7, h=4) close.fsdf = intra.fsdf[intra.fsdf$Distance < 5e6,] gp = ggplot(close.fsdf, aes(Distance, color=platform)) + # facet_wrap(~RightChr) + # geom_histogram() + geom_density(adjust=.25) + labs(x='Distance for candidate close intra-chr fusion', y='Density') + scale_x_continuous(expand=c(0,0), labels=scales::comma) + scale_y_continuous(expand=c(0,0)) + theme_pubr() saveGGplot(gp, paste0(imgpref, 'stats_intradistance_close_density'), w=7, h=4) gp = ggplot(close.fsdf, aes(Distance, fill=platform)) + geom_histogram() + labs(x='Distance for candidate close intra-chr fusion', y='# of candidates') + scale_x_continuous(expand=c(0,0), labels=scales::comma) + scale_y_continuous(expand=c(0,0), labels=scales::comma) + theme_pubr() saveGGplot(gp, paste0(imgpref, 'stats_intradistance_close_histogram'), w=7, h=4) gp = ggplot(close.fsdf[close.fsdf$Distance < 1.5e6,], aes(Distance, fill=platform)) + geom_histogram(bins=50) + labs(x='Distance for candidate very close intra-chr fusion', y='# of candidates') + scale_x_continuous(expand=c(0,0), labels=scales::comma) + scale_y_continuous(expand=c(0,0)) + theme_pubr() saveGGplot(gp, paste0(imgpref, 'stats_intradistance_superclose_histogram'), w=7, h=4) # Distance by cognition: close.fsdf = merge(close.fsdf, unique(ctdf[,c('projid','cogdx.ad','bd')])) gp = ggplot(close.fsdf[close.fsdf$Distance < 1.5e6,], aes(Distance, fill=cogdx.ad)) + facet_wrap(~cohort) + geom_histogram(position='dodge') + # geom_density() + labs(x='Distance for candidate close intra-chr fusion', y='Density') + scale_x_continuous(expand=c(0,0), labels=scales::comma) + scale_fill_manual(values=colvals[['cogdx.ad']], name='AD cognition:') + scale_y_continuous(expand=c(0,0)) + theme_pubr() saveGGplot(gp, paste0(imgpref, 'stats_intradistance_close_cognition_density'), w=9, h=4) # Scatter plots of hits vs. counts: # --------------------------------- # Scatter for cohort (ncounts): gp = ggplot(ctdf, aes(n_counts, count, color=cohort)) + geom_point(cex=.5) + geom_smooth(method='lm') + stat_cor(label.y.npc=0.95, label.x.npc=0.2) + scale_color_manual(values=cohvals, name='Cohort:') + scale_x_continuous(labels=scales::comma, expand=c(0,0)) + scale_y_continuous(labels=scales::comma, expand=c(0,0)) + labs(x='Number of reads', y='Number of fusion-candidate reads') + theme_pubr() + theme(legend.position=c(.8, .7)) pltprefix = paste0(imgpref, 'stats_scatter_ncounts_cohort') w = 5; h = 5 ggsave(paste0(pltprefix,'.png'), gp, dpi=400, units='in', width=w, height=h) ggsave(paste0(pltprefix, '.pdf'), gp, dpi=400, units='in', width=w, height=h) # Scatter for cohort (ngenes): gp = ggplot(ctdf, aes(n_genes, count, color=cohort)) + geom_point(cex=.5) + geom_smooth(method='lm') + stat_cor(label.y.npc=0.95, label.x.npc=0.2) + scale_color_manual(values=cohvals, name='Cohort:') + scale_x_continuous(labels=scales::comma, expand=c(0,0)) + scale_y_continuous(labels=scales::comma, expand=c(0,0)) + labs(x='Number of genes', y='Number of fusion-candidate reads') + theme_pubr() + theme(legend.position=c(.8, .7)) pltprefix = paste0(imgpref, 'stats_scatter_ngenes_cohort') w = 5; h = 5 ggsave(paste0(pltprefix,'.png'), gp, dpi=400, units='in', width=w, height=h) ggsave(paste0(pltprefix, '.pdf'), gp, dpi=400, units='in', width=w, height=h) # Scatter for platform (ncounts): gp = ggplot(ctdf, aes(n_counts, count, color=platform)) + geom_point(cex=.5) + geom_smooth(method='lm') + stat_cor(label.y.npc=0.95, label.x.npc=0.2) + scale_x_continuous(labels=scales::comma, expand=c(0,0)) + scale_y_continuous(labels=scales::comma, expand=c(0,0)) + labs(x='Number of reads', y='Number of fusion-candidate reads') + theme_pubr() + theme(legend.position=c(.8, .7)) pltprefix = paste0(imgpref, 'stats_scatter_ncounts_platform') w = 5; h = 5 ggsave(paste0(pltprefix,'.png'), gp, dpi=400, units='in', width=w, height=h) ggsave(paste0(pltprefix, '.pdf'), gp, dpi=400, units='in', width=w, height=h) # Scatter for cognition (ncounts) gp = ggplot(ctdf, aes(n_counts, count, color=cogdx.ad)) + facet_wrap(~cohort) + geom_point(cex=.5) + geom_smooth(method='lm') + stat_cor(label.x.npc=0.05) + scale_color_manual(values=colvals[['cogdx.ad']], name='AD cognition:') + scale_x_continuous(labels=scales::comma, expand=c(0,0)) + scale_y_continuous(labels=scales::comma, expand=c(0,0)) + labs(x='Number of counts', y='Number of fusion-candidate reads') + theme_pubr() + theme(legend.position=c(.8, .7)) pltprefix = paste0(imgpref, 'stats_scatter_ncounts_cognition') w = 7.5; h = 4 ggsave(paste0(pltprefix,'.png'), gp, dpi=400, units='in', width=w, height=h) ggsave(paste0(pltprefix, '.pdf'), gp, dpi=400, units='in', width=w, height=h)