#!/usr/bin/R
# ----------------------------------------------
# Plot statistics on gene fusion data
# Updated: 01/25/22
# ----------------------------------------------
library(cbrbase)
set_proj('DEVTRAJ')

# Libraries:
library(tidyr)
library(ggplot2)
library(ggpubr)
library(RColorBrewer)

# Load in and process data (saves to matrices):
# project = 'ALZ_repl'
project = 'ALZ'
commandArgs <- function(){ c(project)}
source(paste0(bindir, 'mosaicism/load_metadata.R'))


# Make directories for analysis:
analysis = 'fusion'
imgdir = paste0(img, project, '/', analysis , '/')
imgpref = paste0(imgdir, analysis, "_mspred_", project, '_')
cmd = paste0('mkdir -p ', imgdir)
system(cmd)

source(paste0(bindir, 'mosaicism/genefusions/auxiliary_fusion_plotting.R'))


# Load in the two cohorts of candidate gene fusion counts:
# --------------------------------------------------------
outpref = paste0(dbdir,'variants/ALZ_joint_fusion_datasets_')
filesuff = 'maxsensitivity_prediction_numbers_wmeta'
fssuff = 'maxsensitivity_abridged_fusions_filtered'
count.tsv = paste0(outpref, filesuff, '.tsv')
fusion.tsv = paste0(outpref, fssuff, '.tsv')

ctdf = read.delim(count.tsv, header=T)
fsdf = read.delim(fusion.tsv, header=T)


# Plot hits distribution:
# -----------------------
# Distribution by cohort:
gp = ggplot(ctdf, aes(count, fill=cohort)) + 
    geom_histogram() + 
    scale_fill_manual(values=cohvals, name='Cohort:') + 
    scale_y_continuous(labels=scales::comma, expand=c(0,0)) + 
    labs(x='Number of fusion-candidate reads', y='Number of cells') +
    theme_pubr() + theme(legend.position=c(.8, .8)) 

pltprefix = paste0(imgpref, 'stats_distribution_cohort')
w = 6; h = 3
ggsave(paste0(pltprefix,'.png'), gp, dpi=400, units='in', width=w, height=h)
ggsave(paste0(pltprefix, '.pdf'), gp, dpi=400, units='in', width=w, height=h)

# Distribution by platform:
gp = ggplot(ctdf, aes(count, fill=platform)) + 
    geom_histogram() + 
    scale_y_continuous(labels=scales::comma, expand=c(0,0)) + 
    labs(x='Number of fusion-candidate reads', y='Number of cells') +
    theme_pubr() + theme(legend.position=c(.8, .8)) 

pltprefix = paste0(imgpref, 'stats_distribution_platform')
w = 6; h = 3
ggsave(paste0(pltprefix,'.png'), gp, dpi=400, units='in', width=w, height=h)
ggsave(paste0(pltprefix, '.pdf'), gp, dpi=400, units='in', width=w, height=h)


# Distribution by AD status:
gp = ggplot(ctdf, aes(count, color=cogdx.ad)) + 
    facet_wrap(~cohort) + 
    geom_density() + 
    scale_color_manual(values=colvals[['cogdx.ad']], name='AD cognition:') + 
    scale_y_continuous(labels=scales::comma, expand=c(0,0)) + 
    labs(x='Number of fusion-candidate reads', y='Density of cells') +
    theme_pubr() + theme(legend.position=c(.8, .6)) 

pltprefix = paste0(imgpref, 'stats_distribution_cogdx')
w = 6; h = 3
ggsave(paste0(pltprefix,'.png'), gp, dpi=400, units='in', width=w, height=h)
ggsave(paste0(pltprefix, '.pdf'), gp, dpi=400, units='in', width=w, height=h)


# Cell type violins:
gp = ggplot(ctdf[ctdf$n_counts > 10000,], aes(relabel, count, fill=relabel)) + 
    facet_wrap(~cohort) + 
    stat_compare_means() + 
    geom_violin(scale='width') + 
    geom_boxplot(width=.25) + 
    scale_fill_manual(values=relabel.col, name='Cell type:') + 
    labs(x='Cell type', y='Number of fusion-candidate reads (cells > 10k reads)') +
    theme_pubr() + theme(legend.position='none') + 
    theme(axis.text.x=element_text(angle=90, hjust=1, vjust=.5)) 

pltprefix = paste0(imgpref, 'stats_distribution_celltypes')
w = 7; h = 6
ggsave(paste0(pltprefix,'.png'), gp, dpi=400, units='in', width=w, height=h)
ggsave(paste0(pltprefix, '.pdf'), gp, dpi=400, units='in', width=w, height=h)


# Plot raw counts + % of cells w/ at the individual level:
# --------------------------------------------------------
sub.ctdf = ctdf[ctdf$n_counts > 25000,]

aggdf = aggregate(count1 ~ projid + cogdx.ad + cohort + platform, sub.ctdf, mean)
gp = ggplot(aggdf, aes(platform, count1, color=cogdx.ad)) + 
    geom_boxplot(outlier.shape=NA) + 
    geom_jitter(position=position_jitterdodge(jitter.width = .25)) + 
    stat_compare_means(label='p.format') + 
    labs(x='Platform', y='% of individual\'s cells with a candidate fusion\n(of cells > 25k reads)') +
    scale_color_manual(values=colvals[['cogdx.ad']], name='AD cognition:') + 
    theme_pubr()
saveGGplot(gp, paste0(imgpref, 'stats_percent_fusion_ind_boxplot'), w=4, h=5)

# Average number of counts:
aggdf = aggregate(count ~ projid + cogdx.ad + cohort + platform, sub.ctdf, mean)
gp = ggplot(aggdf, aes(platform, count, color=cogdx.ad)) + 
    geom_boxplot(outlier.shape=NA) + 
    geom_jitter(position=position_jitterdodge(jitter.width = .25)) + 
    stat_compare_means(label='p.format') + 
    labs(x='Platform', y='Average number of candidate fusions in\nindividual\'s cells (of cells > 25k reads)') +
    scale_color_manual(values=colvals[['cogdx.ad']], name='AD cognition:') + 
    theme_pubr()
saveGGplot(gp, paste0(imgpref, 'stats_averagenum_fusion_ind_boxplot'), w=4, h=5)


# Plot by distance:
# -----------------
# Compare to chromosome size - are they all equally spread?
fsdf$RightChr = sapply(fsdf$RightBreakpoint, function(x){ strsplit(x, ':')[[1]][1]})
intra.fsdf = fsdf[fsdf$fs.set == 'INTRA',]
gp = ggplot(intra.fsdf, aes(Distance, color=platform)) + 
    geom_density() + 
    labs(x='Distance for candidate intra-chromosomal fusion', y='Density') +
    scale_x_continuous(expand=c(0,0), labels=scales::comma) + 
    scale_y_continuous(expand=c(0,0)) + 
    theme_pubr()
saveGGplot(gp, paste0(imgpref, 'stats_intradistance_density'), w=7, h=4)

close.fsdf = intra.fsdf[intra.fsdf$Distance < 5e6,]
gp = ggplot(close.fsdf, aes(Distance, color=platform)) + 
    # facet_wrap(~RightChr) + 
    # geom_histogram() + 
    geom_density(adjust=.25) + 
    labs(x='Distance for candidate close intra-chr fusion', y='Density') +
    scale_x_continuous(expand=c(0,0), labels=scales::comma) + 
    scale_y_continuous(expand=c(0,0)) + 
    theme_pubr()
saveGGplot(gp, paste0(imgpref, 'stats_intradistance_close_density'), w=7, h=4)

gp = ggplot(close.fsdf, aes(Distance, fill=platform)) + 
    geom_histogram() + 
    labs(x='Distance for candidate close intra-chr fusion', y='# of candidates') +
    scale_x_continuous(expand=c(0,0), labels=scales::comma) + 
    scale_y_continuous(expand=c(0,0), labels=scales::comma) + 
    theme_pubr()
saveGGplot(gp, paste0(imgpref, 'stats_intradistance_close_histogram'), w=7, h=4)

gp = ggplot(close.fsdf[close.fsdf$Distance < 1.5e6,], aes(Distance, fill=platform)) + 
    geom_histogram(bins=50) + 
    labs(x='Distance for candidate very close intra-chr fusion', y='# of candidates') +
    scale_x_continuous(expand=c(0,0), labels=scales::comma) + 
    scale_y_continuous(expand=c(0,0)) + 
    theme_pubr()
saveGGplot(gp, paste0(imgpref, 'stats_intradistance_superclose_histogram'), w=7, h=4)

# Distance by cognition:
close.fsdf = merge(close.fsdf, unique(ctdf[,c('projid','cogdx.ad','bd')]))
gp = ggplot(close.fsdf[close.fsdf$Distance < 1.5e6,], aes(Distance, fill=cogdx.ad)) + 
    facet_wrap(~cohort) + 
    geom_histogram(position='dodge') + 
    # geom_density() + 
    labs(x='Distance for candidate close intra-chr fusion', y='Density') +
    scale_x_continuous(expand=c(0,0), labels=scales::comma) + 
    scale_fill_manual(values=colvals[['cogdx.ad']], name='AD cognition:') + 
    scale_y_continuous(expand=c(0,0)) + 
    theme_pubr()
saveGGplot(gp, paste0(imgpref, 'stats_intradistance_close_cognition_density'), w=9, h=4)


# Scatter plots of hits vs. counts:
# ---------------------------------
# Scatter for cohort (ncounts):
gp = ggplot(ctdf, aes(n_counts, count, color=cohort)) + 
    geom_point(cex=.5) + 
    geom_smooth(method='lm') + 
    stat_cor(label.y.npc=0.95, label.x.npc=0.2) + 
    scale_color_manual(values=cohvals, name='Cohort:') + 
    scale_x_continuous(labels=scales::comma, expand=c(0,0)) + 
    scale_y_continuous(labels=scales::comma, expand=c(0,0)) + 
    labs(x='Number of reads', y='Number of fusion-candidate reads') + 
    theme_pubr() + theme(legend.position=c(.8, .7)) 

pltprefix = paste0(imgpref, 'stats_scatter_ncounts_cohort')
w = 5; h = 5
ggsave(paste0(pltprefix,'.png'), gp, dpi=400, units='in', width=w, height=h)
ggsave(paste0(pltprefix, '.pdf'), gp, dpi=400, units='in', width=w, height=h)


# Scatter for cohort (ngenes):
gp = ggplot(ctdf, aes(n_genes, count, color=cohort)) + 
    geom_point(cex=.5) + 
    geom_smooth(method='lm') + 
    stat_cor(label.y.npc=0.95, label.x.npc=0.2) + 
    scale_color_manual(values=cohvals, name='Cohort:') + 
    scale_x_continuous(labels=scales::comma, expand=c(0,0)) + 
    scale_y_continuous(labels=scales::comma, expand=c(0,0)) + 
    labs(x='Number of genes', y='Number of fusion-candidate reads') +
    theme_pubr() + theme(legend.position=c(.8, .7)) 

pltprefix = paste0(imgpref, 'stats_scatter_ngenes_cohort')
w = 5; h = 5
ggsave(paste0(pltprefix,'.png'), gp, dpi=400, units='in', width=w, height=h)
ggsave(paste0(pltprefix, '.pdf'), gp, dpi=400, units='in', width=w, height=h)


# Scatter for platform (ncounts):
gp = ggplot(ctdf, aes(n_counts, count, color=platform)) + 
    geom_point(cex=.5) + 
    geom_smooth(method='lm') + 
    stat_cor(label.y.npc=0.95, label.x.npc=0.2) + 
    scale_x_continuous(labels=scales::comma, expand=c(0,0)) + 
    scale_y_continuous(labels=scales::comma, expand=c(0,0)) + 
    labs(x='Number of reads', y='Number of fusion-candidate reads') + 
    theme_pubr() + theme(legend.position=c(.8, .7)) 

pltprefix = paste0(imgpref, 'stats_scatter_ncounts_platform')
w = 5; h = 5
ggsave(paste0(pltprefix,'.png'), gp, dpi=400, units='in', width=w, height=h)
ggsave(paste0(pltprefix, '.pdf'), gp, dpi=400, units='in', width=w, height=h)


# Scatter for cognition (ncounts)
gp = ggplot(ctdf, aes(n_counts, count, color=cogdx.ad)) + 
    facet_wrap(~cohort) + 
    geom_point(cex=.5) + 
    geom_smooth(method='lm') + 
    stat_cor(label.x.npc=0.05) + 
    scale_color_manual(values=colvals[['cogdx.ad']], name='AD cognition:') + 
    scale_x_continuous(labels=scales::comma, expand=c(0,0)) + 
    scale_y_continuous(labels=scales::comma, expand=c(0,0)) + 
    labs(x='Number of counts', y='Number of fusion-candidate reads') +
    theme_pubr() + theme(legend.position=c(.8, .7)) 

pltprefix = paste0(imgpref, 'stats_scatter_ncounts_cognition')
w = 7.5; h = 4
ggsave(paste0(pltprefix,'.png'), gp, dpi=400, units='in', width=w, height=h)
ggsave(paste0(pltprefix, '.pdf'), gp, dpi=400, units='in', width=w, height=h)