#!/usr/bin/R
# ------------------------------------------------
# Plot the number of gene fusions (for revision)
# Also plot the number of intervening genes
# Updated: 03/24/23
# ------------------------------------------------
library(cbrbase)
set_proj('DEVTRAJ')

# Libraries:
library(tidyr)
library(ggplot2)
library(MASS)
library(ggpubr)
library(RColorBrewer)
options(width=170)

# Load in and process data (saves to matrices):
# project = 'ALZ_repl'
project = 'ALZ'
commandArgs <- function(){ c(project)}
source(paste0(bindir, 'mosaicism/load_metadata.R'))


# Make directories for analysis:
analysis = 'fusion'
imgdir = paste0(img, project, '/', analysis , '/')
imgpref = paste0(imgdir, analysis, "_mspred_", project, '_')
cmd = paste0('mkdir -p ', imgdir)
system(cmd)


# Load in the two cohorts of candidate gene fusion counts:
# --------------------------------------------------------
source(paste0(bindir, 'mosaicism/genefusions/load_fusion_data.R'))
source(paste0(bindir, 'mosaicism/genefusions/auxiliary_fusion_plotting.R'))


# Plot the number of fusions overall:
mx = 20
nfdf = ctdf[ctdf$cohort == 'ALZ',]
nfdf$count[nfdf$count > mx] = mx
nfdf = aggregate(sample ~ count, nfdf, length)

gp = ggplot(nfdf, aes(count, sample)) + 
    geom_bar(fill='grey75', stat='identity', width=.90) + 
    geom_text(data=nfdf, aes(count, sample + 10, label=sample), adj=-.1, angle=90, cex=3.5) + 
    scale_y_continuous(labels=scales::comma, expand=expansion(mult=c(0, .3))) +
    scale_x_continuous(expand=c(0, 0)) +
    theme_pubr() + labs(x='Number of chimeric reads', y='Number of cells')
pltprefix = paste0(imgpref, 'nfusions_barplot')
saveGGplot(gp, pltprefix, w=4, h=2.25)


# Statistics on gene distance:
# ----------------------------
# Get order of genes, by TSS:
gencode_version = 'v28.primary_assembly'
anno = read.delim(paste0('Annotation/Tss.', gencode_version, '.tsv'), header=F, stringsAsFactors=F)
names(anno) = c('chr', 'TSS', 'gene', 'type', 'symbol')
gene.types = unique(anno$type)
keep.types = c('protein_coding', 'antisense', 'lincRNA', gene.types[grep('pseudogene', gene.types)])

# Filter down TSS table, get gene ordering:
anno = anno[anno$type %in% keep.types,]
anno = anno[grep('^chr', anno$chr),]
gdf = aggregate(TSS ~ chr + symbol, anno, min)
gdf = gdf[order(gdf$TSS),]
gdf = gdf[order(gdf$chr),]
gdf$gene.ind = 1:nrow(gdf)
kept.genes = unique(gdf$symbol)
flag.genes = table(gdf$symbol)  # Remove the chrY gene for duplicated chrX/chrY
flag.genes = names(flag.genes)[flag.genes > 1]
gdf = gdf[!((gdf$symbol %in% flag.genes) & (gdf$chr == 'chrY')),]
rownames(gdf) = gdf$symbol


# Annotate calls with number of intervening genes:
# ------------------------------------------------
full.filt.fsdf = fsdf  # Save separately

# Keep only genes in annotation:
filt.fsdf = fsdf[(fsdf$LeftSymbol %in% kept.genes) & 
    (fsdf$RightSymbol %in% kept.genes),]
filt.genes = unique(c(filt.fsdf$LeftSymbol, filt.fsdf$RightSymbol))

# Add the gene index:
filt.fsdf$LeftInd = gdf[filt.fsdf$LeftSymbol, 'gene.ind']
filt.fsdf$RightInd = gdf[filt.fsdf$RightSymbol, 'gene.ind']

# Aggregate and plot index distances:
# -----------------------------------
mx = 20
intra.fsdf = filt.fsdf[filt.fsdf$fs.set == 'INTRA',]
intra.fsdf$ind.dist = with(intra.fsdf, abs(RightInd - LeftInd) - 1)
intra.fsdf$ind.dist[intra.fsdf$ind.dist > mx] = mx
ind.df = aggregate(sample ~ ind.dist, intra.fsdf, length)

gp = ggplot(ind.df, aes(ind.dist, sample)) + 
    geom_bar(fill='grey75', stat='identity', width=.90) + 
    geom_text(data=ind.df, aes(ind.dist, sample + 10, label=sample), adj=-.1, angle=90, cex=3.5) + 
    scale_y_continuous(labels=scales::comma, expand=expansion(mult=c(0, .1))) +
    scale_x_continuous(expand=c(0, 0)) +
    theme_pubr() + labs(x='Number of intervening genes', y='Number of fusion events')
pltprefix = paste0(imgpref, 'inddist_barplot')
saveGGplot(gp, pltprefix, w=4, h=2.25)

table(intra.fsdf$ind.dist)[1] / nrow(intra.fsdf)