#!/usr/bin/R
# ----------------------------------------------
# Join the fusions with the metadata:
# Updated: 01/26/22
# ----------------------------------------------
library(cbrbase)
set_proj('DEVTRAJ')

# Libraries:
library(tidyr)
library(ggplot2)
library(ggpubr)

# Load in and process data (saves to matrices):
project = 'ALZ'
# project = 'ALZ_repl'
commandArgs <- function(){ c(project)}
source(paste0(bindir, 'mosaicism/load_metadata.R'))

use.prelim = FALSE


# Make directories for analysis:
analysis = 'fusion'
imgdir = paste0(img, project, '/', analysis , '/')
fsdir = paste0(dbdir,'variants/', project, '/fusion/')
if (use.prelim){
    imgpref = paste0(imgdir, analysis, "_prelimcts_", project, '_')
} else {
    imgpref = paste0(imgdir, analysis, "_mspred_", project, '_')
}
cmd = paste0('mkdir -p ', imgdir)
system(cmd)


# Update the metadata for merging with the fusions:
# -------------------------------------------------
meta = merge(meta, ext.meta[,c('projid','niareagansc', 'pmi')])
meta$bd = '1-4'
meta$bd[meta$braaksc %in% c('5','6')] = '5-6'
meta$bd2 = NA
meta$bd2[meta$braaksc %in% c('0','1','2')] = 'CTRL'
meta$bd2[meta$braaksc %in% c('5','6')] = 'AD'
meta$nrad = 'CTRL'
meta$nrad[meta$niareagansc %in% c('1','2')] = 'AD'
meta$Apoe_e4 = ifelse(meta$apoe_genotype %in% c('24','44','34'), 'Yes','No')
if (project == 'ALZ') { meta$plate = meta$projid }

# For plot against quality, bin the number of genes:
lng = log(meta$n_counts)
bks = seq(min(lng), max(lng), length.out=10)
meta$lng.bin = cut(lng, bks, include.lowest=T)

# Variable sets:
advars = c('niareagansc','nrad','braaksc','bd', 'bd2','cogdx.ad')
covars = c('age_death','msex','pmi', 'Apoe_e4')
cellvars = c('percent_mito','percent_ribo','count_per_gene',
    'n_counts','n_genes', 'lng.bin', 'relabel')


# Read in data of preliminary or predicted fusion counts:
# -------------------------------------------------------
if (use.prelim){
    filesuff = 'prelim_fusion_cts'
    ctdf = read.delim(paste0(fsdir,project, '_', filesuff, '.tsv'), header=F)
    names(ctdf) = c('count','plate','sample')
} else { 
    filesuff = 'maxsensitivity_prediction_numbers'
    fssuff = 'maxsensitivity_abridged_fusions_filtered'
    # Load list of all measured cells:
    ctdf = read.delim(paste0(fsdir,project, '_', filesuff, '.tsv'), header=F)
    names(ctdf) = c('count','plate','sample')
    ctdf$count = NULL

    # Reduce the filtered fragments and join
    fsdf = read.delim(paste0(fsdir,project, '_', fssuff, '.tsv'), header=T)
    colnames(fsdf)[colnames(fsdf) == 'projid'] = 'plate'

    # Aggregate raw counts:
    fsctdf = agg.rename(annots ~ plate + sample, fsdf, length, 'count')
    ctdf = merge(ctdf, fsctdf, all.x=TRUE)
    ctdf$count[is.na(ctdf$count)] = 0
}


# Merge the fusion counts with the metadata:
# ------------------------------------------
mcols = c('plate','sample', advars, covars, cellvars)
ctdf = merge(ctdf, unique(meta[,mcols]))
ctdf$count1 = 1 * (ctdf$count > 0)
ctdf$ind = 1

outfile = paste0(fsdir, project, '_', filesuff, '_wmeta.tsv')
write.table(ctdf, outfile, quote=F, sep='\t', row.names=F)

# NOTE: Not as reliable by counts under this threshold, regr may be ok
sub.ctdf = ctdf[ctdf$n_counts >= 25000,]


# Simple counts:
# --------------
twide = aggregate(cbind(count, ind) ~ cogdx.ad, ctdf, sum)
# twide = aggregate(cbind(count, ind) ~ bd, ctdf, sum)
# twide = aggregate(cbind(count, ind) ~ bd2, ctdf, sum)
# twide = aggregate(cbind(count, ind) ~ nrad, ctdf, sum)
tab = as.matrix(twide[,-1])
fisher.test(tab)


# Plot the number of candidates vs. the number of reads:
gp = ggplot(sub.ctdf, aes(n_counts, count, color=cogdx.ad)) + 
    facet_wrap(~relabel, scale='free') + 
    geom_point() + 
    geom_smooth(method='lm') + 
    stat_cor(color='black', label.y.npc=1) + 
    stat_cor(label.y.npc=0.8) + 
    scale_x_log10() + 
    scale_color_manual(values=colvals[['cogdx.ad']], name='AD Cognition') + 
    theme_pubr()


# Plot the number of candidates vs. the number of reads:
gp = ggplot(sub.ctdf, aes(n_counts, count, color=relabel)) + 
    geom_point() + 
    geom_smooth(method='lm') + 
    stat_cor(color='black', label.y.npc=1) + 
    stat_cor(label.y.npc=0.8) + 
    scale_x_log10() + 
    theme_pubr()


# Plot the number of candidates vs. the number of reads (with BRAAK:
gp = ggplot(sub.ctdf, aes(n_counts, count, color=bd)) + 
    facet_wrap(~relabel, scale='free') + 
    geom_point() + 
    geom_smooth(method='lm') + 
    stat_cor(color='black', label.y.npc=1) + 
    stat_cor(label.y.npc=0.8) + 
    scale_x_log10() + 
    scale_color_manual(values=c('royalblue','indianred'), name='Braak Stage') + 
    theme_pubr()

# Plot the number of candidates vs. the read count bins:
gp = ggplot(ctdf, aes(lng.bin, count, fill=cogdx.ad)) + 
    facet_wrap(~relabel, scale='free') + 
    geom_boxplot() + 
    stat_compare_means() + 
    scale_fill_manual(values=colvals[['cogdx.ad']], name='AD Cognition') + 
    theme_pubr()

# Also plot with normalized count / n_counts:
gp = ggplot(ctdf, aes(lng.bin, count / n_counts, fill=cogdx.ad)) + 
    facet_wrap(~relabel, scale='free') + 
    geom_boxplot() + 
    stat_compare_means() + 
    scale_fill_manual(values=colvals[['cogdx.ad']], name='AD Cognition') + 
    theme_pubr()


# Run regressions to estimate the effect of AD / non-AD:
# ------------------------------------------------------
processLmFit = function(fit){
    cfit = coefficients(summary(fit))
    cfit = data.frame(cfit)
    names(cfit) = c('Est','SE','t','p')
    cfit$coef = rownames(cfit)
    return(cfit)
}

test.vars= c('cogdx.ad','bd','bd2','nrad', covars, cellvars)
resdf = c()
ext.covar = '+ pmi + msex + Apoe_e4 + age_death + percent_mito + percent_ribo + count_per_gene'
for (var in test.vars){
    # n_counts better models than log(n_counts)
    form = asform(c('count ~ ', var, '+ n_counts + relabel', ext.covar))
    form.ct = asform(c('count ~ ', var, '+ n_counts ', ext.covar))
    fit = lm(form, sub.ctdf)
    # fit = lm(form.ct, sub.ctdf[sub.ctdf$relabel == 'Excitatory',])
    cfit = processLmFit(fit)
    cfit$var = var
    resdf = rbind(resdf, cfit[2,])
}
resdf = resdf[order(resdf$p),]

# NOTE: the high effect size of PMI for these may be due to increased MT / RP fusions (artifactual, with paralogs/homologs) --> these are filtered out in the filtering steps. --> YES. FIXED.