#!/usr/bin/R
# -----------------------------------
# Simple cross-cohort analysis
# Updated: 01/26/22
# -----------------------------------
library(cbrbase)
set_proj('DEVTRAJ')

# Libraries:
library(tidyr)
library(ggplot2)
library(ggpubr)
library(RColorBrewer)

# Load in and process data (saves to matrices):
# project = 'ALZ_repl'
project = 'ALZ'
commandArgs <- function(){ c(project)}
source(paste0(bindir, 'mosaicism/load_metadata.R'))


# Make directories for analysis:
analysis = 'fusion'
imgdir = paste0(img, project, '/', analysis , '/')
imgpref = paste0(imgdir, analysis, "_mspred_", project, '_')
cmd = paste0('mkdir -p ', imgdir)
system(cmd)


# Load in the two cohorts of candidate gene fusion counts:
# --------------------------------------------------------
fs1dir = paste0(dbdir,'variants/', 'ALZ', '/fusion/')
fs2dir = paste0(dbdir,'variants/', 'ALZ_repl', '/fusion/')

filesuff = 'maxsensitivity_prediction_numbers_nochrM'
o1file = paste0(fs1dir, 'ALZ', '_', filesuff, '_wmeta.tsv')
o2file = paste0(fs2dir, 'ALZ_repl', '_', filesuff, '_wmeta.tsv')
ct1df = read.delim(o1file, header=T)
ct2df = read.delim(o2file, header=T)
ct1df$cohort = 'ALZ'
ct2df$cohort = 'ALZ_repl'

ctdf = rbind(ct1df, ct2df)
# NOTE: Signal is there on 10k + full, but PMI effect is larger
sub.ctdf = ctdf[ctdf$n_counts >= 25000,]


# Run regressions to estimate the effect of AD / non-AD:
# ------------------------------------------------------
processLmFit = function(fit){
    cfit = coefficients(summary(fit))
    cfit = data.frame(cfit)
    names(cfit) = c('Est','SE','t','p')
    cfit$coef = rownames(cfit)
    return(cfit)
}

advars = c('niareagansc','nrad','braaksc','bd', 'bd2','cogdx.ad')
covars = c('age_death','msex','pmi', 'Apoe_e4')
test.vars= c('cogdx.ad','bd','bd2','nrad', covars)
resdf = c()
ext.covar = '+ pmi + msex + Apoe_e4 + age_death + cohort'
only.exc = TRUE
# only.exc = FALSE
for (var in test.vars){
    # n_counts better models than log(n_counts)
    form = asform(c('count ~ ', var, '+ n_counts + relabel', ext.covar))
    form.ct = asform(c('count ~ ', var, '+ n_counts ', ext.covar))
    if (only.exc){
        fit = lm(form.ct, sub.ctdf[sub.ctdf$relabel == 'Excitatory',])
    } else {
        fit = lm(form, sub.ctdf)
    }
    cfit = processLmFit(fit)
    cfit$var = var
    resdf = rbind(resdf, cfit[2,])
}
resdf = resdf[order(resdf$p),]


resdf$col = with(resdf, ifelse(p < 0.05, ifelse(Est < 0, 1, 2), 0))
resdf = resdf[order(resdf$Est),]
resdf$coef = factor(resdf$coef, levels=resdf$coef)
pcols = brewer.pal(12, 'Paired')

devals = c('0'='grey80', '1'=pcols[2], '2'=pcols[6])
gp = ggplot(resdf, aes(coef, Est, label=p, fill=factor(col))) + 
    geom_bar(stat='identity') + 
    scale_fill_manual(values=devals) + 
    theme_pubr()

gp = ggplot(resdf, aes(coef, Est, ymin=Est-2*SE, ymax=Est+2*SE, label=p, color=factor(col))) + 
    geom_point() + 
    geom_errorbar() + 
    geom_hline(yintercept=0, lty='dotted') +
    scale_color_manual(values=devals) + 
    theme_pubr() + coord_flip() + 
    theme(legend.position='none')


# Plots of candidate counts:
# --------------------------
# TODO: plots split by cohort
# Plot the number of candidates vs. the number of reads:
gp = ggplot(sub.ctdf, aes(n_counts, count, color=cogdx.ad)) + 
    facet_wrap(~relabel, scale='free') + 
    geom_point() + 
    geom_smooth(method='lm') + 
    stat_cor(color='black', label.y.npc=1) + 
    stat_cor(label.y.npc=0.8) + 
    scale_x_log10() + 
    scale_color_manual(values=colvals[['cogdx.ad']], name='AD Cognition') + 
    theme_pubr()

# Plot the number of candidates vs. the number of reads:
gp = ggplot(sub.ctdf, aes(n_counts, count, color=relabel)) + 
    geom_point() + 
    geom_smooth(method='lm') + 
    stat_cor(color='black', label.y.npc=1) + 
    stat_cor(label.y.npc=0.8) + 
    scale_x_log10() + 
    theme_pubr()

# Plot the number of candidates vs. the number of reads (with BRAAK:
gp = ggplot(sub.ctdf, aes(n_counts, count, color=bd)) + 
    facet_wrap(~relabel, scale='free') + 
    geom_point() + 
    geom_smooth(method='lm') + 
    stat_cor(color='black', label.y.npc=1) + 
    stat_cor(label.y.npc=0.8) + 
    scale_x_log10() + 
    scale_color_manual(values=c('royalblue','indianred'), name='Braak Stage') + 
    theme_pubr()

# Plot the number of candidates vs. the read count bins:
gp = ggplot(ctdf, aes(lng.bin, count, fill=cogdx.ad)) + 
    facet_wrap(~relabel, scale='free') + 
    geom_boxplot() + 
    stat_compare_means() + 
    scale_fill_manual(values=colvals[['cogdx.ad']], name='AD Cognition') + 
    theme_pubr()

# Also plot with normalized count / n_counts:
gp = ggplot(ctdf, aes(lng.bin, count / n_counts, fill=cogdx.ad)) + 
    facet_wrap(~relabel, scale='free') + 
    geom_boxplot() + 
    stat_compare_means() + 
    scale_fill_manual(values=colvals[['cogdx.ad']], name='AD Cognition') + 
    theme_pubr()


# NOTE: the high effect size of PMI for these may be due to increased MT / RP fusions (artifactual, with paralogs/homologs) --> these are filtered out in the filtering steps. --> YES. FIXED.