#!/usr/bin/R
# ------------------------------------------------
# Estimate effect sizes for gene candidate fusions
# Updated: 02/21/23 to update regression model
# ------------------------------------------------
library(cbrbase)
set_proj('DEVTRAJ')

# Libraries:
library(tidyr)
library(MASS)
library(ggplot2)
library(ggpubr)
library(RColorBrewer)
library(GenomicRanges)
options(width=170)

# Load in and process data (saves to matrices):
# project = 'ALZ_repl'
project = 'ALZ'
commandArgs <- function(){ c(project)}
source(paste0(bindir, 'mosaicism/load_metadata.R'))


# Make directories for analysis:
analysis = 'fusion'
imgdir = paste0(img, project, '/', analysis , '/')
imgpref = paste0(imgdir, analysis, "_mspred_", project, '_')
cmd = paste0('mkdir -p ', imgdir)
system(cmd)

source(paste0(bindir, 'multiRegion/auxiliary_plotting_settings.R'))


# Load in the two cohorts of candidate gene fusion counts:
# --------------------------------------------------------
source(paste0(bindir, 'mosaicism/genefusions/load_fusion_data.R'))
source(paste0(bindir, 'mosaicism/genefusions/auxiliary_fusion_plotting.R'))


# Use only single cohort:
imgpref = paste0(imgpref, "originalcohort_")
sub.ctdf = sub.ctdf[sub.ctdf$cohort == 'ALZ',]


# Regression estimates for a number of selected covariates:
# ---------------------------------------------------------
merge_fsdf = function(df, sub.ctdf){
    fs.ctdf = agg.rename(annots ~ plate + sample,
        df, length, 'subset.count')
    fs.ctdf = merge(sub.ctdf, fs.ctdf, all.x=TRUE)
    fs.ctdf$subset.count[is.na(fs.ctdf$subset.count)] = 0
    return(fs.ctdf)
}


run_regr = function(ctdf, only.exc, use.cc=TRUE, use.ec=TRUE){
    resdf = c()
    if (only.exc){
        df = ctdf[ctdf$relabel == 'Excitatory',]
    } else { df = ctdf }
    if (length(unique(df$platform)) > 1){
        # count.covar = '+ platform * (n_counts + n_genes + count_per_gene)'
        count.covar = '+ platform * (log(n_genes) + count_per_gene)'
    } else {
        # count.covar = '+ n_counts + n_genes + count_per_gene' # original
        count.covar = '+ log(n_genes) + count_per_gene'
    }
    for (var in test.vars){
        # form.vec = c('subset.count', ' ~ ', var) # original
        form.vec = c('subset.count ~ offset(log(n_counts)) + ', var)
        if (use.cc){ form.vec = c(form.vec, count.covar) }
        if (use.ec){ form.vec = c(form.vec, ext.covar) }
        if ((!only.exc) & (length(unique(ctdf$relabel)) > 1)){
            form.vec = c(form.vec, ' + relabel')
        }
        # Fit model + process fit:
        form = asform(c(form.vec))
        fit = try(glm.nb(form, df))
        # fit = try(glm(form, df, family='poisson'))
        if ((class(fit)[1] != 'try-error')[1]){
            cfit = processLmFit(fit)
            cfit$var = var
            resdf = rbind(resdf, cfit[2,])
        }
    }
    resdf = resdf[order(resdf$p),]
    resdf$padj = p.adjust(resdf$p, method='fdr')
    resdf$col = with(resdf, ifelse(padj < 0.05, ifelse(Est < 0, 'Down', 'Up'), 'NS'))
    resdf = resdf[order(resdf$Est),]
    resdf$coef = varmap[resdf$coef]
    resdf$coef = factor(resdf$coef, levels=resdf$coef)
    return(resdf)
}


# Test in each subset of fusion types separately:
# -----------------------------------------------
fulldf = c()
for (only.exc in c(TRUE, FALSE)){
    for (fs.subset in c('INTRA','INTER','CLOSE')){
        if (fs.subset == 'CLOSE'){
            df = fsdf[(fsdf$fs.set == 'INTRA') & (fsdf$Distance < 5e6),]
        } else {
            df = fsdf[fsdf$fs.set == fs.subset,]
        }
        fs.ctdf = merge_fsdf(df, sub.ctdf)

        # Regression results:
        resdf = run_regr(fs.ctdf, only.exc)
        resdf$set = ifelse(only.exc, 'Exc. neurons', 'All cells')
        resdf$type = fs.subset
        fulldf = rbind(fulldf, resdf)
    }
}

gp = coefErrorbarPlot(fulldf, facet='set ~ type')
regpref = paste0(imgpref, 'reg_effectsizes_core_type_joint_results_')
saveGGplot(gp, paste0(regpref, 'errorbar'), w=10, h=4.5)


# Test with other cutoffs:
# ------------------------
# W/ LDAS (582)
# W/ REF SPLICE
# By cell type
# Lower ENTROPY cutoff (max)
fulldf = c()
only.exc = FALSE
for (fs.subset in c('YES_LDAS','NO_LDAS')){
    df = fsdf[fsdf$LargeAnchorSupport == fs.subset,]
    fs.ctdf = merge_fsdf(df, sub.ctdf)
    # Regression results:
    resdf = run_regr(fs.ctdf, only.exc)
    resdf$set = ifelse(only.exc, 'Exc. neurons', 'All cells')
    resdf$type = fs.subset
    fulldf = rbind(fulldf, resdf)
}

gp = coefErrorbarPlot(fulldf, facet='set ~ type')
regpref = paste0(imgpref, 'reg_effectsizes_LDAS_')
saveGGplot(gp, paste0(regpref, 'errorbar'),  w=7.5, h=3)


# W/ REF SPLICE (612)
fulldf = c()
only.exc = FALSE
for (fs.subset in c('INCL_NON_REF_SPLICE', 'ONLY_REF_SPLICE')){
    df = fsdf[fsdf$SpliceType == fs.subset,]
    fs.ctdf = merge_fsdf(df, sub.ctdf)
    # Regression results:
    resdf = run_regr(fs.ctdf, only.exc)
    resdf$set = ifelse(only.exc, 'Exc. neurons', 'All cells')
    resdf$type = fs.subset
    fulldf = rbind(fulldf, resdf)
}

gp = coefErrorbarPlot(fulldf, facet='set ~ type')
regpref = paste0(imgpref, 'reg_effectsizes_SpliceType_')
saveGGplot(gp, paste0(regpref, 'errorbar'), w=10, h=3)


# Lower ENTROPY cutoff (max)
fulldf = c()
only.exc = FALSE
for (fs.subset in c(1.5, 1.75, 2)){
    df = fsdf[fsdf$MaxEntropy < fs.subset,]
    fs.ctdf = merge_fsdf(df, sub.ctdf)
    # Regression results:
    resdf = run_regr(fs.ctdf, only.exc)
    resdf$set = ifelse(only.exc, 'Exc. neurons', 'All cells')
    resdf$type = paste0('Max Ent < ', fs.subset)
    fulldf = rbind(fulldf, resdf)
}

gp = coefErrorbarPlot(fulldf, facet='set ~ type')
regpref = paste0(imgpref, 'reg_effectsizes_MaxEntropy_')
saveGGplot(gp, paste0(regpref, 'errorbar'), w=10, h=3)


# Lower ENTROPY cutoff (min)
fsdf$MinEntropy = apply(fsdf[,c('LeftBreakEntropy','RightBreakEntropy')], 1, min)
fulldf = c()
only.exc = FALSE
for (fs.subset in c(1.25, 1.5, 1.75)){
    df = fsdf[fsdf$MinEntropy > fs.subset,]
    fs.ctdf = merge_fsdf(df, sub.ctdf)
    # Regression results:
    resdf = run_regr(fs.ctdf, only.exc)
    resdf$set = ifelse(only.exc, 'Exc. neurons', 'All cells')
    resdf$type = paste0('Min Ent > ', fs.subset)
    fulldf = rbind(fulldf, resdf)
}

gp = coefErrorbarPlot(fulldf, facet='set ~ type')
regpref = paste0(imgpref, 'reg_effectsizes_MinEntropy_')
saveGGplot(gp, paste0(regpref, 'errorbar'), w=10, h=3)


# Stratify by cell type and plot heatmap as well as intervals:
# ------------------------------------------------------------
fulldf = c()
cts = unique(sub.ctdf$relabel)
for (fs.subset in cts){
    df = fsdf
    fs.ctdf = merge_fsdf(df, sub.ctdf)
    fs.ctdf = fs.ctdf[fs.ctdf$relabel == fs.subset,]
    # Regression results:
    resdf = run_regr(fs.ctdf, only.exc=FALSE)
    resdf$set = ifelse(only.exc, 'Exc. neurons', 'All cells')
    resdf$type = fs.subset
    fulldf = rbind(fulldf, resdf)
}

gp = coefErrorbarPlot(fulldf, facet='. ~ type')
regpref = paste0(imgpref, 'reg_effectsizes_CellType_')
saveGGplot(gp, paste0(regpref, 'errorbar'), w=10, h=3)


# Plot the cell type results as a heatmap: 
# -----------------------------------------
cmat = pivot.tomatrix(fulldf[,c('Est','coef','type')], 'type','Est')
pmat = pivot.tomatrix(fulldf[,c('padj','coef','type')], 'type','padj')
mx = 0.5
mat.col_fun = colorRamp2(c(-mx, 0, mx), c("blue", "white", "red")) 

ux = 1.5
ht = Heatmap(cmat, 
    name='Est',
    col=mat.col_fun,
    width = ncol(cmat)*unit(ux, "mm"), 
    height = nrow(cmat)*unit(ux, "mm"),
    border_gp=gpar(color='black', lwd=.5),
    cell_fun = function(j, i, x, y, w, h, col){
        if (pmat[i,j] < 0.05){
            grid.text('*', x, y, gp=gpar(fontsize=gridtxt.fs)) }
    }
)

h = 2 + 1 / 15 * nrow(cmat)
w = 2 + 1 / 15 * ncol(cmat)
pltpref = paste0(imgpref, 'reg_effectsizes_CellType_heatmap')
saveHeatmap(ht, pltpref, h=h, w=w)


# Regressions by TAD distance:
# ----------------------------
# Read in TAD boundaries from 3dgenomebrowser
tdf = read.delim(paste0(dbdir, 'Annotation/Cortex_DLPFC_Donor-CO-raw_TADs.txt'), header=F)
tadf = tdf[,c(1:2)]
tbdf = tdf[,c(1,3)]
names(tadf) = c('chr','pos')
names(tbdf) = c('chr','pos')
tdf = rbind(tadf, tbdf)
tdf = tdf[order(tdf$pos),]
tdf = tdf[order(tdf$chr),]
tdf = unique(tdf)
rownames(tdf) = NULL
tgr = GRanges(seqnames=tdf$chr, IRanges(start=tdf$pos, end=tdf$pos))

# Make right/left objects:
fsdf$LeftChr = sub(":.*","", fsdf$LeftBreakpoint)
fsdf$RightChr = sub(":.*","", fsdf$RightBreakpoint)
lgr = with(fsdf, GRanges(seqnames=LeftChr, IRanges(start=LeftPosition, end=LeftPosition)))
rgr = with(fsdf, GRanges(seqnames=RightChr, IRanges(start=RightPosition, end=RightPosition)))

# Get the closest left + right tads for each
rnear = nearest(rgr, tgr)
fsdf$RightNearPos = tdf$pos[rnear]
fsdf$RightTADDist = abs(fsdf$RightNearPos - fsdf$RightPosition)
lnear = nearest(lgr, tgr)
fsdf$LeftNearPos = tdf$pos[lnear]
fsdf$LeftTADDist = abs(fsdf$LeftNearPos - fsdf$LeftPosition)
fsdf$MinTADDist = apply(fsdf[,c('RightTADDist','LeftTADDist')], 1, min)
fsdf$MaxTADDist = apply(fsdf[,c('RightTADDist','LeftTADDist')], 1, max)

gp = ggplot(fsdf, aes(LeftTADDist, RightTADDist)) + 
    geom_density_2d_filled() + 
    # geom_point(alpha=0.1, cex=.5) + 
    scale_x_continuous(expand=c(0,0), labels=scales::comma) + 
    scale_y_continuous(expand=c(0,0), labels=scales::comma) + 
    theme_pubr()
saveGGplot(gp, paste0(imgpref, 'TAD_distance_2d'), w=6, h=6)


# Cut TAD distance evenly:
qtTAD = quantile(fsdf$MinTADDist, seq(0,1, .25), na.rm=T)
fsdf$TAD.bin = cut(fsdf$MinTADDist, breaks=qtTAD)
gp = ggplot(fsdf, aes(MinTADDist, fill=TAD.bin)) + 
    # geom_histogram(fill='grey80', color='white') + 
    geom_histogram(color='white') + 
    geom_density(color='black') + 
    scale_x_continuous(expand=c(0,0), labels=scales::comma) + 
    scale_y_continuous(expand=c(0,0)) + 
    theme_pubr()
saveGGplot(gp, paste0(imgpref, 'TAD_mindistance'), w=5, h=4)


fulldf = c()
only.exc = FALSE
fsdf$TAD.bin = as.character(fsdf$TAD.bin)
fsdf$TAD.bin[is.na(fsdf$TAD.bin)] = 'Undefined'
sets = unique(fsdf$TAD.bin[order(fsdf$MinTADDist)])
sets = sets[sets != 'Undefined']
for (fs.subset in sets){
    df = fsdf[fsdf$TAD.bin == fs.subset,]
    fs.ctdf = merge_fsdf(df, sub.ctdf)
    # Regression results:
    resdf = run_regr(fs.ctdf, only.exc)
    resdf$set = ifelse(only.exc, 'Exc. neurons', 'All cells')
    resdf$type = fs.subset
    fulldf = rbind(fulldf, resdf)
}
fulldf$type = factor(fulldf$type, levels=sets)
gp = coefErrorbarPlot(fulldf, facet='set ~ type')
regpref = paste0(imgpref, 'reg_effectsizes_MinTADDist_')
saveGGplot(gp, paste0(regpref, 'errorbar'), w=10, h=3)


# By Max TAD bin instead:
# -----------------------
qtTAD = quantile(fsdf$MaxTADDist, seq(0,1, .25), na.rm=T)
fsdf$TAD.bin = cut(fsdf$MaxTADDist, breaks=qtTAD)
gp = ggplot(fsdf, aes(MaxTADDist, fill=TAD.bin)) + 
    geom_histogram(color='white') + 
    geom_density(color='black') + 
    scale_x_continuous(expand=c(0,0), labels=scales::comma) + 
    scale_y_continuous(expand=c(0,0)) + 
    theme_pubr()
saveGGplot(gp, paste0(imgpref, 'TAD_maxdistance'), w=5, h=4)


fulldf = c()
only.exc = FALSE
fsdf$TAD.bin = as.character(fsdf$TAD.bin)
fsdf$TAD.bin[is.na(fsdf$TAD.bin)] = 'Undefined'
sets = unique(fsdf$TAD.bin[order(fsdf$MaxTADDist)])
sets = sets[sets != 'Undefined']
for (fs.subset in sets){
    df = fsdf[fsdf$TAD.bin == fs.subset,]
    fs.ctdf = merge_fsdf(df, sub.ctdf)
    # Regression results:
    resdf = run_regr(fs.ctdf, only.exc)
    resdf$set = ifelse(only.exc, 'Exc. neurons', 'All cells')
    resdf$type = fs.subset
    fulldf = rbind(fulldf, resdf)
}
fulldf$type = factor(fulldf$type, levels=sets)

gp = coefErrorbarPlot(fulldf, facet='set ~ type')
regpref = paste0(imgpref, 'reg_effectsizes_MaxTADDist_')
saveGGplot(gp, paste0(regpref, 'errorbar'), w=10, h=3)