#!/usr/bin/R
# -----------------------------------------------
# Plot stats for fusions in the CKP25 mouse data, 
# stratified by cohesin genes and by DMG genes
# Updated: 08/28/2022
# -----------------------------------------------
library(cbrbase)
set_proj('DEVTRAJ', 'mosaicism')

library(ggplot2)
library(ggpubr)
library(ggbeeswarm)
library(tidyr)
library(scales)
library(RColorBrewer)
library(viridis)
library(ComplexHeatmap)
options(width=170)

source(paste0(sbindir, 'CKP25/load_metadata.R'))
source(paste0(sbindir, 'CKP25/load_counts.R'))
source(paste0(bindir, 'multiRegion/auxiliary_plotting_settings.R'))

# Directories:
imgdir = paste0(img, 'CKP25/')
imgdir2 = paste0(imgdir, 'fusions/')
imgpref = paste0(imgdir2, 'fusions_')
cmd = paste('mkdir -p',imgdir, imgdir2)
system(cmd)

source(paste0(sbindir, 'CKP25/analysis/load_basic_fusions.R'))
source(paste0(sbindir, 'CKP25/analysis/load_umap_basis.R'))
source(paste0(sbindir, 'CKP25/analysis/load_go_signatures.R'))

load.colors()


# Score GO and signature gene sets:
# ---------------------------------
filepref = paste0(dbdir, 'CKP25/CKp25_annotation_scores')
enr.rds = paste0(filepref, '_broad.Rds')
enr.tsv = paste0(filepref, '_broad.tsv.gz')
fullenr.rds = paste0(filepref, '_specific.Rds')
kwds = names(siglist)

if (!file.exists(enr.rds)){
    marg = marg[colnames(amat)]
    enrdf = c()
    full.enrdf = c()
    for (kwd in kwds){
        # Overall score for all genes in gene set:
        genes = siglist[[kwd]]
        ngenes = length(genes)
        genes = genes[genes %in% rownames(amat)]
        cs = log(10000 * colSums(amat[genes,]) / marg / ngenes + 1)
        enrdf = rbind(enrdf, data.frame(cell=names(cs), 
                enr=cs, tag=kwd, ng=ngenes))

        if (kwd %in% names(full.siglist)){
            lset = full.siglist[[kwd]]
            for (term in names(lset)){
                cs = log(10000 * colSums(amat[lset[[term]],]) / marg + 1)
                full.enrdf = rbind(full.enrdf, data.frame(
                        cell=names(cs), enr=cs, tag=kwd, term=term))
            }
        }
    }
    enrdf = merge(enrdf, smeta)
    full.enrdf = merge(full.enrdf, smeta)

    # Save terms:
    saveRDS(enrdf, file=enr.rds)
    write.table(enrdf, gzfile(enr.tsv), quote=F, row.names=F, sep="\t")
    saveRDS(full.enrdf, file=fullenr.rds)
} else { 
    enrdf = readRDS(enr.rds)
    full.enrdf = readRDS(fullenr.rds)
}


# Plot each term on the UMAP:
# ---------------------------
col_fun = function(x, pal=colrb, mx=2.5, rescale=FALSE){
    palette = rev(pal)
    if (rescale){ x = scale(x) }
    x[x > mx] = mx
    x[x < -mx] = -mx
    bin <- cut(x, seq(-mx, mx, length.out=length(palette)),
        include.lowest=T) 
    palette[bin] }

for (kwd in kwds){
    print(kwd)
    subdf = enrdf[enrdf$tag == kwd,]
    kwdstr = gsub(" ", "_", kwd)
    pltpref = paste0(imgpref, 'annotation_scores_umap_', kwdstr)

    png(paste0(pltpref, '.png'),res=300, width=3,height=3, units='in')
    par(mar=c(0,0,0,0))
    plot(subdf[[u1]], subdf[[u2]], pch=19, 
        col=col_fun(subdf$enr, pal=colrb, rescale=TRUE),
        axes=F, ylab='', xlab='', cex=.5)
    dev.off()

    # With updated coordinates for Dileep paper
    png(paste0(pltpref, '_V1V2.png'),res=300, width=3,height=3, units='in')
    par(mar=c(0,0,0,0))
    plot(subdf$V1, subdf$V2, pch=19, 
        col=col_fun(subdf$enr, pal=colrb, rescale=TRUE),
        axes=F, ylab='', xlab='', cex=.5)
    dev.off()
}


# Plot corresponding legend:
# --------------------------
r.legend = Legend(at=seq(-2.5,2.5, 1), 
                  labels_gp = gpar(fontsize=5),
                  title_gp = gpar(fontsize=5, fontface='bold'),
                  col_fun=col_fun, title_position = "topleft", 
                  background='black', legend_height=unit(.25,'in'),
                  legend_width=unit(1.25,'in'),
                  title='z-scored expr.', direction = 'horizontal')
rlegend = packLegend(r.legend)

pdf(paste0(imgpref, 'annotation_score_umap_legend.pdf'), width=2, height=2)
draw(rlegend, x = unit(.90,'in'), y=unit(.6,'in'), just = "top")
dev.off()


# Plot relative enrichment of different subtypes:
# -----------------------------------------------
subdf = enrdf[enrdf$label %in% c('Excitatory','Stage 2'),]

runcomp <- list(c("Ex0", "Ex2"), c("Ex0", "Ex3"), c("Ex0", "St.2"), 
                c('Ex2', 'Ex3'), c('Ex2', 'St.2'), c('Ex3', 'St.2'))

gplot = ggplot(subdf, aes(short.sublabel, enr, color=sublabel, fill=sublabel)) + 
    facet_wrap(~tag, nrow=1, scale='free_y') + 
    scale_color_manual(values=sublabel.col) + 
    scale_fill_manual(values=sublabel.col) + 
    geom_violin(fill=NA) + 
    geom_quasirandom(varwidth=TRUE, size=.3) +
    labs(x=paste('Sub-celltype'), y='Norm. signature expression, log(X+1)') + 
    theme_pubr() + theme(legend.position='none') + 
    stat_compare_means(comparisons=runcomp, label='p.signif', method='t.test',
                       bracket.size=.2, tip.length=0.01, vjust=.5, step.increase=0.06)

pltpref = paste0(imgpref, 'signatures_terms_expressionlevels_violin')
ggsave(paste0(pltpref, '.png'), gplot, dpi=450, units='in', width=9, height=4)
ggsave(paste0(pltpref, '.pdf'), gplot, dpi=450, units='in', width=9, height=4)


# Merge at the individual level and plot individual-level boxplots:
# -----------------------------------------------------------------
subdf = enrdf[enrdf$label %in% c('Excitatory', 'Stage 2'),]
aggdf = aggregate(enr ~ mouse_id + sublabel + label + short.sublabel + tag + genotype, subdf, mean)
aggdf$short.sublabel = factor(aggdf$short.sublabel, levels=c('In0','In1','Ex0','Ex1','Ex2','Ex3','St.2'))

gplot = ggplot(aggdf, aes(short.sublabel, enr, fill=sublabel)) + 
    facet_wrap(~tag, nrow=1, scale='free_y') + 
    scale_fill_manual(values=sublabel.col) + 
    geom_boxplot(outlier.shape=NA, alpha=1, lwd=.5) + 
    geom_jitter(position=position_jitterdodge(jitter.width=.15, dodge.width=.75), cex=.8) + 
    labs(x=paste('Sub-celltype'), y='Norm. signature expression, log(X+1)') + 
    theme_pubr() + theme(legend.position='none') + 
    stat_compare_means(comparisons=runcomp, label='p.signif', method='t.test',
                       bracket.size=.2, tip.length=0.01, vjust=.5, step.increase=0.06)

pltpref = paste0(imgpref, 'signatures_terms_expressionlevels_indiv_violin')
ggsave(paste0(pltpref, '.png'), gplot, dpi=450, units='in', width=9, height=4)
ggsave(paste0(pltpref, '.pdf'), gplot, dpi=450, units='in', width=9, height=4)


# Repeat just for the double-strand break pathways
# -----------------------------------------------------------------
subdf = full.enrdf[full.enrdf$label %in% c('Excitatory', 'Stage 2') & full.enrdf$tag == 'double-strand break repair',]
aggdf = aggregate(enr ~ mouse_id + sublabel + label + short.sublabel + term + genotype, subdf, mean)
aggdf$termname = termmap[aggdf$term]
rep.terms = list('double-strand break repair'='DSB repair',
    'nonhomologous end joining'='NHEJ', 'via'='via',
    'homologous recombination'='HR',
    'regulation'='reg.', 'positive'='pos.', 'negative'='neg.')
for (i in 1:length(rep.terms)){ aggdf$termname = sub(names(rep.terms)[i], rep.terms[i], aggdf$termname) }
aggdf$short.sublabel = factor(aggdf$short.sublabel, levels=c('In0','In1','Ex0','Ex1','Ex2','Ex3','St.2'))

gplot = ggplot(aggdf, aes(short.sublabel, enr, fill=sublabel)) + 
    facet_wrap(~termname, nrow=2, scale='free_y') + 
    scale_fill_manual(values=sublabel.col) + 
    geom_boxplot(outlier.shape=NA, alpha=1, lwd=.5) + 
    geom_jitter(position=position_jitterdodge(jitter.width=.15, dodge.width=.75), cex=.8) + 
    labs(x=paste('Sub-celltype'), y='Norm. signature expression, log(X+1)') + 
    theme_pubr() + theme(legend.position='none') + 
    stat_compare_means(comparisons=runcomp, label='p.signif', method='t.test', hide.ns=TRUE,
                       bracket.size=.2, tip.length=0.01, vjust=.5, step.increase=0.06)

pltpref = paste0(imgpref, 'signatures_DSB_terms_expressionlevels_indiv_violin')
ggsave(paste0(pltpref, '.png'), gplot, dpi=450, units='in', width=10, height=5)
ggsave(paste0(pltpref, '.pdf'), gplot, dpi=450, units='in', width=10, height=5)


# Plot the specific pathways for each kwd:
# ----------------------------------------
load.colors()
rownames(meta) = meta$cell

subdf = full.enrdf[full.enrdf$label %in% c('Excitatory', 'Stage 2'),]
for (kwd in kwds){
    kwddf = subdf[subdf$tag == kwd,]
    kwddf$termname = termmap[kwddf$term]
    kmat = pivot.tomatrix(kwddf[,c('enr','termname','cell')], 'cell','enr')

    ctlab = meta[colnames(kmat), 'sublabel2']
    ha = HeatmapAnnotation(
        Label=ctlab, 
        Genotype=meta[colnames(kmat), 'genotype'],
        Mouse=meta[colnames(kmat), 'mouse_id'],
        col=list(
            Label=sublabel.col,
            Genotype=c('CK'='grey80', 'CKp25'='indianred')))

    # pltmat = sweep(kmat, 1, apply(kmat, 1, max), '/')
    # NOTE: double-strand break repair is notable (break-induced replication)
    pltmat = t(scale(t(kmat)))
    Heatmap(pltmat, 
        top_annotation = ha,
        show_column_names=FALSE,
        column_split = ctlab
    )
}
    
# Correlations of these scores (nothing notable), driven by ovl. mostly:
emat = pivot.tomatrix(enrdf[,c('enr','tag','cell')], 'cell', 'enr')
ht = Heatmap(cor(t(emat)), 
    col=rev(colspec),
    show_column_names=FALSE,
)


# Fusions versus each of the overall scores:
# ------------------------------------------
# Merge the fusion and GO expression data first
ind = sample(1:nrow(enrdf), nrow(enrdf), replace=FALSE)
enrdf = enrdf[ind,]
# enrdf$has_intra = enrdf$cell %in% intradf$cell[intradf$close.fusion]
enrdf$has_intra = enrdf$cell %in% intradf$cell
enrdf$has_inter = enrdf$cell %in% interdf$cell
enrdf$event_count = (enrdf$has_intra + 2 * enrdf$has_inter)
enrdf = enrdf[order(enrdf$event_count, decreasing=F),]
enrdf$has_event = enrdf$event_count > 0

enrdf$event_type = 'None'
enrdf$event_type[enrdf$event_count == 1] = 'Intra'
enrdf$event_type[enrdf$event_count == 2] = 'Inter'
enrdf$event_type[enrdf$event_count == 3] = 'Both'

subdf = enrdf[,c('has_intra', 'has_inter','cell','enr','tag')]
subdf = gather(subdf, type, value, -cell, -enr, -tag)
subdf$type = ifelse(subdf$type == 'has_intra', 'Intra', 'Inter')

# For each, stratify by fusions - diff in expr.
gplot = ggplot(subdf, aes(type, enr, fill=value)) + 
    facet_wrap(~ tag, nrow=1, scale='free_y') + 
    scale_fill_manual(values=c('grey80', 'grey50'), name='Has event') + 
    geom_boxplot(outlier.size=.8) + 
    labs(x=paste('Type of putative fusion event'),
        y='Norm. signature expression, log(X+1)',
        title='GO signatures in Excitatory/Stage 2 cells, stratified by presence of fusion event') + 
    theme_pubr() + theme() + 
    stat_compare_means(label='p.signif', method='t.test', vjust=.5, hjust=.5)

pltpref = paste0(imgpref, 'signatures_terms_vs_fusionevents_boxplot')
ggsave(paste0(pltpref, '.png'), gplot, dpi=450, units='in', width=15, height=4)
ggsave(paste0(pltpref, '.pdf'), gplot, dpi=450, units='in', width=15, height=4)


# Plot terms as a heatmap, annotate with # fusions:
# -------------------------------------------------
subdf = enrdf[enrdf$label %in% c('Excitatory','Stage 2'),]
subdf = subdf[subdf$tag %in% subkwds,]
mat = pivot.tomatrix(subdf[,c('cell','tag','enr')], 'cell','enr')
evdf = unique(enrdf[,c('cell', 'has_intra', 'has_inter','event_type')])
evdf$has_fusion = evdf$has_intra | evdf$has_inter
rownames(evdf) = evdf$cell

ux = 1.5
ctlab = meta[colnames(mat), 'sublabel2']
ha = HeatmapAnnotation(
    Label=ctlab, 
    Genotype=meta[colnames(mat), 'genotype'],
    HasFusion=ifelse(evdf[colnames(mat), 'has_intra'], 'Yes','No'),
    annotation_name_gp = gpar(fontsize=5),
    simple_anno_size = unit(ux / 1.2, 'mm'),
    gap = unit(0, "mm"),
    col=list(
        Label=sublabel.col,
        HasFusion=c('No'='white', 'Yes'='black'),
        Genotype=c('CK'='grey80', 'CKp25'='indianred')))

set.seed(1)
pltmat = t(scale(t(mat)))
ht = Heatmap(pltmat, 
    name='z-score',
    top_annotation=ha,
    show_column_names=FALSE,
    use_raster=FALSE,
    column_split=ctlab,
    width = ncol(pltmat)*unit(ux / 10, "mm"), 
    height = nrow(pltmat)*unit(ux, "mm"),
    border_gp=gpar(color='black', lwd=.5),
)

h = 2 + 1 / 15 * nrow(pltmat)
w = 2 + 1 / 15 * ncol(pltmat) / 10
pltpref = paste0(imgpref, 'annotation_scores_heatmap')
saveHeatmap(ht, pltpref, h=h, w=w)


# Plot the scores + fusions on the pseudotime:
# --------------------------------------------
# Load in pseudotime:
dbpref = paste0(dbdir, 'CKP25/split_trajectory')
scefit.rda = paste0(dbpref,'_dpt_GAMfit_dataset.Rda')
load(scefit.rda)

mo = attr(sce$tradeSeq$X, 'model.offset')
dptdf = data.frame(
    dpt=sce$slingshot$pseudotime.V1, 
    cell=names(mo))

# Merged dataframe
subdf = enrdf[enrdf$tag %in% subkwds ,c('cell','enr','tag')]
subev = evdf[,c('cell', 'has_fusion')]
names(subev)[2] = 'enr'
subev$enr = subev$enr * 1
subev$tag = 'has_fusion'

subdf = rbind(subdf, subev)
subdf = merge(subdf, dptdf)

gplot = ggplot(subdf, aes(dpt, enr, color=tag)) + 
    facet_wrap(~tag, scales='free_y') + 
    geom_point(cex=.5) + 
    geom_smooth(method='gam') + 
    labs(y=paste0('log10(Expr)'), x='Diffusion Pseudotime') + 
    theme_pubr()

ggsave(paste0(imgpref, 'annotation_scores_dpt.png'), gplot, dpi=400, units='in', width=8, height=9)
ggsave(paste0(imgpref, 'annotation_scores_dpt.pdf'), gplot, dpi=400, units='in', width=8, height=9)


# Reduced:
gplot = ggplot(subdf, aes(dpt, enr, color=tag)) + 
    geom_smooth(method='gam') + 
    labs(y=paste0('log10(Expr)'), x='Diffusion Pseudotime') + 
    scale_x_continuous(expand=c(0,0)) + 
    theme_pubr() + theme(legend.position='right') 

ggsave(paste0(imgpref, 'annotation_scores_dpt_joint.png'), gplot, dpi=400, units='in', width=4.75, height=3)
ggsave(paste0(imgpref, 'annotation_scores_dpt_joint.pdf'), gplot, dpi=400, units='in', width=4.75, height=3)


# Perform regressions and plot coefficients by set:
# -------------------------------------------------
subdf = enrdf[enrdf$label %in% c('Excitatory', 'Stage 2'),]
form = 'has_event ~ enr + n_counts + n_genes + genotype + timepoint'
form = as.formula(form)

resdf = c()
for (kwd in kwds){
    df = subdf[subdf$tag == kwd,]
    fit = lm(form, df)
    cfit = processLmFit(fit)
    cfit$tag = kwd
    resdf = rbind(resdf, cfit[grep('enr', cfit$coef),])
}

resdf = resdf[order(resdf$p),]
resdf$padj = p.adjust(resdf$p, method='fdr')
resdf$col = with(resdf, ifelse(padj < 0.05, ifelse(Est < 0, 'Down', 'Up'), 'NS'))
resdf = resdf[order(resdf$Est),]
resdf$tag = factor(resdf$tag, levels=resdf$tag)


# Plot all of these effects jointly:
# ----------------------------------
kwddf = resdf[resdf$tag %in% subkwds,]
pcols = brewer.pal(12, 'Paired')
devals = c('NS'='grey80', 'Down'=pcols[2], 'Up'=pcols[6])

gp = ggplot(kwddf, aes(tag, Est, ymin=Est-2*SE, ymax=Est+2*SE, label=p, color=factor(col))) + 
    geom_point() + 
    geom_errorbar(width=.2) + 
    geom_hline(yintercept=0, lty='dotted') +
    scale_color_manual(values=devals, name='Significance:') + 
    theme_pubr() + coord_flip() + 
    labs(x='Condition', y='Estimated effect size')

regpref = paste0(imgpref, 'p25_reg_effectsizes_sigs_')
saveGGplot(gp, paste0(regpref, 'errorbar'), w=4.5, h=3)