#!/usr/bin/R
# ------------------------------------------------
# Plot the markers and read counts for revision QC
# Updated: 03/28/23
# ------------------------------------------------
library(cbrbase)
set_proj('DEVTRAJ')

# Libraries:
library(tidyr)
library(MASS)
library(ggplot2)
library(ggpubr)
library(ggrepel)
library(ggbeeswarm)
library(scales)
library(RColorBrewer)
library(Matrix)
options(width=170)

# Load in the two cohorts of candidate gene fusion counts:
# --------------------------------------------------------
gbdir = paste0(bindir, 'mosaicism/genefusions/')
# source(paste0(gbdir, 'load_fusion_data.R'))
# source(paste0(gbdir, 'auxiliary_fusion_plotting.R'))
# source(paste0(gbdir, 'load_go_signatures.R'))
source(paste0(gbdir, 'load_joint_expression.R'))

source(paste0(bindir, 'multiRegion/auxiliary_plotting_settings.R'))


# Make directories for analysis:
# ------------------------------
analysis = 'fusion'
project = 'ALZ'
imgdir = paste0(img, project, '/', analysis , '/')
imgpref = paste0(imgdir, 'qcstats_', project, '_')
cmd = paste0('mkdir -p ', imgdir)
system(cmd)


# Subset to used cohort:
meta = meta[meta$cohort == 'ALZ',]
meta = meta[meta$relabel != 'Other/Cancer',]
kept.cells = meta$sample


# Plot the basic QC statistics:
# -----------------------------
# Aggregate the relevant statistics:
meta$log10_n_counts = log10(meta$n_counts)
statvars = c('n_genes', 'log10_n_counts', 'percent_mito', 
    'percent_ribo', 'count_per_gene')
df = gather(meta[,c('relabel', 'sample', statvars)], var, value, -relabel, -sample)
relabel.col['Undefined/Dying'] = 'grey75'

# Add comparisons:
df$relabel = factor(df$relabel, levels=rev(levels(meta$relabel)))
gp = ggplot(df, aes(relabel, value, fill=relabel)) + 
    facet_wrap(~var, scales='free_x') +
    scale_fill_manual(values=relabel.col) + 
    geom_boxplot(outlier.size=.5) + coord_flip() + 
    theme_pubr()
pltprefix = paste0(imgpref, 'stats_barplot')
saveGGplot(gp, pltprefix, w=10, h=5)


# Load in the marker genes, top 2 per class:
# ------------------------------------------
broad.df = read.delim(paste0(dbdir, 'Annotation/broad_ct_markers.tsv'), header=T)
markerdf = read.delim(paste0(dbdir, 'Annotation/markers_brain_major_celltypes.tsv'), header=T)
clusters = c('CAMs', 'Per', 'SMC', 'T cells', 'Fib')
marklist = lapply(clusters, function(x, ntop=2){
    subdf = markerdf[markerdf$cluster == x,]
    head(subdf[,c('cluster','gene')], ntop) })
markerdf = do.call(rbind, marklist)
names(markerdf) = c('cell', 'gene')
markerdf = rbind(broad.df, markerdf)


# Plot the marker gene expressions:
# ---------------------------------
tform = make.tform(meta$relabel, u=as.character(unique(meta$relabel)))
mat = as.matrix(fullmat[, meta$sample] %*% tform)
# library size normalization:
norm = log(sweep(mat, 2, colSums(mat) / 1e6, '/') + .1)

marker.ensg = anno$ENSG[anno$symbol %in% markerdf$gene]
mark.norm = norm[marker.ensg,]
rownames(anno) = anno$ENSG
rownames(mark.norm) = anno[rownames(mark.norm), 'symbol']
mark.norm = mark.norm[markerdf$gene,]
pltmat = t(mark.norm)

load.colors()

ux = 2
ht = Heatmap(pltmat, viridis(100), 
    width = ncol(pltmat)*unit(ux, "mm"), 
    height = nrow(pltmat)*unit(1.5, "mm"), # Splits are too much
    column_split=markerdf$cell, 
    border_gp=gpar(color='black', lwd=.5),
    cluster_rows=FALSE,
    use_raster=FALSE)

h = 2 + 1 / 15 * nrow(pltmat)
w = 2 + 1 / 15 * ncol(pltmat)
pltpref = paste0(imgpref, 'markergenes_heatmap')
saveHeatmap(ht, pltpref, h=h, w=w)