#!/usr/bin/R
# ----------------------------------------------
# Join the processed gene fusion datasets
# Updated: 01/27/22
# ----------------------------------------------
library(cbrbase)
set_proj('DEVTRAJ')

# Libraries:
library(tidyr)
library(ggplot2)
library(ggpubr)
library(RColorBrewer)

# Load in and process data (saves to matrices):
# project = 'ALZ_repl'
project = 'ALZ'
commandArgs <- function(){ c(project)}
source(paste0(bindir, 'mosaicism/load_metadata.R'))


# Load in the two cohorts of candidate gene fusion counts:
# --------------------------------------------------------
fs1dir = paste0(dbdir,'variants/', 'ALZ', '/fusion/')
fs2dir = paste0(dbdir,'variants/', 'ALZ_repl', '/fusion/')

# filesuff = 'maxsensitivity_prediction_numbers_nochrM' # Old filtering, insufficient
filesuff = 'maxsensitivity_prediction_numbers_wmeta'
o1file = paste0(fs1dir, 'ALZ', '_', filesuff, '.tsv')
o2file = paste0(fs2dir, 'ALZ_repl', '_', filesuff, '.tsv')
ct1df = read.delim(o1file, header=T)
ct2df = read.delim(o2file, header=T)
ct1df$cohort = 'ALZ'
ct2df$cohort = 'ALZ_repl'

# Fix the plate mapping:
pmap = read.delim('ALZ_repl_plate_mapping.tsv', header=T)
ct1df$projid = ct1df$plate
ct1df$platform = '40SE'
ct2df = merge(ct2df, pmap[,c('plate','projid')], all.x=TRUE)
ct2df$platform = '150PE'
ct2df$platform[ct2df$plate %in% c('MM','NN','OO','PP')] = '250PE'
ctdf = rbind(ct1df, ct2df[,colnames(ct1df)])
countcutoff = 25000
sub.ctdf = ctdf[ctdf$n_counts >= countcutoff,]

metamap = unique(ctdf[,c('plate','projid','platform','cohort')])


# Merge the raw fusion datasets as well:
# --------------------------------------------------
fssuff = 'maxsensitivity_abridged_fusions_filtered'
fs1file = paste0(fs1dir, 'ALZ', '_', fssuff, '.tsv')
fs2file = paste0(fs2dir, 'ALZ_repl', '_', fssuff, '.tsv')

fs1df = read.delim(fs1file, header=T)
fs2df = read.delim(fs2file, header=T)
fsdf = rbind(fs1df, fs2df)
colnames(fsdf)[colnames(fsdf) == 'projid'] = 'plate'
fsdf = merge(fsdf, metamap, all.x=TRUE)


# Write out both of these tables:
# -------------------------------
outpref = paste0(dbdir,'variants/ALZ_joint_fusion_datasets_')
count.tsv = paste0(outpref, filesuff, '.tsv')
fusion.tsv = paste0(outpref, fssuff, '.tsv')

write.table(ctdf, count.tsv, quote=F, row.names=F, col.names=T, sep="\t")
write.table(fsdf, fusion.tsv, quote=F, row.names=F, col.names=T, sep="\t")