#!/usr/bin/R
# ------------------------------------------------------
# Load, process, flag, and filter fusions for 10x human:
# - follows analysis/filtering done for P301s/smart-seq
# Updated: 02/02/23
# ------------------------------------------------------
library(cbrbase)
library(ggplot2)
library(ggpubr)

set_proj('DEVTRAJ', 'mosaicism')
source(paste0(bindir, 'mosaicism/fusions_AD430/load_metadata.R'))
source(paste0(bindir, 'multiRegion/auxiliary_plotting_settings.R'))

# Directories:
plotdir = paste0(imgdir, 'fusions/')
imgpref = paste0(plotdir, 'AD430_fusions_')
cmd = paste('mkdir -p', plotdir)
system(cmd)


# Load the preliminary fusion files:
# ----------------------------------
fsdir = paste0(dbdir, 'fusions_AD430/')
prefix = 'AD430.aggregated.fusion_predictions'
predfile = paste0(fsdir, prefix, '.tsv')
bcfile = paste0(fsdir, prefix, '.byread.withBC.tsv')
procfile = paste0(fsdir, prefix, '.processed.tsv')
filtfile = paste0(fsdir, prefix, '.filtered.nonmerge.tsv')

fsdf = read.delim(predfile, header=T)
bcdf = read.delim(bcfile, header=T)

fsdf$fsid = with(fsdf, paste(fusionID, projid, sample, lane, repl, sep="-"))
bcdf$fsid = with(bcdf, paste(fusionID, projid, sample, lane, repl, sep="-"))


# Flag reads mapped to multiple fusion events:
# --------------------------------------------
flag.read.df = agg.rename(uqID ~ Read, bcdf, length, 'nfusion')
flag.reads = flag.read.df$Read[flag.read.df$nfusion > 1]
bcdf$flag.nonunique = bcdf$Read %in% flag.reads

# Count how many reads to remove:
flag.nu = aggregate(flag.nonunique ~ fsid + JunctionReadCount, bcdf, sum)
print(paste("[STATUS] Flagged", length(flag.reads), "reads"))


# Check for repeated fusions across cells:
# ----------------------------------------
# First, flag breakpoints that repeat across cells or mice:
# Right:
agg.bcdf = aggregate(cbind(sample, lane, CB, UMI) ~ RightBreakpoint, bcdf, function(x){length(unique(x))})
agg.bcdf$flag = (agg.bcdf$sample > 1) | (agg.bcdf$CB > 1)
flag.right = agg.bcdf$RightBreakpoint[agg.bcdf$flag]
# Left:
agg.bcdf = aggregate(cbind(sample, lane, CB, UMI) ~ LeftBreakpoint, bcdf, function(x){length(unique(x))})
agg.bcdf$flag = (agg.bcdf$sample > 1) | (agg.bcdf$CB > 1)
flag.left = agg.bcdf$LeftBreakpoint[agg.bcdf$flag]
# Add to bcdf:
bcdf$flag.bkpt = (bcdf$LeftBreakpoint %in% flag.left) | (bcdf$RightBreakpoint %in% flag.right)
print(paste("[STATUS] Flagged", length(flag.right), '(right) and', length(flag.left), "(left) repeated breakpoints"))


# Check for highly promiscuous/recurring genes:
# ---------------------------------------------
fsdf$LeftSymbol = sub("\\^.*","", fsdf$LeftGene)
fsdf$RightSymbol = sub("\\^.*","", fsdf$RightGene)

# Merge lanes before call promiscuous breaks:
sample.fsdf = unique(fsdf[,c('sample','RightBreakpoint','LeftBreakpoint', 'RightSymbol','LeftSymbol')])

# Plot the number of called fusions in the dataset:
fs.genes = c(sample.fsdf$RightSymbol, sample.fsdf$LeftSymbol)
countdf = data.frame(table(fs.genes))
colnames(countdf)[1] = 'gene'
countdf = countdf[order(countdf$Freq, decreasing=T),]
countdf$gene = factor(countdf$gene, levels=rev(countdf$gene))

topctdf = head(countdf, 25)
gp = ggplot(topctdf, aes(gene, Freq, label=Freq)) + 
    geom_bar(stat='identity', fill='grey75') + 
    scale_y_continuous(expand=c(0,0), labels=scales::comma) + 
    geom_text(aes(hjust=ifelse(Freq > 1000, 1.1, -.1))) + 
    theme_pubr() + 
    labs(x='Gene', y='Number of candidate fusions', title='Top genes in fusion candidates') + 
    coord_flip()

saveGGplot(gp, paste0(imgpref, 'topgenes_nhits_barplot'), w=5, h=4.5)


# Breakpoints are also problematic (merge lanes before calling):
fs.bks = c(sample.fsdf$RightBreakpoint, sample.fsdf$LeftBreakpoint)
bkdf = data.frame(table(fs.bks))
colnames(bkdf)[1] = 'bkpt'
bkdf = bkdf[order(bkdf$Freq, decreasing=T),]
bkdf$bkpt = factor(bkdf$bkpt, levels=rev(bkdf$bkpt))

# Allow bkpt merge by celltype
topctdf = head(bkdf, 25)
gp = ggplot(topctdf, aes(bkpt, Freq, label=Freq)) + 
    geom_bar(stat='identity', fill='grey75') + 
    scale_y_continuous(expand=c(0,0), labels=scales::comma) + 
    geom_text(aes(hjust=ifelse(Freq > 1000, 1.1, -.1))) + 
    theme_pubr() + 
    labs(x='Breakpoint', y='Number of candidate fusions', title='Top breakpoints') + 
    coord_flip()

saveGGplot(gp, paste0(imgpref, 'topbreakpoints_nhits_barplot'), w=5, h=4.5)


# Subset the fusion candidates to only high confidence ones:
# ----------------------------------------------------------
# First keep only unique breakpoints to eliminate repetitive genes:
# NOTE: Almost, except for IGH/IGL, MALAT1, SNGH14, and mito/ribo genes
rep.breaks = as.character(bkdf$bkpt[bkdf$Freq >= 2])
rep.breaks = unique(c(rep.breaks, flag.right, flag.left))
sub.fsdf = fsdf[!((fsdf$RightBreakpoint %in% rep.breaks) | (fsdf$LeftBreakpoint %in% rep.breaks)),]

# Also remove the non-unique read mappings (although taken care of by bkpts)
sub.fsdf = merge(sub.fsdf, flag.nu, all.x=TRUE)
ind = (sub.fsdf$JunctionReadCount - sub.fsdf$flag.nonunique) > 0
sub.fsdf = sub.fsdf[ind,]

# Re-count unique genes:
fs.genes = c(sub.fsdf$RightSymbol, sub.fsdf$LeftSymbol)
countdf = data.frame(table(fs.genes))
colnames(countdf)[1] = 'gene'
countdf = countdf[order(countdf$Freq, decreasing=T),]
countdf$gene = factor(countdf$gene, levels=rev(countdf$gene))

# Plot post-filter:
topctdf = head(countdf, 25)
gp = ggplot(topctdf, aes(gene, Freq, label=Freq)) + 
    geom_bar(stat='identity', fill='grey75') + 
    scale_y_continuous(expand=c(0,0), labels=scales::comma) + 
    geom_text(aes(hjust=ifelse(Freq > 1000, 1.1, -.1))) + 
    theme_pubr() + 
    labs(x='Gene', y='Number of candidate fusions', title='Top genes in fusion candidates (unique breakpoints)') + 
    coord_flip()

saveGGplot(gp, paste0(imgpref, 'topgenes_nhits_uqbkpt_barplot'), w=5, h=4.5)


# Next flag the other artifact-prone genes:
uqsymbols = unique(fs.genes)
mito.genes = uqsymbols[c(
        grep("^MT-", uqsymbols), 
        grep("^MTND", uqsymbols), 
        grep("^MTRNR", uqsymbols))]
ribo.genes = unique(uqsymbols[c(
        grep("^RPL", uqsymbols), 
        grep("^RPS", uqsymbols), 
        grep("^RP[0-9]", uqsymbols))])
pseudo.genes = unique(uqsymbols[c(
        grep("^A[CPLF][0-9]*\\.[0-9]", uqsymbols))])
ig.loc = uqsymbols[grep("^IG[HL].*ext",uqsymbols)]
flagged.symbols = c(mito.genes, ribo.genes, 
    pseudo.genes, ig.loc, 'MALAT1', 'SNHG14')

sub.fsdf$LeftFlagged = sub.fsdf$LeftSymbol %in% flagged.symbols
sub.fsdf$RightFlagged = sub.fsdf$RightSymbol %in% flagged.symbols

# Subset again:
sub.fsdf = sub.fsdf[(!sub.fsdf$LeftFlagged) & (!sub.fsdf$RightFlagged),]
fs.genes = c(sub.fsdf$RightSymbol, sub.fsdf$LeftSymbol)
countdf = data.frame(table(fs.genes))
colnames(countdf)[1] = 'gene'
countdf = countdf[order(countdf$Freq, decreasing=T),]
countdf$gene = factor(countdf$gene, levels=rev(countdf$gene))


# Annotate and merge to sample level:
# -----------------------------------
# Add the barcodes for each cell:
bcfsdf = merge(sub.fsdf, unique(bcdf[,c('fsid','CB')]), all.x=TRUE)
nrow(sub.fsdf) == nrow(bcfsdf)

# Aggregate fusions to sample level, merging lanes:
merged.fsdf = aggregate(JunctionReadCount ~ LeftBreakpoint + RightBreakpoint + LeftSymbol + RightSymbol +
    SpliceType + projid + sample + CB + annots, bcfsdf, sum)
dim(merged.fsdf)


# Further process fusions by getting distance, filtering STAR-flagged:
# --------------------------------------------------------------------
# Process into subcategories:
merged.fsdf$fs.set = substr(merged.fsdf$annots, 2, 6)
merged.fsdf = merged.fsdf[merged.fsdf$fs.set %in% c('INTER','INTRA'),]

# Distance:
getPos = function(x){
    x = strsplit(x,":")[[1]][2]
    as.numeric(x) }

merged.fsdf$LeftPosition = sapply(merged.fsdf$LeftBreakpoint, getPos)
merged.fsdf$RightPosition = sapply(merged.fsdf$RightBreakpoint, getPos)
merged.fsdf$Distance = NA
intra.ind = merged.fsdf$fs.set == 'INTRA'
merged.fsdf$Distance[intra.ind] = abs(merged.fsdf$RightPosition - merged.fsdf$LeftPosition)[intra.ind]

# Write the filtered fusions:
write.table(merged.fsdf, procfile, quote=F, row.names=F, col.names=T, sep="\t")


# Repeat for non-merged:
# ----------------------
bcfsdf$fs.set = substr(bcfsdf$annots, 2, 6)
bcfsdf = bcfsdf[bcfsdf$fs.set %in% c('INTER','INTRA'),]
bcfsdf$LeftPosition = sapply(bcfsdf$LeftBreakpoint, getPos)
bcfsdf$RightPosition = sapply(bcfsdf$RightBreakpoint, getPos)
bcfsdf$Distance = NA
intra.ind = (bcfsdf$fs.set == 'INTRA')
bcfsdf$Distance[intra.ind] = abs(bcfsdf$RightPosition - bcfsdf$LeftPosition)[intra.ind]

# Write out the filtered fusions, pre-merge:
write.table(bcfsdf, filtfile, quote=F, row.names=F, col.names=T, sep="\t")