#!/usr/bin/R # ---------------------------------------------- # Join the processed gene fusion datasets # Updated: 01/27/22 # ---------------------------------------------- library(cbrbase) set_proj('DEVTRAJ') # Libraries: library(tidyr) library(ggplot2) library(ggpubr) library(RColorBrewer) # Load in and process data (saves to matrices): # project = 'ALZ_repl' project = 'ALZ' commandArgs <- function(){ c(project)} source(paste0(bindir, 'mosaicism/load_metadata.R')) # Load in the two cohorts of candidate gene fusion counts: # -------------------------------------------------------- fs1dir = paste0(dbdir,'variants/', 'ALZ', '/fusion/') fs2dir = paste0(dbdir,'variants/', 'ALZ_repl', '/fusion/') # filesuff = 'maxsensitivity_prediction_numbers_nochrM' # Old filtering, insufficient filesuff = 'maxsensitivity_prediction_numbers_wmeta' o1file = paste0(fs1dir, 'ALZ', '_', filesuff, '.tsv') o2file = paste0(fs2dir, 'ALZ_repl', '_', filesuff, '.tsv') ct1df = read.delim(o1file, header=T) ct2df = read.delim(o2file, header=T) ct1df$cohort = 'ALZ' ct2df$cohort = 'ALZ_repl' # Fix the plate mapping: pmap = read.delim('ALZ_repl_plate_mapping.tsv', header=T) ct1df$projid = ct1df$plate ct1df$platform = '40SE' ct2df = merge(ct2df, pmap[,c('plate','projid')], all.x=TRUE) ct2df$platform = '150PE' ct2df$platform[ct2df$plate %in% c('MM','NN','OO','PP')] = '250PE' ctdf = rbind(ct1df, ct2df[,colnames(ct1df)]) countcutoff = 25000 sub.ctdf = ctdf[ctdf$n_counts >= countcutoff,] metamap = unique(ctdf[,c('plate','projid','platform','cohort')]) # Merge the raw fusion datasets as well: # -------------------------------------------------- fssuff = 'maxsensitivity_abridged_fusions_filtered' fs1file = paste0(fs1dir, 'ALZ', '_', fssuff, '.tsv') fs2file = paste0(fs2dir, 'ALZ_repl', '_', fssuff, '.tsv') fs1df = read.delim(fs1file, header=T) fs2df = read.delim(fs2file, header=T) fsdf = rbind(fs1df, fs2df) colnames(fsdf)[colnames(fsdf) == 'projid'] = 'plate' fsdf = merge(fsdf, metamap, all.x=TRUE) # Write out both of these tables: # ------------------------------- outpref = paste0(dbdir,'variants/ALZ_joint_fusion_datasets_') count.tsv = paste0(outpref, filesuff, '.tsv') fusion.tsv = paste0(outpref, fssuff, '.tsv') write.table(ctdf, count.tsv, quote=F, row.names=F, col.names=T, sep="\t") write.table(fsdf, fusion.tsv, quote=F, row.names=F, col.names=T, sep="\t")