#!/usr/bin/R # ------------------------------------------------------ # Load, process, flag, and filter fusions for 10x human: # - follows analysis/filtering done for P301s/smart-seq # Updated: 02/02/23 # ------------------------------------------------------ library(cbrbase) library(ggplot2) library(ggpubr) set_proj('DEVTRAJ', 'mosaicism') source(paste0(bindir, 'mosaicism/fusions_AD430/load_metadata.R')) source(paste0(bindir, 'multiRegion/auxiliary_plotting_settings.R')) # Directories: plotdir = paste0(imgdir, 'fusions/') imgpref = paste0(plotdir, 'AD430_fusions_') cmd = paste('mkdir -p', plotdir) system(cmd) # Load the preliminary fusion files: # ---------------------------------- fsdir = paste0(dbdir, 'fusions_AD430/') prefix = 'AD430.aggregated.fusion_predictions' predfile = paste0(fsdir, prefix, '.tsv') bcfile = paste0(fsdir, prefix, '.byread.withBC.tsv') procfile = paste0(fsdir, prefix, '.processed.tsv') filtfile = paste0(fsdir, prefix, '.filtered.nonmerge.tsv') fsdf = read.delim(predfile, header=T) bcdf = read.delim(bcfile, header=T) fsdf$fsid = with(fsdf, paste(fusionID, projid, sample, lane, repl, sep="-")) bcdf$fsid = with(bcdf, paste(fusionID, projid, sample, lane, repl, sep="-")) # Flag reads mapped to multiple fusion events: # -------------------------------------------- flag.read.df = agg.rename(uqID ~ Read, bcdf, length, 'nfusion') flag.reads = flag.read.df$Read[flag.read.df$nfusion > 1] bcdf$flag.nonunique = bcdf$Read %in% flag.reads # Count how many reads to remove: flag.nu = aggregate(flag.nonunique ~ fsid + JunctionReadCount, bcdf, sum) print(paste("[STATUS] Flagged", length(flag.reads), "reads")) # Check for repeated fusions across cells: # ---------------------------------------- # First, flag breakpoints that repeat across cells or mice: # Right: agg.bcdf = aggregate(cbind(sample, lane, CB, UMI) ~ RightBreakpoint, bcdf, function(x){length(unique(x))}) agg.bcdf$flag = (agg.bcdf$sample > 1) | (agg.bcdf$CB > 1) flag.right = agg.bcdf$RightBreakpoint[agg.bcdf$flag] # Left: agg.bcdf = aggregate(cbind(sample, lane, CB, UMI) ~ LeftBreakpoint, bcdf, function(x){length(unique(x))}) agg.bcdf$flag = (agg.bcdf$sample > 1) | (agg.bcdf$CB > 1) flag.left = agg.bcdf$LeftBreakpoint[agg.bcdf$flag] # Add to bcdf: bcdf$flag.bkpt = (bcdf$LeftBreakpoint %in% flag.left) | (bcdf$RightBreakpoint %in% flag.right) print(paste("[STATUS] Flagged", length(flag.right), '(right) and', length(flag.left), "(left) repeated breakpoints")) # Check for highly promiscuous/recurring genes: # --------------------------------------------- fsdf$LeftSymbol = sub("\\^.*","", fsdf$LeftGene) fsdf$RightSymbol = sub("\\^.*","", fsdf$RightGene) # Merge lanes before call promiscuous breaks: sample.fsdf = unique(fsdf[,c('sample','RightBreakpoint','LeftBreakpoint', 'RightSymbol','LeftSymbol')]) # Plot the number of called fusions in the dataset: fs.genes = c(sample.fsdf$RightSymbol, sample.fsdf$LeftSymbol) countdf = data.frame(table(fs.genes)) colnames(countdf)[1] = 'gene' countdf = countdf[order(countdf$Freq, decreasing=T),] countdf$gene = factor(countdf$gene, levels=rev(countdf$gene)) topctdf = head(countdf, 25) gp = ggplot(topctdf, aes(gene, Freq, label=Freq)) + geom_bar(stat='identity', fill='grey75') + scale_y_continuous(expand=c(0,0), labels=scales::comma) + geom_text(aes(hjust=ifelse(Freq > 1000, 1.1, -.1))) + theme_pubr() + labs(x='Gene', y='Number of candidate fusions', title='Top genes in fusion candidates') + coord_flip() saveGGplot(gp, paste0(imgpref, 'topgenes_nhits_barplot'), w=5, h=4.5) # Breakpoints are also problematic (merge lanes before calling): fs.bks = c(sample.fsdf$RightBreakpoint, sample.fsdf$LeftBreakpoint) bkdf = data.frame(table(fs.bks)) colnames(bkdf)[1] = 'bkpt' bkdf = bkdf[order(bkdf$Freq, decreasing=T),] bkdf$bkpt = factor(bkdf$bkpt, levels=rev(bkdf$bkpt)) # Allow bkpt merge by celltype topctdf = head(bkdf, 25) gp = ggplot(topctdf, aes(bkpt, Freq, label=Freq)) + geom_bar(stat='identity', fill='grey75') + scale_y_continuous(expand=c(0,0), labels=scales::comma) + geom_text(aes(hjust=ifelse(Freq > 1000, 1.1, -.1))) + theme_pubr() + labs(x='Breakpoint', y='Number of candidate fusions', title='Top breakpoints') + coord_flip() saveGGplot(gp, paste0(imgpref, 'topbreakpoints_nhits_barplot'), w=5, h=4.5) # Subset the fusion candidates to only high confidence ones: # ---------------------------------------------------------- # First keep only unique breakpoints to eliminate repetitive genes: # NOTE: Almost, except for IGH/IGL, MALAT1, SNGH14, and mito/ribo genes rep.breaks = as.character(bkdf$bkpt[bkdf$Freq >= 2]) rep.breaks = unique(c(rep.breaks, flag.right, flag.left)) sub.fsdf = fsdf[!((fsdf$RightBreakpoint %in% rep.breaks) | (fsdf$LeftBreakpoint %in% rep.breaks)),] # Also remove the non-unique read mappings (although taken care of by bkpts) sub.fsdf = merge(sub.fsdf, flag.nu, all.x=TRUE) ind = (sub.fsdf$JunctionReadCount - sub.fsdf$flag.nonunique) > 0 sub.fsdf = sub.fsdf[ind,] # Re-count unique genes: fs.genes = c(sub.fsdf$RightSymbol, sub.fsdf$LeftSymbol) countdf = data.frame(table(fs.genes)) colnames(countdf)[1] = 'gene' countdf = countdf[order(countdf$Freq, decreasing=T),] countdf$gene = factor(countdf$gene, levels=rev(countdf$gene)) # Plot post-filter: topctdf = head(countdf, 25) gp = ggplot(topctdf, aes(gene, Freq, label=Freq)) + geom_bar(stat='identity', fill='grey75') + scale_y_continuous(expand=c(0,0), labels=scales::comma) + geom_text(aes(hjust=ifelse(Freq > 1000, 1.1, -.1))) + theme_pubr() + labs(x='Gene', y='Number of candidate fusions', title='Top genes in fusion candidates (unique breakpoints)') + coord_flip() saveGGplot(gp, paste0(imgpref, 'topgenes_nhits_uqbkpt_barplot'), w=5, h=4.5) # Next flag the other artifact-prone genes: uqsymbols = unique(fs.genes) mito.genes = uqsymbols[c( grep("^MT-", uqsymbols), grep("^MTND", uqsymbols), grep("^MTRNR", uqsymbols))] ribo.genes = unique(uqsymbols[c( grep("^RPL", uqsymbols), grep("^RPS", uqsymbols), grep("^RP[0-9]", uqsymbols))]) pseudo.genes = unique(uqsymbols[c( grep("^A[CPLF][0-9]*\\.[0-9]", uqsymbols))]) ig.loc = uqsymbols[grep("^IG[HL].*ext",uqsymbols)] flagged.symbols = c(mito.genes, ribo.genes, pseudo.genes, ig.loc, 'MALAT1', 'SNHG14') sub.fsdf$LeftFlagged = sub.fsdf$LeftSymbol %in% flagged.symbols sub.fsdf$RightFlagged = sub.fsdf$RightSymbol %in% flagged.symbols # Subset again: sub.fsdf = sub.fsdf[(!sub.fsdf$LeftFlagged) & (!sub.fsdf$RightFlagged),] fs.genes = c(sub.fsdf$RightSymbol, sub.fsdf$LeftSymbol) countdf = data.frame(table(fs.genes)) colnames(countdf)[1] = 'gene' countdf = countdf[order(countdf$Freq, decreasing=T),] countdf$gene = factor(countdf$gene, levels=rev(countdf$gene)) # Annotate and merge to sample level: # ----------------------------------- # Add the barcodes for each cell: bcfsdf = merge(sub.fsdf, unique(bcdf[,c('fsid','CB')]), all.x=TRUE) nrow(sub.fsdf) == nrow(bcfsdf) # Aggregate fusions to sample level, merging lanes: merged.fsdf = aggregate(JunctionReadCount ~ LeftBreakpoint + RightBreakpoint + LeftSymbol + RightSymbol + SpliceType + projid + sample + CB + annots, bcfsdf, sum) dim(merged.fsdf) # Further process fusions by getting distance, filtering STAR-flagged: # -------------------------------------------------------------------- # Process into subcategories: merged.fsdf$fs.set = substr(merged.fsdf$annots, 2, 6) merged.fsdf = merged.fsdf[merged.fsdf$fs.set %in% c('INTER','INTRA'),] # Distance: getPos = function(x){ x = strsplit(x,":")[[1]][2] as.numeric(x) } merged.fsdf$LeftPosition = sapply(merged.fsdf$LeftBreakpoint, getPos) merged.fsdf$RightPosition = sapply(merged.fsdf$RightBreakpoint, getPos) merged.fsdf$Distance = NA intra.ind = merged.fsdf$fs.set == 'INTRA' merged.fsdf$Distance[intra.ind] = abs(merged.fsdf$RightPosition - merged.fsdf$LeftPosition)[intra.ind] # Write the filtered fusions: write.table(merged.fsdf, procfile, quote=F, row.names=F, col.names=T, sep="\t") # Repeat for non-merged: # ---------------------- bcfsdf$fs.set = substr(bcfsdf$annots, 2, 6) bcfsdf = bcfsdf[bcfsdf$fs.set %in% c('INTER','INTRA'),] bcfsdf$LeftPosition = sapply(bcfsdf$LeftBreakpoint, getPos) bcfsdf$RightPosition = sapply(bcfsdf$RightBreakpoint, getPos) bcfsdf$Distance = NA intra.ind = (bcfsdf$fs.set == 'INTRA') bcfsdf$Distance[intra.ind] = abs(bcfsdf$RightPosition - bcfsdf$LeftPosition)[intra.ind] # Write out the filtered fusions, pre-merge: write.table(bcfsdf, filtfile, quote=F, row.names=F, col.names=T, sep="\t")