#!/usr/bin/R # ----------------------------------------------------------------- # Load gene sets from GO terms and CKp25 signatures for human data: # Updated: 08/30/2022 # ----------------------------------------------------------------- library(cbrbase) set_proj('DEVTRAJ', 'mosaicism') # Load genes from the CK-p25 signatures: # -------------------------------------- sigdir = paste0(dbdir, 'CKP25/CKp25_bulk_RNAseq/signatures/') siglist = list() fnlist = list.files(path=sigdir, pattern='*.csv') for (fn in fnlist){ tag = sub(".csv","",fn) tag = sub("_l2fc1.*","",tag) df = read.delim(paste0(sigdir, fn), header=T, sep=",") df = df[order(df$padj),] siglist[[tag]] = df$hg_symbol } # Load GO terms: # -------------- termdf = read.delim('Annotation/go_terms.mgi', header=F) names(termdf) = c('source', 'GOID', 'GOTERM') termmap = termdf$GOTERM names(termmap) = termdf$GOID gomap = read.delim(gzfile('Annotation/goa_human_mapping.tsv.gz'), header=F) names(gomap) = c('symbol','GOID') gomap = gomap[gomap$GOID %in% termdf$GOID,] gomap$GOTERM = termmap[gomap$GOID] # Load genes from relevant GO annotations: # ---------------------------------------- kwds = c('cohesin', 'repair', 'cell cycle', 'replication', 'senescence', 'neuron', 'synapse', 'DNA damage', 'double-strand break repair', 'excision repair') subkwds = c(names(siglist), 'repair', 'cell cycle', 'replication', 'senescence', 'neuron', 'synapse', 'DNA damage') coh.genes.hg = c('SMC1A', 'SMC3', 'RAD21', 'STAG1', 'STAG2', 'NIPBL', 'WAPL') # 'RIF1') # removed non-canonical RIF1 full.siglist = list() for (kwd in kwds){ subtermdf = termdf[grep(kwd, termdf$GOTERM),] submap = gomap[gomap$GOID %in% subtermdf$GOID,] genes = sort(unique(submap$symbol)) if (kwd == 'cohesin'){ genes = unique(c(genes, coh.genes.hg)) } siglist[[kwd]] = genes # For each subterm calc scores: terms = unique(submap$GOID) lset = lapply(terms, function(x){ unique(submap$symbol[submap$GOID == x]) }) names(lset) = terms # Remove terms with fewer than N genes: dl = c(lapply(lset, length)) terms = names(dl)[dl > 4] lset = lset[terms] full.siglist[[kwd]] = lset } # Write all of the signature genes for separate compute: sglist = lapply(names(siglist), function(x){ data.frame(kwd=x, gene=siglist[[x]]) }) sgdf = do.call(rbind, sglist) # Filter non-human for the table: gencode_version = 'v28lift37.annotation' anno = read.delim(paste0('Annotation/Tss.', gencode_version, '.bed'), header=F, stringsAsFactors=F) names(anno) <- c('chr','tss','ENSG','type','symbol') sgdf = sgdf[sgdf$gene %in% anno$symbol,] write.table(sgdf, 'Annotation/fusion_signature_lists.tsv', quote=F, row.names=F, sep="\t")