#!/usr/bin/R # ----------------------------------------------- # Load gene sets from GO terms and p25 signatures # Updated: 08/30/2022 # ----------------------------------------------- library(cbrbase) set_proj('DEVTRAJ', 'mosaicism') source(paste0(sbindir, 'CKP25/load_metadata.R')) # Load genes from the CK-p25 signatures: # -------------------------------------- sigdir = 'CKP25/CKp25_bulk_RNAseq/signatures/' siglist = list() fnlist = list.files(path=sigdir, pattern='*.csv') for (fn in fnlist){ tag = sub(".csv","",fn) tag = sub("_l2fc1.*","",tag) df = read.delim(paste0(sigdir, fn), header=T, sep=",") df = df[order(df$padj),] siglist[[tag]] = df$symbol } # Load histone signature: # ----------------------- histfile = paste0(sigdir, 'histone_genelist_withexpression.tsv') df = read.delim(histfile, sep="\t") df = df[!is.na(df$padj_S2),] df = df[df$padj_S2 < 0.1,] siglist[['histone']] = df$name # Load mouse GO annotations: # -------------------------- termdf = read.delim('Annotation/go_terms.mgi', header=F) names(termdf) = c('source', 'GOID', 'GOTERM') termmap = termdf$GOTERM names(termmap) = termdf$GOID gomap = read.delim(gzfile('Annotation/gene_association.mgi.gz'), header=F, skip=3) gomap = gomap[,c(3:7)] names(gomap) = c('symbol','action','GOID','GOTERM','evcode') gomap = gomap[gomap$symbol %in% rownames(norm),] gomap = gomap[gomap$GOID %in% termdf$GOID,] # Load genes from relevant GO annotations: # ---------------------------------------- kwds = c('cohesin', 'repair', 'cell cycle', 'replication', 'senescence', 'neuron', 'synapse', 'DNA damage', 'double-strand break repair', 'excision repair', 'lamin') subkwds = c(names(siglist), 'repair', 'cell cycle', 'replication', 'senescence', 'neuron', 'synapse', 'DNA damage') coh.genes.mm = c('Smc1a', 'Smc3', 'Rad21', 'Stag1', 'Stag2', 'Nipbl', 'Wapl', 'Rif1') full.siglist = list() for (kwd in kwds){ if (kwd == 'lamin') { kwdsrch = paste0("^", kwd, "[ -]") subtermdf = termdf[grep(kwdsrch, termdf$GOTERM),] kwdsrch = paste0(" ", kwd, "[ -]") subtermdf = rbind(subtermdf, termdf[grep(kwdsrch, termdf$GOTERM),]) } else { subtermdf = termdf[grep(kwd, termdf$GOTERM),] } submap = gomap[gomap$GOID %in% subtermdf$GOID,] genes = unique(submap$symbol) if (kwd == 'cohesin'){ genes = unique(c(genes, coh.genes.mm)) } siglist[[kwd]] = genes # For each subterm calc scores: terms = unique(submap$GOID) lset = lapply(terms, function(x){ unique(submap$symbol[submap$GOID == x]) }) names(lset) = terms # Remove terms with fewer than N genes: dl = c(lapply(lset, length)) terms = names(dl)[dl > 4] lset = lset[terms] full.siglist[[kwd]] = lset } nams = termmap[terms] # Save genes: siglist.rds = paste0(dbdir, 'mouse_genefusions_signature_genesets.Rds') siglist.tsv = paste0(dbdir, 'mouse_genefusions_signature_genesets.tsv.gz') if (!file.exists(siglist.rds)){ sigdf = c() for (kwd in names(siglist)){ df = data.frame(symbol=siglist[[kwd]], kwd=kwd) sigdf = rbind(sigdf, df) } saveRDS(siglist, file=siglist.rds) write.table(sigdf, gzfile(siglist.tsv), quote=F, row.names=F, sep="\t") }