#!/usr/bin/R # ------------------------------------ # Load metadata for the CKP25 project: # ------------------------------------ library(cbrbase) set_proj('DEVTRAJ', 'mosaicism') source(paste0(sbindir, 'CKP25/load_metadata.R')) # Load data directly: amat = read.delim(paste0(projdir, 'all_aggregated.counts_mat.tsv'), header=F) cn = scan(paste0(projdir, 'all_aggregated.cn.tsv'), 'c', quiet=TRUE) rn = scan(paste0(projdir, 'all_aggregated.rn.tsv'), 'c', quiet=TRUE) sn = scan(paste0(projdir, 'all_aggregated.rn.symb.tsv'), 'c', quiet=TRUE) sn2 = make.unique(sn) print(paste("Read in matrix:", paste(dim(amat), collapse=' x '))) # Subset to protein coding genes: anno = read.delim('Annotation/gencode.vM25.basic.annotation.genes.tsv', header=F) names(anno) = c('chr','db','elem','start','end','v1','strand','v2','gene','type','symbol') pcgenes = anno$gene[anno$type == 'protein_coding'] gind = which(rn %in% pcgenes) rn = rn[gind] sn2 = sn2[gind] amat = amat[gind,] amat = as.matrix(amat) rownames(amat) = sn2 colnames(amat) = cn marg = colSums(amat) # Keep only the main cells: cind = which(cn %in% meta$cell) amat = amat[,cind] cn = cn[cind] # Make normalized dataset: norm = sweep(amat, 2, colSums(amat) / median(colSums(amat)), '/') norm = log(norm + 1) genes = rownames(norm)