#!/usr/bin/R # ------------------------------------------ # Address R1 / R2 concerns about stage 1 / stage 2 signatures # - Look at signatures in single-cell (decompose signatures) # - Look specifically at Ccl2 / Cxcl10 / Nfkb / p65 (Rela) # - Compare directly to microglia - show not driven by microglial contamination # - Plot the p25 expression vs. Camk2a expression # Updated: 08/23/21 # ------------------------------------------ library(cbrbase) set_proj('DEVTRAJ', 'mosaicism') library(ggplot2) library(ggpubr) library(ggbeeswarm) library(tidyr) library(scales) library(RColorBrewer) source(paste0(sbindir, 'CKP25/load_metadata.R')) # Directories: imgdir = paste0(img, 'CKP25/') imgdir2 = paste0(imgdir, 'sigs/') imgpref = paste0(imgdir2, 'review_') cmd = paste('mkdir -p',imgdir, imgdir2) system(cmd) # --------------------------- # Load in the RNA-level data: # --------------------------- amat = read.delim(paste0(projdir, 'all_aggregated.counts_mat.tsv'), header=F) cn = scan(paste0(projdir, 'all_aggregated.cn.tsv'), 'c', quiet=TRUE) rn = scan(paste0(projdir, 'all_aggregated.rn.tsv'), 'c', quiet=TRUE) sn = scan(paste0(projdir, 'all_aggregated.rn.symb.tsv'), 'c', quiet=TRUE) sn2 = make.unique(sn) # Subset to protein coding genes: anno = read.delim('Annotation/gencode.vM25.basic.annotation.genes.tsv', header=F) names(anno) = c('chr','db','elem','start','end','v1','strand','v2','gene','type','symbol') pcgenes = anno$gene[anno$type == 'protein_coding'] gind = which(rn %in% pcgenes) rn = rn[gind] sn2 = sn2[gind] amat = amat[gind,] amat = as.matrix(amat) rownames(amat) = sn2 colnames(amat) = cn marg = colSums(amat) # Keep only the main cells: cind = which(cn %in% meta$cell) amat = amat[,cind] cn = cn[cind] # For norm: marg = colSums(amat) norm = sweep(amat, 2, marg / 10000, '/') norm = log(norm + 1) # ------------------------------------------------------------------------------------ # For reviewer 2 - is the p25 (CDK5R1) expression uncoupled from the CAMK2A expression # ------------------------------------------------------------------------------------ df = data.frame(cell=meta$cell, Cdk5r1=norm['Cdk5r1',meta$cell], Cdkn1a=norm['Cdkn1a',meta$cell], Apoe=norm['Apoe',meta$cell], Camk2a=norm['Camk2a', meta$cell]) df = merge(df, meta[,c('cell','sublabel','mouse_id','genotype','has_p25')]) subdf = df[df$Cdk5r1 > 0 & df$Camk2a > 0 & df$genotype == 'CKp25',] g1 = ggplot(subdf, aes(Cdk5r1, Camk2a, color=sublabel)) + scale_color_manual(values=sublabel.col) + facet_wrap(~genotype) + geom_point() + stat_regline_equation(color='black', aes(label = paste(..eq.label.., ..adj.rr.label.., sep = "~~~~"))) + geom_smooth(color='black', method='lm') + theme_pubr() subdf = df[df$Cdk5r1 > 0 & df$Cdkn1a > 0 & df$genotype == 'CKp25',] g2 = ggplot(subdf, aes(Cdk5r1, Cdkn1a, color=sublabel)) + scale_color_manual(values=sublabel.col) + facet_wrap(~genotype) + geom_point() + geom_smooth(color='black', method='lm') + stat_regline_equation(color='black', aes(label = paste(..eq.label.., ..adj.rr.label.., sep = "~~~~"))) + theme_pubr() garr = ggarrange(g1, g2, common.legend=TRUE) ggsave(paste0(imgpref, 'p25_vs_Camk2a_nonzero.png'), garr, dpi=300, units='in', width=8, height=5) # ----------------------- # Load in the signatures: # ----------------------- sigdir = 'CKP25/CKp25_bulk_RNAseq/signatures/' fnlist = list.files(path=sigdir, pattern='*.csv') sigdfs = list() siglist = list() enrdf = c() lcwdf = c() for (fn in fnlist){ tag = sub(".csv","",fn) tag = sub("_l2fc1.*","",tag) df = read.delim(paste0(sigdir, fn), header=T, sep=",") if (tag == 'immune_signature'){ names(df)[1] = 'gene' } df = df[order(df$padj),] write.table(df$symbol, paste0(sigdir, tag, '_msigs.tsv'), quote=F, row.names=F, col.names=F, sep="\t") df = df[df$symbol %in% rownames(amat),] siglist[[tag]] = df$symbol sigdfs[[tag]] = df[,c('symbol','log2FoldChange')] # Intersect against various definitions: cs = log(10000 * colSums(amat[df$symbol,]) / marg + 1) enrdf = rbind(enrdf, data.frame(cell=names(cs), enr=cs, tag=tag)) # Using log2FC as weights: weights = 2^df$log2FoldChange / mean(2^df$log2FoldChange) submat = sweep(amat[df$symbol,], 1, weights,'*') cs = log(10000 * colSums(submat) / marg + 1) lcwdf = rbind(lcwdf, data.frame(cell=names(cs), enr=cs, tag=tag)) } enrdf = merge(enrdf, meta) lcwdf = merge(lcwdf, meta) # -------------------------------------------- # Plot the correlation of the signature genes: # -------------------------------------------- tag = 'Stage2_vs_Baseline' ntop = 500 genes = siglist[[tag]][1:ntop] genes = genes[genes %in% rownames(norm)] submat = norm[genes,] scr = cor(t(submat)) ind = which(apply(is.na(scr), 1, sum) != (ntop - 1)) scr = scr[ind,ind] out = scr # out = reord(scr) # out = out[,rownames(out)] # Plot on margin the relative expr in diff clusters meta$ssl.wk = paste0(meta$short.sublabel, ' (', meta$timepoint, ')') tform = make.tform(meta$short.sublabel, norm=T) tform2 = make.tform(meta$ssl.wk, norm=T) avg.mat = norm[,meta$cell] %*% tform avg.mat.wk = norm[,meta$cell] %*% tform2 sidepanel = avg.mat[colnames(out),] cn = colnames(sidepanel) cn = sort(cn[cn != 'Bat.']) sidepanel = sidepanel[,cn] library(circlize) library(ComplexHeatmap) f1 = colorRamp2(seq(0, max(sidepanel), length = 2), c("#EEEEEE", "red")) ht1 = Heatmap(out, use_raster=T, name='cor', width=20, show_column_names=F) ht2 = Heatmap(sidepanel, row_names_gp = gpar(fontsize = 4), # col=f1, cluster_columns=F, column_split=ifelse(!(cn %in% c('OPC','Mic.','Oli.')), 'Neu', 'Glia'), name = "avg.\nexpr.", width=5) png(paste0(imgpref, 'stage2_corr_avgexpr_n',ntop,'.png'), res=400, units='in', width=20, height=15) ht1 + ht2 dev.off() # Subset to only genes that are pretty much maximal in St2: # --------------------------------------------------------- mx = apply(sidepanel[,(colnames(sidepanel) != 'St.2')], 1, max) ratio = sidepanel[,'St.2'] / (mx + 1e-4) sum(ratio > 1) # 136 of 500. mean(ratio > 1) # 15% (724 of 4621) png(paste0(imgpref, 'hist_ratio_n',ntop,'.png'), res=400, units='in', width=8, height=4) par(yaxs='i', xaxs='i') hist(log2(ratio), 50, border='white', xlab='log2(Avg. Expr. in Stage 2 / Max Avg. in any other celltype)') abline(v=0, lty='dashed',col='red') text(2, 10, sum(ratio >= 1), col='red') text(-2, 10, sum(ratio < 1), col='blue') dev.off() subgenes = rownames(sidepanel)[ratio >= 1] f1 = colorRamp2(seq(0, max(sidepanel), length = 2), c("#EEEEEE", "red")) ht1 = Heatmap(out[subgenes, subgenes], use_raster=T, name='cor', width=20, show_column_names=F) ht2 = Heatmap(sidepanel[subgenes,], # col=f1, row_names_gp = gpar(fontsize = 7), cluster_columns=F, column_split=ifelse(!(cn %in% c('OPC','Mic.','Oli.')), 'Neu', 'Glia'), name = "side", width=5) png(paste0(imgpref, 'stage2_corr_avgexpr_top_inst2_n',ntop,'.png'), res=400, units='in', width=15, height=12) ht1 + ht2 dev.off() # Subset to only genes that are maximal RELATIVE to neurons: # ---------------------------------------------------------- mx = apply(sidepanel[,c('Ex0','Ex1','Ex2','Ex3','In0','In1')], 1, max) ratio = sidepanel[,'St.2'] / (mx + 1e-4) sum(ratio > 1) # 180 of 500. mean(ratio > 1) # 22% (1057 of 4621) png(paste0(imgpref, 'hist_ratio_vex_n',ntop,'.png'), res=400, units='in', width=8, height=4) par(yaxs='i', xaxs='i') hist(log2(ratio), 50, border='white', xlab='log2(Avg. Expr. in Stage 2 / Max Avg. in any other neuronal celltype)') abline(v=0, lty='dashed',col='red') text(2, 10, sum(ratio >= 1), col='red') text(-2, 10, sum(ratio < 1), col='blue') dev.off() ratio2 = sidepanel[,'St.2'] / (sidepanel[,'Ex0'] + 1e-4) sum(ratio2 > 1) # 180 of 500. mean(ratio2 > 1) # 22% (1057 of 4621) png(paste0(imgpref, 'hist_ratio_vex0_n',ntop,'.png'), res=400, units='in', width=8, height=4) par(yaxs='i', xaxs='i') hist(log2(ratio2), 50, border='white', xlab='log2(Avg. Expr. in Stage 2 / Avg. Expr. in Ex0)') abline(v=0, lty='dashed',col='red') text(2, 10, sum(ratio2 >= 1), col='red') text(-2, 10, sum(ratio2 < 1), col='blue') dev.off() subgenes2 = rownames(sidepanel)[ratio >= 1] f1 = colorRamp2(seq(0, max(sidepanel), length = 2), c("#EEEEEE", "red")) ht1 = Heatmap(out[subgenes2, subgenes2], use_raster=T, name='cor', width=20, show_column_names=F) ht2 = Heatmap(sidepanel[subgenes2,], # col=f1, row_names_gp = gpar(fontsize = 5), cluster_columns=F, column_split=ifelse(!(cn %in% c('OPC','Mic.','Oli.')), 'Neu', 'Glia'), name = "side", width=5) png(paste0(imgpref, 'stage2_corr_avgexpr_top_vex_inst2_n',ntop,'.png'), res=400, units='in', width=15, height=12) ht1 + ht2 dev.off() # Difference ~ 300 diff = subgenes2[!(subgenes2 %in% subgenes)] # Look at the immune / inflammatory signatures in these cell types: # ----------------------------------------------------------------- inflamm_sig = c('Ccl2','Ccl20','Cxcl10','Cxcl16','Il1a','Il6','Il15','Il18', 'Ifitm3','Ifnar1','Ifngr1','Irf7','Irf9','Jak3','Myd88', 'Nfkbia','Rela','Relb','Socs3','Tir3','Traf2', 'Psmb8','Psmb9', 'Cgas','Ddx41','Ddx58','Ifih1','Ifit1', 'Isg20','Oasl2','Pkr','Zbp1', 'Cdk5','Cdk5r1','Cdkn1a','Cdkn2a') genes = rownames(avg.mat) igenes = genes[genes %in% inflamm_sig] state = rep("", length(igenes)) state[igenes %in% c('Ccl2','Ccl20','Cxcl10','Cxcl16','Il1a','Il6','Il15','Il18')] = 'Cytokine' state[igenes %in% c('Ifitm3','Ifnar1','Ifngr1','Irf7','Irf9','Jak3','Myd88', 'Nfkbia','Rela','Relb','Socs3','Tir3','Traf2')] = 'ImmuneReg' state[igenes %in% c('Psmb8','Psmb9')] = 'Inflam.' state[igenes %in% c('Cgas','Ddx41','Ddx58','Ifih1','Ifit1', 'Isg20','Oasl2','Pkr','Zbp1')] = 'Nuc. Acid Sens.' state[igenes %in% c('Cdk5','Cdk5r1','Cdkn1a','Cdkn2a')] = 'Cyclin' # CKp25 cells only: ind = meta$genotype == 'CKp25' tform2 = make.tform(meta$ssl.wk[ind], norm=T) avg.mat.wk = norm[,meta$cell[ind]] %*% tform2 imat = avg.mat.wk[igenes,] cn = colnames(imat) cn = sort(cn[!(cn %in% c('Bat. (1wk)','Bat. (2wk)'))]) imat = imat[,cn] f1 = colorRamp2(seq(0, max(imat), length = 2), c("#EEEEEE", "red")) ct = rep('Glial', length(cn)) ct[c(grep("^Ex", cn), grep("^In", cn))] = "Neuronal" ct[c(grep("^St", cn))] = "Neuronal" library(viridis) ht2 = Heatmap(imat, name='avg. expr.', col=viridis(100), cluster_columns=F, column_split=ct, row_split=state) png(paste0(imgpref, 'major_stage2_avgheatmap.png'), res=400, units='in', width=6, height=6) ht2 dev.off() # Violin plots of the same genes: idf = data.frame(norm[igenes,meta$cell]) colnames(idf) = meta$cell idf$gene = rownames(idf) ilong = gather(idf, cell, val, -gene) ilong = merge(ilong, meta[,c('cell','short.sublabel')]) ggplot(ilong, aes(short.sublabel, val)) + facet_wrap(~gene) + geom_violin() # Plot the margins of the expression matrix: # ------------------------------------------ png(paste0(imgpref, 'expr_margins.png'), res=400, units='in', width=9, height=4) par(yaxs='i', xaxs='i') par(mar=c(4,4,1, 1)) par(mfrow=c(1,2)) hist(colSums(amat > 0), 50, border='white', xlab='Number of expressed genes', main='') mx = round(mean(colSums(amat > 0)),2) abline(v=mx, lty='dashed',col='red') text(mx * 1.2, 40, mx, col='red', adj=0) hist(colSums(amat), 50, border='white', xlab='Number of reads', main='') mx = round(mean(colSums(amat)),2) abline(v=mx, lty='dashed',col='red') text(mx * 1.2, 40, mx, col='red', adj=0) dev.off()