#!/usr/bin/R # ------------------------------------------ # Plot basic stats for the CKP25 mouse data: # ------------------------------------------ library(cbrbase) set_proj('DEVTRAJ', 'mosaicism') library(ggplot2) library(ggpubr) library(ggrepel) library(ggbeeswarm) library(tidyr) library(scales) library(qvalue) source(paste0(sbindir, 'CKP25/load_metadata.R')) imgdir = paste0(img, 'CKP25/') imgdir2 = paste0(imgdir, 'difftl/') imgpref = paste0(imgdir2, 'difftl_') cmd = paste('mkdir -p',imgdir, imgdir2) system(cmd) # --------------------------- # Load in the RNA-level data: # --------------------------- amat = read.delim(paste0(projdir, 'all_aggregated.counts_mat.tsv'), header=F) cn = scan(paste0(projdir, 'all_aggregated.cn.tsv'), 'c', quiet=TRUE) rn = scan(paste0(projdir, 'all_aggregated.rn.tsv'), 'c', quiet=TRUE) sn = scan(paste0(projdir, 'all_aggregated.rn.symb.tsv'), 'c', quiet=TRUE) sn2 = make.unique(sn) # Subset to protein coding genes: anno = read.delim('Annotation/gencode.vM25.basic.annotation.genes.tsv', header=F) names(anno) = c('chr','db','elem','start','end','v1','strand','v2','gene','type','symbol') pcgenes = anno$gene[anno$type == 'protein_coding'] gind = which(rn %in% pcgenes) rn = rn[gind] sn2 = sn2[gind] amat = amat[gind,] amat = as.matrix(amat) rownames(amat) = sn2 colnames(amat) = cn marg = colSums(amat) # Keep only the main cells: cind = which(cn %in% meta$cell) amat = amat[,cind] cn = cn[cind] # ------------------------------------- # Run differential expression analyses: # ------------------------------------- # Differential analysis of cells between timepoints in CKp25 mice # Differential analysis of sub-neuronal cells between timepoints in CKp25 # Differential analysis of sub-neuronal cells by inclusion of p25 (by timepoints) # --------------------------------------------------------------- # Differential analysis of cells between timepoints in CKp25 mice # --------------------------------------------------------------- pvaldf = c() for (celltype in c('Microglia','Oligodendrocyte','Stage 2')){ cellstr = sub(" ","_",tolower(celltype)) ocells = meta$cell[meta$label %in% celltype & meta$genotype == 'CKp25'] cells1 = meta$cell[meta$label %in% celltype & meta$timepoint == '1wk' & meta$genotype == 'CKp25'] cells2 = meta$cell[meta$label %in% celltype & meta$timepoint == '2wk' & meta$genotype == 'CKp25'] smat = amat[,ocells] # Only genes with at least 1% expr pct.cut = 0.01 pct.expr = apply(smat > 0, 1, mean) keep.genes = names(pct.expr)[pct.expr > pct.cut] smat = smat[keep.genes,] print(dim(smat)) # Testable genes: genes = rownames(smat) test.genes = genes test.genes = test.genes[grep("^AC[0-9]*\\.[0-9]*$", test.genes, invert=TRUE)] # Pseudo/unlabeled NTEST = length(test.genes) # Normalize and log1p transform: smarg = apply(smat, 2, sum) nmat = sweep(smat, 2, median(smarg) / smarg, '*') nmat = log(nmat + 1) pvals = rep(1, NTEST) xmean = rep(0, NTEST) ymean = rep(0, NTEST) for (i in 1:NTEST){ gene = test.genes[i] x = nmat[gene, cells1] y = nmat[gene, cells2] xmean[i] = mean(x) ymean[i] = mean(y) pvals[i] = wilcox.test(x, y, alternative='two.sided')$p.value } pvdf = data.frame(label=celltype, testtype='timepoint', symbol=test.genes, mean.in=xmean, mean.out=ymean, pvalue=pvals) pvdf$p.adj = p.adjust(pvals) pvdf$qvalue = qvalue(pvals)$qvalues pvdf$pvalue[is.na(pvdf$pvalue)] = 1 pvdf$p.adj[is.na(pvdf$p.adj)] = 1 pvdf = pvdf[order(pvdf$pvalue), ] # Add to overall: pvaldf = rbind(pvaldf, pvdf) # -------------------- # Make a volcano plot: # -------------------- pc = 0.01 pvdf$col = 1 * (pvdf$qvalue < 0.05) * (1 + 1 * (pvdf$mean.out > pvdf$mean.in)) pvdf$log2fc = log2((pvdf$mean.out + pc)/ (pvdf$mean.in + pc)) labdf = head(pvdf, 20) gplot = ggplot(pvdf, aes(log2fc, -log10(pvalue), color=factor(col))) + geom_point(cex=.5) + theme_pubr() + theme(legend.position='none') + scale_y_continuous(expand=c(0,.1)) + geom_vline(xintercept=0, lty='dashed', col='grey50') + labs(x='log2 FC from 1 to 2 weeks', y='-log10 p-value') + geom_text_repel(data=labdf, aes(log2fc, -log10(pvalue), color=factor(col), label=symbol), size=3, max.overlaps = 20) + scale_color_manual(values=c('0'='grey85','1'=tcols[2],'2'=tcols[6])) ggsave(paste0(imgpref, 'difftl_wilcoxon_timepoints_', cellstr, '_volcano.png'), gplot, dpi=450, units='in', width=3, height=3) # ------------------- # Plot the top genes: # ------------------- NTOP = 20 top.genes = head(pvdf$symbol, NTOP) twide = data.frame(t(nmat[top.genes,])) twide$sample = rownames(twide) tlong = gather(twide, symbol, value, -sample) tlong$timepoint = '1wk' tlong$timepoint [(tlong$sample %in% cells2)] = '2wk' tlong$symbol = sub("\\.","-", tlong$symbol) tlong$symbol = factor(tlong$symbol, levels=top.genes) tlong = tlong[order(tlong$symbol),] tlong$ranked.symbol = paste0(as.numeric(tlong$symbol), '. ',as.character(tlong$symbol)) tlong$ranked.symbol = factor(tlong$ranked.symbol, levels=unique(tlong$ranked.symbol)) NROW = floor(NTOP/5) gplot = ggplot(tlong, aes(timepoint, value, color=timepoint)) + facet_wrap(~ranked.symbol, nrow=NROW) + scale_color_manual(values=c(tsp.col('slateblue'),'slateblue')) + geom_violin(fill=NA) + geom_quasirandom(varwidth=TRUE, size=.4) + labs(x=paste('Time point'), y='Expression (log1p, normalized)', title=paste('Top', NTOP, 'difftl. genes for', celltype)) + theme_pubr() + theme(legend.position='none') ggsave(paste0(imgpref, 'difftl_wilcoxon_timepoints_', cellstr, '_top', NTOP,'.png'), gplot, dpi=450, units='in', width=6, height=4.5 * NROW / 2) # --------------------- # Plot only increasing: # --------------------- spvdf = pvdf[pvdf$mean.out > pvdf$mean.in,] NTOP = 20 top.genes = head(spvdf$symbol, NTOP) twide = data.frame(t(nmat[top.genes,])) twide$sample = rownames(twide) tlong = gather(twide, symbol, value, -sample) tlong$timepoint = '1wk' tlong$timepoint [(tlong$sample %in% cells2)] = '2wk' tlong$symbol = sub("\\.","-", tlong$symbol) tlong$symbol = factor(tlong$symbol, levels=top.genes) tlong = tlong[order(tlong$symbol),] tlong$ranked.symbol = paste0(as.numeric(tlong$symbol), '. ',as.character(tlong$symbol)) tlong$ranked.symbol = factor(tlong$ranked.symbol, levels=unique(tlong$ranked.symbol)) NROW = floor(NTOP/5) gplot = ggplot(tlong, aes(timepoint, value, color=timepoint)) + facet_wrap(~ranked.symbol, nrow=NROW) + scale_color_manual(values=c(tsp.col('slateblue'),'slateblue')) + geom_violin(fill=NA) + geom_quasirandom(varwidth=TRUE, size=.4) + labs(x=paste('Time point'), y='Expression (log1p, normalized)', title=paste('Top', NTOP, 'difftl. genes for', celltype, '(Increasing only)')) + theme_pubr() + theme(legend.position='none') ggsave(paste0(imgpref, 'difftl_wilcoxon_timepoints_incr_', cellstr, '_top', NTOP,'.png'), gplot, dpi=450, units='in', width=6, height=4.5 * NROW / 2) # --------------------- # Plot only decreasing: # --------------------- spvdf = pvdf[pvdf$mean.out < pvdf$mean.in,] NTOP = 20 top.genes = head(spvdf$symbol, NTOP) twide = data.frame(t(nmat[top.genes,])) twide$sample = rownames(twide) tlong = gather(twide, symbol, value, -sample) tlong$timepoint = '1wk' tlong$timepoint [(tlong$sample %in% cells2)] = '2wk' tlong$symbol = sub("\\.","-", tlong$symbol) tlong$symbol = factor(tlong$symbol, levels=top.genes) tlong = tlong[order(tlong$symbol),] tlong$ranked.symbol = paste0(as.numeric(tlong$symbol), '. ',as.character(tlong$symbol)) tlong$ranked.symbol = factor(tlong$ranked.symbol, levels=unique(tlong$ranked.symbol)) NROW = floor(NTOP/5) gplot = ggplot(tlong, aes(timepoint, value, color=timepoint)) + facet_wrap(~ranked.symbol, nrow=NROW) + scale_color_manual(values=c(tsp.col('slateblue'),'slateblue')) + geom_violin(fill=NA) + geom_quasirandom(varwidth=TRUE, size=.4) + labs(x=paste('Time point'), y='Expression (log1p, normalized)', title=paste('Top', NTOP, 'difftl. genes for', celltype, '(Decreasing only)')) + theme_pubr() + theme(legend.position='none') ggsave(paste0(imgpref, 'difftl_wilcoxon_timepoints_decr_', cellstr, '_top', NTOP,'.png'), gplot, dpi=450, units='in', width=6, height=4.5 * NROW / 2) write.table(pvdf, paste0('CKP25/timepoint_difftl_df_', cellstr, '.tsv'), quote=F, row.names=F, col.names=T, sep="\t") } # ---------------------------------------------------- # Differential Ex0-Ex1-Ex2-Ex3 points (in CKp25 mice): # ---------------------------------------------------- ctlist = list( c('Ex0','Stage 1'), c('Stage 1','Stage 2'), c('Ex1','Ex2'), c('Ex2','Ex3'), c('Ex0','Ex1'), c('Ex0','Ex2'), c('Ex0','Ex3')) st1lbl = c('Ex1','Ex2','Ex3') sublabel.col['Stage 1'] = sublabel.col['Ex2'] pvaldf = c() for (ctpair in ctlist){ cellstr = paste(sub(" ","_",tolower(ctpair)), collapse="_vs_") celltype = paste(ctpair, collapse=" vs. ") p1 = ctpair[1] p2 = ctpair[2] if ('Stage 1' == p1){ p1 = st1lbl } if ('Stage 1' == p2){ p2 = st1lbl } ocells = meta$cell[meta$sublabel2 %in% c(p1, p2) & meta$genotype == 'CKp25'] cells1 = meta$cell[meta$sublabel2 %in% p1 & meta$genotype == 'CKp25'] cells2 = meta$cell[meta$sublabel2 %in% p2 & meta$genotype == 'CKp25'] smat = amat[,ocells] # Remove if mean == 0: meanss = apply(smat,1, mean) smat = smat[meanss != 0,] print(dim(smat)) # Testable genes: genes = rownames(smat) test.genes = genes test.genes = test.genes[grep("^AC[0-9]*\\.[0-9]*$", test.genes, invert=TRUE)] # Pseudo/unlabeled # Remove genes with low expr: pctcells = apply(smat[test.genes,] > 0, 1, mean) test.genes = names(pctcells)[pctcells > 0.05] NTEST = length(test.genes) # Normalize and log1p transform: smarg = apply(smat, 2, sum) nmat = sweep(smat, 2, median(smarg) / smarg, '*') nmat = log(nmat + 1) pvals = rep(1, NTEST) xmean = rep(0, NTEST) ymean = rep(0, NTEST) for (i in 1:NTEST){ gene = test.genes[i] x = nmat[gene, cells1] y = nmat[gene, cells2] xmean[i] = mean(x) ymean[i] = mean(y) pvals[i] = wilcox.test(x, y, alternative='two.sided')$p.value } pvdf = data.frame(cell1=ctpair[1], cell2=ctpair[2], testtype='subneuronal', symbol=test.genes, mean.in=xmean, mean.out=ymean, pvalue=pvals) pvdf$p.adj = p.adjust(pvals) pvdf$qvalue = qvalue(pvals)$qvalues pvdf$pvalue[is.na(pvdf$pvalue)] = 1 pvdf$p.adj[is.na(pvdf$p.adj)] = 1 pvdf = pvdf[order(pvdf$pvalue), ] # Add to overall: pvaldf = rbind(pvaldf, pvdf) NTOP = 20 top.genes = head(pvdf$symbol, NTOP) twide = data.frame(t(nmat[top.genes,])) twide$sample = rownames(twide) tlong = gather(twide, symbol, value, -sample) tlong$sublabel = ctpair[1] tlong$sublabel[(tlong$sample %in% cells2)] = ctpair[2] tlong$symbol = sub("\\.","-", tlong$symbol) tlong$symbol = factor(tlong$symbol, levels=top.genes) tlong = tlong[order(tlong$symbol),] tlong$ranked.symbol = paste0(as.numeric(tlong$symbol), '. ',as.character(tlong$symbol)) tlong$ranked.symbol = factor(tlong$ranked.symbol, levels=unique(tlong$ranked.symbol)) sn.cols = sublabel.col[ctpair] NROW = floor(NTOP/5) gplot = ggplot(tlong, aes(sublabel, value, color=sublabel)) + facet_wrap(~ranked.symbol, nrow=NROW) + scale_color_manual(values=sn.cols) + geom_violin(fill=NA) + geom_quasirandom(varwidth=TRUE, size=.4) + labs(x=paste('Sub-celltype'), y='Expression (log1p, normalized)', title=paste('Top', NTOP, 'difftl. genes for', celltype)) + theme_pubr() + theme(legend.position='none') ggsave(paste0(imgpref, 'difftl_wilcoxon_subneuronal_', cellstr, '_top', NTOP,'.png'), gplot, dpi=450, units='in', width=6, height=4.5 * NROW / 2) # Plot only increasing: spvdf = pvdf[pvdf$mean.out > pvdf$mean.in,] NTOP = 20 top.genes = head(spvdf$symbol, NTOP) twide = data.frame(t(nmat[top.genes,])) twide$sample = rownames(twide) tlong = gather(twide, symbol, value, -sample) tlong$sublabel = ctpair[1] tlong$sublabel [(tlong$sample %in% cells2)] = ctpair[2] tlong$symbol = sub("\\.","-", tlong$symbol) tlong$symbol = factor(tlong$symbol, levels=top.genes) tlong = tlong[order(tlong$symbol),] tlong$ranked.symbol = paste0(as.numeric(tlong$symbol), '. ',as.character(tlong$symbol)) tlong$ranked.symbol = factor(tlong$ranked.symbol, levels=unique(tlong$ranked.symbol)) NROW = floor(NTOP/5) gplot = ggplot(tlong, aes(sublabel, value, color=sublabel)) + facet_wrap(~ranked.symbol, nrow=NROW) + scale_color_manual(values=sn.cols) + geom_violin(fill=NA) + geom_quasirandom(varwidth=TRUE, size=.4) + labs(x=paste('Sub-celltype'), y='Expression (log1p, normalized)', title=paste('Top', NTOP, 'difftl. genes for', celltype, '(Increasing only)')) + theme_pubr() + theme(legend.position='none') ggsave(paste0(imgpref, 'difftl_wilcoxon_subneuronal_incr_', cellstr, '_top', NTOP,'.png'), gplot, dpi=450, units='in', width=6, height=4.5 * NROW / 2) # Plot only decreasing: spvdf = pvdf[pvdf$mean.out < pvdf$mean.in,] NTOP = 20 top.genes = head(spvdf$symbol, NTOP) twide = data.frame(t(nmat[top.genes,])) twide$sample = rownames(twide) tlong = gather(twide, symbol, value, -sample) tlong$sublabel = ctpair[1] tlong$sublabel [(tlong$sample %in% cells2)] = ctpair[2] tlong$symbol = sub("\\.","-", tlong$symbol) tlong$symbol = factor(tlong$symbol, levels=top.genes) tlong = tlong[order(tlong$symbol),] tlong$ranked.symbol = paste0(as.numeric(tlong$symbol), '. ',as.character(tlong$symbol)) tlong$ranked.symbol = factor(tlong$ranked.symbol, levels=unique(tlong$ranked.symbol)) NROW = floor(NTOP/5) gplot = ggplot(tlong, aes(sublabel, value, color=sublabel)) + facet_wrap(~ranked.symbol, nrow=NROW) + scale_color_manual(values=sn.cols) + geom_violin(fill=NA) + geom_quasirandom(varwidth=TRUE, size=.4) + labs(x=paste('Sub-celltype'), y='Expression (log1p, normalized)', title=paste('Top', NTOP, 'difftl. genes for', celltype, '(Decreasing only)')) + theme_pubr() + theme(legend.position='none') ggsave(paste0(imgpref, 'difftl_wilcoxon_subneuronal_decr_', cellstr, '_top', NTOP,'.png'), gplot, dpi=450, units='in', width=6, height=4.5 * NROW / 2) } # Write results out: write.table(pvaldf, paste0('CKP25/subneuronal_state_difftl_df.tsv'), quote=F, row.names=F, col.names=T, sep="\t") # Plot specific genes: col.paired = brewer.pal(12, 'Paired') degcols = c('NS'='grey90','Down'=col.paired[2],'Up'=col.paired[6]) pvdf = pvaldf[pvaldf$cell1 == 'Ex0' & pvaldf$cell2 == 'Stage 1',] pvdf = unique(pvdf) pvdf$lr = log2(pvdf$mean.out / pvdf$mean.in) pvdf$col = ifelse(pvdf$qvalue < 0.05, ifelse(pvdf$lr > 0, 'Up', 'Down'), 'NS') gp = ggplot(pvdf, aes(log2(mean.out / mean.in), -log10(pvalue), color=col)) + scale_y_continuous(expand = c(0,0)) + scale_color_manual(values=degcols) + geom_point() + geom_vline(xintercept=0, lty='dotted') + theme_pubr() ggsave(paste0(imgpref, 'sel_volc_ex0stage1.png'), gp, dpi=450, units='in', width=8, height=5) # Plot specific genes: genes = c('H1f0', 'H1f10','H2az2','H2ax', 'Lmnb1') df = data.frame(t(amat[genes,meta$cell])) df$cell = rownames(df) df = gather(df, gene, expr, -cell) df = merge(df, meta[,c('sublabel2','cell','genotype')]) df = df[(df$sublabel2 != 'Batch'),] df$bin = 1 * (df$expr > 0) gplot = ggplot(df, aes(sublabel2,expr)) + facet_wrap(~gene, scales='free_y') + geom_violin() + geom_boxplot() + theme_pubr() + theme(axis.text.x=element_text(angle=90,vjust=.5, hjust=1)) ggsave(paste0(imgpref, 'sel_histone_genes.png'), gplot, dpi=450, units='in', width=8, height=5) # By percent cells expressing; pctdf = aggregate(bin ~ gene + sublabel2, df, mean) gplot = ggplot(pctdf, aes(sublabel2,bin, fill=sublabel2)) + facet_wrap(~gene) + geom_bar(stat='identity') + theme_pubr() + scale_fill_manual(values=sublabel.col) + scale_y_continuous(expand=c(0,0), labels=scales::percent) + labs(x='Sub-celltype',y='Percent of cells with a transcript') + theme(axis.text.x=element_text(angle=90,vjust=.5, hjust=1)) + theme(legend.position='none') ggsave(paste0(imgpref, 'sel_histone_genes_pctbar.png'), gplot, dpi=450, units='in', width=8, height=5) # TODO By genotype: # By percent cells expressing; pctdf = aggregate(bin ~ gene + sublabel2 + genotype, df, mean) gplot = ggplot(pctdf, aes(sublabel2,bin, fill=sublabel2)) + facet_grid(genotype~gene) + geom_bar(stat='identity', position='dodge') + theme_pubr() + scale_fill_manual(values=sublabel.col) + scale_y_continuous(expand=c(0,0), labels=scales::percent) + labs(x='Sub-celltype',y='Percent of cells with a transcript') + theme(axis.text.x=element_text(angle=90,vjust=.5, hjust=1)) + theme(legend.position='none') ggsave(paste0(imgpref, 'sel_histone_genes_pctbar_geno.png'), gplot, dpi=450, units='in', width=8, height=5)