for (CHR in chromosomes){
f1<-as.data.frame(data.table::fread(paste0('/home/robin/dxfuse/DIR/PhasingUKB/UKB_PHASING_EXOME_ARRAY/step4_validation/Rares/chr',CHR,'/benchmark.chr',CHR,'.cut0.001.prob_0.5.fq.frequency.switch.txt.gz'), hea=F))
f1$type<-paste0('shapeit5-chr',CHR)
f2<-as.data.frame(data.table::fread(paste0('/home/robin/dxfuse/DIR/PhasingUKB/UKB_PHASING_EXOME_ARRAY/step4_validation/Rares_beagle/chr',CHR,'benchmark_bealge.chr',CHR,'.fq.frequency.switch.txt.gz'), hea=F))
f2$type<-paste0('beagle-chr',CHR)
f0<-rbind(f0, f1,f2)
}
f0<-data.frame()
for (CHR in chromosomes){
f1<-as.data.frame(data.table::fread(paste0('/home/robin/dxfuse/DIR/PhasingUKB/UKB_PHASING_EXOME_ARRAY/step4_validation/Rares/chr',CHR,'/benchmark.chr',CHR,'.cut0.001.prob_0.5.fq.frequency.switch.txt.gz'), hea=F))
f1$type<-paste0('shapeit5-chr',CHR)
f2<-as.data.frame(data.table::fread(paste0('/home/robin/dxfuse/DIR/PhasingUKB/UKB_PHASING_EXOME_ARRAY/step4_validation/Rares_beagle/chr',CHR,'/benchmark_bealge.chr',CHR,'.fq.frequency.switch.txt.gz'), hea=F))
f2$type<-paste0('beagle-chr',CHR)
f0<-rbind(f0, f1,f2)
}
f0<-f0[complete.cases(f0),]
#f<-as.data.frame(data.table::fread('QC/Rares/SER.chr22.cut0.001.fq.frequency.switch.txt.gz'))
f0$bins<-.bincode(f0$V1, breaks=c(0,1,5,10,20,50,100,200,500,1000,2000,5000,10000,20000,50000,100000,150000, 250000, 500000))
f2<-data.frame()
for (i in 1:max(f0$bins, na.rm=T)){
for (type in unique(f0$type)){
idx = which(f0$bins==i & f0$type==type)
f2<-rbind(f2, data.frame(bin=i, ser=sum(f0$V2[idx])/sum(f0$V3[idx]), count=sum(f0$V2[idx]), type=type))
}}
f2$ser<-f2$ser*100
f2<-read.table('Summary_WES_shapeitVSbeagle_f2.txt', hea=T)
f2<-write.table(f2,'Summary_WES_shapeitVSbeagle_f2.txt', quote=F, col.names=T, row.names=F, sep='\t')
FF<-f2
for (CHR in chromosomes){
f2<-FF[FF$type==paste0('beagle-chr',CHR) | FF$type==paste0('shapeit5-chr',CHR),]
#pdf(paste("figure2c_chr", CHR, ".pdf", sep=""), 10,5)
png(paste("Plots/figure2c_chr", CHR, ".png", sep=""), width = 1000, height = 500)
par(mfrow=c(1,2))
lBIN=c("singleton","2-5","6-10","11-20","21-50","51-100","101-200","201-500","501-1k","1k-2k","2k-5k","5k-10k","10k-20k","20k-50k","50k-100k", "100k-150k", "150k-250k", "250k-500k")
s=8
X = 1:(9+s)
X=1:(length(unique(f2$bin)))
ser_bin=c(0,0.1, 0.2,0.5,1.0,2.0,5.0,10.0,20.0,50.0)
f2$type<-as.character(f2$type)
t<-f2$type[f2$ser==max(f2$ser)][1]
plot(X, log10(f2$ser[f2$type==t]), type="n", pch=20, xlab="Minor Allele Count", ylab="Switch Error Rate (%)", col="black", lwd=2, xaxt='n', yaxt='n', main=paste("A. Switch Error Rate"), xlim=c(min(X), max(X)) )
abline(h=log10(ser_bin), col="lightgrey", lty=2)
abline(v=X, col="lightgrey", lty=2)
text(X, par("usr")[3], labels = lBIN[X], srt = 45, adj = c(1.1,1.1), xpd = TRUE, cex=0.75)
#text(X, par("usr")[3], labels = X, srt = 45, adj = c(1.1,1.1), xpd = TRUE, cex=0.75)
axis(2, at=log10(ser_bin), label=ser_bin, las=2)
points(X, log10(f2$ser[f2$type==paste0('shapeit5-chr',CHR)]), type="o", pch=20, col=COLdiff[1], lwd=2)
points(X, log10(f2$ser[f2$type==paste0('beagle-chr',CHR)]), type="o", pch=20, col=COLdiff[2], lwd=2)
legend("topright", fill=COLdiff[1:2], legend=c(paste0('shapeit5-chr',CHR),paste0('beagle-chr',CHR)), bg="white", cex=.5)
plot(X, (f2$ser[f2$type==paste0('shapeit5-chr',CHR)] - f2$ser[f2$type==paste0('beagle-chr',CHR)]) * 100.0 / f2$ser[f2$type==paste0('beagle-chr',CHR)], type="n", pch=20, xlab="Minor Allele Count", ylab = "Switch Error Rate Reduction (%)", col="black", lwd=2, xaxt='n', main=paste("B. SER reduction"), xlim=c(min(X), max(X)),yaxt='n')
abline(h=seq(-50, 50, 5), col="lightgrey", lty=2)
abline(v=X, col="lightgrey", lty=2)
#text(X, par("usr")[3], labels = lBIN[X], srt = 45, adj = c(1.1,1.1), xpd = TRUE, cex=0.75)
text(X, par("usr")[3], labels = X, srt = 45, adj = c(1.1,1.1), xpd = TRUE, cex=0.75)
axis(2, at=seq(-60,60,10), label=seq(-60,60,10), las=2)
points(X, (f2$ser[f2$type==paste0('shapeit5-chr',CHR)] - f2$ser[f2$type==paste0('beagle-chr',CHR)]) * 100.0 / f2$ser[f2$type==paste0('beagle-chr',CHR)], type="o", pch=20, col=COLdiff[2], lwd=2)
#points(X, (SHPser1 - BGLser) * 100.0 / BGLser, type="o", pch=20, col=COLdiff[3], lwd=2)
abline(h=0, col="black")
dev.off()
}
lBIN=c("singleton","2-5","6-10","11-20","21-50","51-100","101-200","201-500","501-1k","1k-2k","2k-5k","5k-10k","10k-20k","20k-50k","50k-100k", "100k-150k", "150k-250k", "250k-500k")
s=8
X = 1:(9+s)
X=1:(length(unique(f2$bin)))
ser_bin=c(0,0.1, 0.2,0.5,1.0,2.0,5.0,10.0,20.0,50.0)
f2$type<-as.character(f2$type)
t<-f2$type[f2$ser==max(f2$ser, na.rm=T)]
plot(X, (f2$ser[f2$type=='Beagle - chr20']), type="n", pch=20, xlab="Minor Allele Count", ylab="Switch Error Rate (%)", col="black", lwd=2, xaxt='n', yaxt='n', main=paste("A. Switch Error Rate"), xlim=c(min(X), max(X)) ,ylim=c(0,20))
abline(v=X, col="lightgrey", lty=2)
text(X, par("usr")[3], labels = lBIN[X], srt = 45, adj = c(1.1,1.1), xpd = TRUE, cex=0.75)
axis(2, at=(ser_bin), label=ser_bin, las=2)
c=0
for (CHR in chromosomes){
c=c+1
points(X, (f2$ser[f2$type==paste0('chr',CHR)]), type="o", pch=20, col=COLdiff[c], lwd=2)
}
points(X, (f2$ser[f2$type=='Beagle - chr20']), type="o", pch=20, col="black", lwd=2, xaxt='n', yaxt='n')
legend("topright", fill=c("black", COLdiff[1:length(chromosomes)]), legend=c("Beagle - chr20", paste0('chr',chromosomes)), bg="white", cex=.5)
dev.off()
dev.off()
s=8
X = 1:(9+s)
X=1:(length(unique(f2$bin)))
ser_bin=c(0,0.1, 0.2,0.5,1.0,2.0,5.0,10.0,20.0,50.0)
f2$type<-as.character(f2$type)
t<-f2$type[f2$ser==max(f2$ser, na.rm=T)]
plot(X, (f2$ser[f2$type=='Beagle - chr20']), type="n", pch=20, xlab="Minor Allele Count", ylab="Switch Error Rate (%)", col="black", lwd=2, xaxt='n', yaxt='n', main=paste("A. Switch Error Rate"), xlim=c(min(X), max(X)) ,ylim=c(0,20))
abline(v=X, col="lightgrey", lty=2)
text(X, par("usr")[3], labels = lBIN[X], srt = 45, adj = c(1.1,1.1), xpd = TRUE, cex=0.75)
axis(2, at=(ser_bin), label=ser_bin, las=2)
c=0
for (CHR in chromosomes){
c=c+1
points(X, (f2$ser[f2$type==paste0('chr',CHR)]), type="o", pch=20, col=COLdiff[c], lwd=2)
}
points(X, (f2$ser[f2$type=='Beagle - chr20']), type="o", pch=20, col="black", lwd=2, xaxt='n', yaxt='n')
head(f2)
f2<-read.table('Summary_WES_shapeitVSbeagle_f2.txt', hea=T)
head(f2)
unique(f2$type)
library(RColorBrewer)
load(file = "data.Rdata")
setwd('~/Dropbox/SHAPEIT5/Figures/Figure2/')
put.fig.letter <- function(label, location="topleft", x=NULL, y=NULL,
offset=c(0, 0), ...) {
if(length(label) > 1) {
warning("length(label) > 1, using label[1]")
}
if(is.null(x) | is.null(y)) {
coords <- switch(location,
topleft = c(0.015,0.98),
topcenter = c(0.5525,0.98),
topright = c(0.985, 0.98),
bottomleft = c(0.015, 0.02),
bottomcenter = c(0.5525, 0.02),
bottomright = c(0.985, 0.02),
c(0.015, 0.98) )
} else {
coords <- c(x,y)
}
this.x <- grconvertX(coords[1] + offset[1], from="nfc", to="user")
this.y <- grconvertY(coords[2] + offset[2], from="nfc", to="user")
text(labels=label[1], x=this.x, y=this.y, xpd=T, ...)
}
log10.axis <- function(side, at, ...) {
at.minor <- log10(outer(1:9, 10^(min(at):max(at))))
lab <- sapply(at, function(i) as.expression(bquote(10^ .(i))))
axis(side=side, at=at.minor, labels=NA, tcl=par("tcl")*0.5, ...)
#axis(side=side, at=at, labels=lab, ...)
}
load(file = "data.Rdata")
COLpair = brewer.pal(12,"Paired")
COLdiff = brewer.pal(8,"Set1")
#LOAD SER BETWEEN ALL VARIANTS
prefix="../../Data/WGS/Beagle5.4/N147754/benchmark_ukb23352_c20_qc_v1.subset.N147754.";
suffix=".fqf.frequency.switch.txt.gz"
prefix="../../Data/WGS/Shapeit5/N147754/benchmark_ukb23352_c20_qc_v1.subset.N147754.";
suffix=".shapeit5.default.fqf.frequency.switch.txt.gz"
library(RColorBrewer)
COLpair = brewer.pal(12,"Paired")
COLdiff = brewer.pal(8,"Set1")
#LOAD SER BETWEEN ALL VARIANTS
prefix="../../Data/WES/Beagle5.4/SER.chr";
suffix=".fq.frequency.switch.txt.gz"
prefix="../../Data/WES/Shapeit5/SER_v2.chr";
suffix=".cut0.001.prob_0.5.fq.frequency.switch.txt.gz"
load(file = "data.Rdata")
head(BGLser1)
head(BGLser)
head(SHPser1)
SHPser1[1:9]
-(SHPser1[1:9] - BGLser1[1:9]) * 100.0 / BGLser1[1:9]
setwd('~/Dropbox/LCimputation_phasing/step3_exome_array/step2_phasing/Benchmark/')
library(ggplot2)
library(RColorBrewer)
COLpair = brewer.pal(12,"Paired")
COLdiff = brewer.pal(8,"Set1")
COLsp = brewer.pal(10,"Spectral")
COLblues = brewer.pal(11,"RdBu")
setwd('~/Dropbox/LCimputation_phasing/step3_exome_array/step2_phasing/Benchmark/')
library(ggplot2)
library(RColorBrewer)
COLpair = brewer.pal(12,"Paired")
COLdiff = brewer.pal(8,"Set1")
COLsp = brewer.pal(10,"Spectral")
COLblues = brewer.pal(11,"RdBu")
# Frequency
chromosomes=1:22
f0<-data.frame()
probs<-c(0.5,0.6,0.7,0.75,0.8,0.85,0.9,0.95,0.99,1)
for (PROB in probs){
for (CHR in chromosomes){
print(CHR); print(PROB); print('---')
f1<-as.data.frame(data.table::fread(paste0('/home/robin/dxfuse/DIR/PhasingUKB/UKB_PHASING_EXOME_ARRAY/step4_validation/Rares_Trios_only/chr',CHR,'/benchmark.chr',CHR,'.cut0.001.prob_',PROB,'.fq.frequency.switch.txt.gz'), hea=F))
f1$type<-paste0('shapeit5-chr',CHR,'-prob',PROB)
f1$method<-'shapeit5'
f1$prob<-PROB
f0<-rbind(f0, f1)
} }
d<-read.table('/home/robin/dxfuse/DIR/PhasingUKB/UKB_PHASING_EXOME_ARRAY/step4_validation/Rares_Trios_only/chr1/benchmark_Trios.chr1.cut0.001.prob_0.5.fq.frequency.switch.txt.gz')
# Frequency
chromosomes=1:22
f0<-data.frame()
probs<-c(0.5,0.6,0.7,0.75,0.8,0.85,0.9,0.95,0.99,1)
for (PROB in probs){
for (CHR in chromosomes){
print(CHR); print(PROB); print('---')
f1<-as.data.frame(data.table::fread(paste0('/home/robin/dxfuse/DIR/PhasingUKB/UKB_PHASING_EXOME_ARRAY/step4_validation/Rares_Trios_only/chr',CHR,'/benchmark_Trios.chr',CHR,'.cut0.001.prob_',PROB,'.fq.frequency.switch.txt.gz'), hea=F))
f1$type<-paste0('shapeit5-chr',CHR,'-prob',PROB)
f1$method<-'shapeit5'
f1$prob<-PROB
f0<-rbind(f0, f1)
} }
# Frequency
chromosomes=1:22
f0<-data.frame()
probs<-c(0.5,0.6,0.7,0.75,0.8,0.85,0.9,0.95,0.99,1)
for (PROB in probs){
for (CHR in chromosomes){
print(CHR); print(PROB); print('---')
f1<-as.data.frame(data.table::fread(paste0('/home/robin/dxfuse/DIR/PhasingUKB/UKB_PHASING_EXOME_ARRAY/step4_validation/Rares_Trios_only/chr',CHR,'/benchmark_Trios.chr',CHR,'.cut0.001.prob_',PROB,'.fq.frequency.switch.txt.gz'), hea=F))
f1$type<-paste0('shapeit5-chr',CHR,'-prob',PROB)
f1$method<-'shapeit5'
f1$prob<-PROB
f0<-rbind(f0, f1)
} }
f0<-f0[complete.cases(f0),]
f0<-f0[f0$V1!=0,]
f0$V4[f0$V3==0]<-NA
f0$V3[f0$V3==0]<-NA
write.table(f0, 'Summary_WES_f0.txt', quote=F, col.names=T, row.names=F, sep='\t')
f0$bins<-.bincode(f0$V1, breaks=c(0,1,5,10,20,50,100,200,500,1000,2000,5000,10000,20000,50000,100000,150000, 250000, 500000))
f2<-data.frame()
for (i in 1:max(f0$bins, na.rm=T)){
for (prob in unique(f0$prob)){
idx = which(f0$bins==i & f0$prob==prob)
f2<-rbind(f2, data.frame(bin=i, ser=sum(f0$V2[idx], na.rm=T)/sum(f0$V3[idx], na.rm=T), count=sum(f0$V2[idx], na.rm=T), prob=prob))
}}
f2$ser<-f2$ser*100
write.table(f2, 'Summary_WES_f2.txt', quote=F, col.names=T, row.names=F, sep='\t')
S<-data.frame()
for (p in unique(f0$prob)){
for (b in 1:9){
tmp<-f0[f0$prob==p & f0$bins==b,]
s0<-data.frame(prob=p, bin=b, prop=sum(f0$V3[f0$prob==p & f0$bins==b], na.rm=T)/sum(f0$V3[f0$prob==0.5 & f0$bins==b], na.rm=T))
S<-rbind(S, s0)
}
}
S<-S[S$prob!=1,]
write.table(S, 'Summary_WES_S.txt', quote=F, col.names=T, row.names=F, sep='\t')
setwd('~/Dropbox/PofO_dnanexus/step5_input_files/Out/')
library(dplyr)
library(reshape2)
library(ggplot2)
'%ni%'=Negate('%in%')
GENE='ENSG00000053918'; GENE2='KCNQ1';PHENO=50
CUTS<-c('0.01,0.001,0.0001','0.01,0.001','0.01','0.001,0.0001','0.01,0.0001')
TESTS<-c('lof,missense,missense:lof,missense:lof:synonymous','lof,missense,missense:lof','lof,missense','lof','missense','missense:lof')
#cut=CUTS[1]; test=TESTS[1]
D<-data.frame()
for (cut in CUTS){
for (test in TESTS){
parent='Pat'
filename=paste0(GENE2,'/',GENE,'.',PHENO,'.',parent,'.out.',cut,'.',test,'.singleAssoc.txt')
tmp<-as.data.frame(data.table::fread(filename, hea=T))
tmp$parent<-parent
pat<-tmp
parent='Mat'
filename=paste0(GENE2,'/',GENE,'.',PHENO,'.',parent,'.out.',cut,'.',test,'.singleAssoc.txt')
tmp<-as.data.frame(data.table::fread(filename, hea=T))
tmp$parent<-parent
mat<-tmp
d<-pat[,c(3,6,9,10,13)]; colnames(d)<-c('SNP','AC_PAT','BETA_PAT','SE_PAT','P_PAT')
d$AC_MAT<-mat$AC_Allele2[match(d$SNP, mat$MarkerID)]
d$BETA_MAT<-mat$BETA[match(d$SNP, mat$MarkerID)]
d$SE_MAT<-mat$SE[match(d$SNP, mat$MarkerID)]
d$P_MAT<-mat$p.value[match(d$SNP, mat$MarkerID)]
for (m in mat$MarkerID[mat$MarkerID %ni% d$SNP]){
tmp<-data.frame(SNP=m, AC_PAT=NA, BETA_PAT=NA, SE_PAT=NA, P_PAT=NA, AC_MAT=mat$AC_Allele2[mat$MarkerID==m], BETA_MAT=mat$BETA[mat$MarkerID==m], SE_MAT=mat$SE[mat$MarkerID==m], P_MAT=mat$p.value[mat$MarkerID==m])
d<-rbind(d, tmp)
}
d$cut<-cut; d$test<-test
D<-rbind(D,d)
}
}
group<-as.data.frame(t(read.table(paste0('../Imprinted_genes_chunks/',GENE,'.groups'))))
colnames(group)<-c('ID','Group'); group$ID<-as.character(group$ID)
group$ID2<-unlist(lapply(as.character(group$ID), FUN=function(x){paste(unlist(strsplit(x,':'))[1],unlist(strsplit(x,':'))[2],unlist(strsplit(x,':'))[3],unlist(strsplit(x,':'))[4],sep='_')}))
group$BP<-unlist(lapply(as.character(group$ID), FUN=function(x){unlist(strsplit(x,':'))[2]}))
D$annotation<-group$Group[match(D$SNP, group$ID2)]
D<-D[order(D$P_PAT),]
write.table(D, 'KCNQ1/Summary.30750.txt', quote=F, col.names=T, row.names=F, sep='\t')
library(RColorBrewer)
COLpair = brewer.pal(12,"Paired")
COLdiff = brewer.pal(8,"Set1")
COLlabel= c(rgb(0.3,0.9,0.4,0.6), rgb(0.3,0.5,0.4,0.6), rgb(0.3,0.1,0.4,0.6))
Ntrio = read.table("dataWGS_trios.txt", head=FALSE)
Nduo = read.table("dataWGS_duos.txt", head=FALSE)
SER = read.table("dataWGS_ser.txt", head=FALSE)
par(mfrow=c(2,2))
Y = Nduo$V1*100/sum(Nduo$V1)
myBar = barplot(Y,space = 0.5, border=F , xlim=c(0,5), ylim=c(0,60), col=COLlabel, ylab="Percentage (%)", names.arg=c("p=0/0", "p=0/1", "p=1/1"), xlab="Parental genotype when kid is 0/1", main="a. Singleton phasing using Duos\nWGS data - N=147,754")
text(myBar, Y+2 , paste("n=", Nduo$V1, sep="") ,cex=1)
Y = Ntrio$V1*100/sum(Ntrio$V1)
myBar = barplot(Y,space = 0.5, border=F , xlim=c(0,5), ylim=c(0,60), col=COLlabel, ylab="Percentage (%)", names.arg=c("f=0/1\nm=0/0", "f=0/0\nm=0/1", "f=0/0\nm=0/0"), xlab="Parental genotypes when kid is 0/1", main="b. Singleton phasing using Trios\nWGS data - N=147,754")
Ntrio = read.table("dataWGS_trios.txt", head=FALSE)
###### COLORS ######
setwd('~/Dropbox/SHAPEIT5/Figures/suppFigure7_singletons/')
COLpair = brewer.pal(12,"Paired")
COLdiff = brewer.pal(8,"Set1")
COLlabel= c(rgb(0.3,0.9,0.4,0.6), rgb(0.3,0.5,0.4,0.6), rgb(0.3,0.1,0.4,0.6))
Ntrio = read.table("dataWGS_trios.txt", head=FALSE)
Nduo = read.table("dataWGS_duos.txt", head=FALSE)
SER = read.table("dataWGS_ser.txt", head=FALSE)
par(mfrow=c(2,2))
Y = Nduo$V1*100/sum(Nduo$V1)
myBar = barplot(Y,space = 0.5, border=F , xlim=c(0,5), ylim=c(0,60), col=COLlabel, ylab="Percentage (%)", names.arg=c("p=0/0", "p=0/1", "p=1/1"), xlab="Parental genotype when kid is 0/1", main="a. Singleton phasing using Duos\nWGS data - N=147,754")
text(myBar, Y+2 , paste("n=", Nduo$V1, sep="") ,cex=1)
Y = Ntrio$V1*100/sum(Ntrio$V1)
myBar = barplot(Y,space = 0.5, border=F , xlim=c(0,5), ylim=c(0,60), col=COLlabel, ylab="Percentage (%)", names.arg=c("f=0/1\nm=0/0", "f=0/0\nm=0/1", "f=0/0\nm=0/0"), xlab="Parental genotypes when kid is 0/1", main="b. Singleton phasing using Trios\nWGS data - N=147,754")
text(myBar, Y+2 , paste("n=", Ntrio$V1, sep="") ,cex=1)
Y=as.vector(c(SER[1,2], SER[1, 3]))
Y = Y*100/sum(Y)
myBar = barplot(Y,space = 0.5, border=F , xlim=c(0,3.5), ylim=c(0,80), col=COLlabel, ylab="Switch error rate (%)", names.arg=c("Incorrect", "Correct"), xlab="Phasing of the singleton", main="c. Singleton phasing using SHAPEIT5\nValidation from Duos\nWGS data - N=147,754")
abline(h=50)
text(myBar, Y+2 , paste("n=", SER[1, 2:3], sep="") ,cex=1)
legend("topleft", bty="n", legend="binomial p-value\n< 2.2e-16")
Y=as.vector(c(SER[2,2], SER[2, 3]))
Y = Y*100/sum(Y)
myBar = barplot(Y,space = 0.5, border=F , xlim=c(0,3.5), ylim=c(0,80), col=COLlabel, ylab="Switch error rate (%)", names.arg=c("Incorrect", "Correct"), xlab="Phasing of the singleton", main="d. Singleton phasing using SHAPEIT5\nValidation from Trios\nWGS data - N=147,754")
abline(h=50)
text(myBar, Y+2 , paste("n=", SER[2, 2:3], sep="") ,cex=1)
legend("topleft", bty="n", legend="binomial p-value\n< 2.2e-16")
SER
SER[1, 2:3]
SER[1,3]-SER[1,2]
par(mfrow=c(2,2))
Y = Nduo$V1*100/sum(Nduo$V1)
myBar = barplot(Y,space = 0.5, border=F , xlim=c(0,5), ylim=c(0,60), col=COLlabel, ylab="Percentage (%)", names.arg=c("p=0/0", "p=0/1", "p=1/1"), xlab="Parental genotype when kid is 0/1", main="a. Singleton phasing using Duos\nWGS data - N=147,754")
text(myBar, Y+2 , paste("n=", Nduo$V1, sep="") ,cex=1)
Y = Ntrio$V1*100/sum(Ntrio$V1)
myBar = barplot(Y,space = 0.5, border=F , xlim=c(0,5), ylim=c(0,60), col=COLlabel, ylab="Percentage (%)", names.arg=c("f=0/1\nm=0/0", "f=0/0\nm=0/1", "f=0/0\nm=0/0"), xlab="Parental genotypes when kid is 0/1", main="b. Singleton phasing using Trios\nWGS data - N=147,754")
text(myBar, Y+2 , paste("n=", Ntrio$V1, sep="") ,cex=1)
Y=as.vector(c(SER[1,2], SER[1, 3]))
Y = Y*100/sum(Y)
myBar = barplot(Y,space = 0.5, border=F , xlim=c(0,3.5), ylim=c(0,80), col=COLlabel, ylab="Switch error rate (%)", names.arg=c("Correct", "Iorrect"), xlab="Phasing of the singleton", main="c. Singleton phasing using SHAPEIT5\nValidation from Duos\nWGS data - N=147,754")
abline(h=50)
text(myBar, Y+2 , paste("n=", c(SER[1, 2], SER[1,3]-SER[1,2]), sep="") ,cex=1)
legend("topleft", bty="n", legend="binomial p-value\n< 2.2e-16")
Y=as.vector(c(SER[2,2], SER[2, 3]))
Y = Y*100/sum(Y)
myBar = barplot(Y,space = 0.5, border=F , xlim=c(0,3.5), ylim=c(0,80), col=COLlabel, ylab="Switch error rate (%)", names.arg=c("Incorrect", "Correct"), xlab="Phasing of the singleton", main="d. Singleton phasing using SHAPEIT5\nValidation from Trios\nWGS data - N=147,754")
abline(h=50)
text(myBar, Y+2 , paste("n=", SER[2, 2:3], sep="") ,cex=1)
legend("topleft", bty="n", legend="binomial p-value\n< 2.2e-16")
par(mfrow=c(2,2))
Y = Nduo$V1*100/sum(Nduo$V1)
myBar = barplot(Y,space = 0.5, border=F , xlim=c(0,5), ylim=c(0,60), col=COLlabel, ylab="Percentage (%)", names.arg=c("p=0/0", "p=0/1", "p=1/1"), xlab="Parental genotype when kid is 0/1", main="a. Singleton phasing using Duos\nWGS data - N=147,754")
text(myBar, Y+2 , paste("n=", Nduo$V1, sep="") ,cex=1)
Y = Ntrio$V1*100/sum(Ntrio$V1)
myBar = barplot(Y,space = 0.5, border=F , xlim=c(0,5), ylim=c(0,60), col=COLlabel, ylab="Percentage (%)", names.arg=c("f=0/1\nm=0/0", "f=0/0\nm=0/1", "f=0/0\nm=0/0"), xlab="Parental genotypes when kid is 0/1", main="b. Singleton phasing using Trios\nWGS data - N=147,754")
text(myBar, Y+2 , paste("n=", Ntrio$V1, sep="") ,cex=1)
Y=as.vector(c(SER[1,2], SER[1, 3]))
Y = Y*100/sum(Y)
myBar = barplot(Y,space = 0.5, border=F , xlim=c(0,3.5), ylim=c(0,80), col=COLlabel, ylab="Switch error rate (%)", names.arg=c("Correct", "Inorrect"), xlab="Phasing of the singleton", main="c. Singleton phasing using SHAPEIT5\nValidation from Duos\nWGS data - N=147,754")
abline(h=50)
text(myBar, Y+2 , paste("n=", c(SER[1, 2], SER[1,3]-SER[1,2]), sep="") ,cex=1)
legend("topleft", bty="n", legend="binomial p-value\n< 2.2e-16")
Y=as.vector(c(SER[2,2], SER[2, 3]))
Y = Y*100/sum(Y)
myBar = barplot(Y,space = 0.5, border=F , xlim=c(0,3.5), ylim=c(0,80), col=COLlabel, ylab="Switch error rate (%)", names.arg=c("Correct", "Incorrect"), xlab="Phasing of the singleton", main="d. Singleton phasing using SHAPEIT5\nValidation from Trios\nWGS data - N=147,754")
abline(h=50)
text(myBar, Y+2 , paste("n=", c(SER[1, 2], SER[1,3]-SER[1,2]), sep="") ,cex=1)
legend("topleft", bty="n", legend="binomial p-value\n< 2.2e-16")
SER
###### COLORS ######
setwd('~/Dropbox/SHAPEIT5/Figures/suppFigure6_sampleSizeWGS/')
library(RColorBrewer)
COLpair = brewer.pal(12,"Paired")
COLdiff = brewer.pal(8,"Set1")
put.fig.letter <- function(label, location="topleft", x=NULL, y=NULL,
offset=c(0, 0), ...) {
if(length(label) > 1) {
warning("length(label) > 1, using label[1]")
}
if(is.null(x) | is.null(y)) {
coords <- switch(location,
topleft = c(0.015,0.98),
topcenter = c(0.5525,0.98),
topright = c(0.985, 0.98),
bottomleft = c(0.015, 0.02),
bottomcenter = c(0.5525, 0.02),
bottomright = c(0.985, 0.02),
c(0.015, 0.98) )
} else {
coords <- c(x,y)
}
this.x <- grconvertX(coords[1] + offset[1], from="nfc", to="user")
this.y <- grconvertY(coords[2] + offset[2], from="nfc", to="user")
text(labels=label[1], x=this.x, y=this.y, xpd=T, ...)
}
letters = c("a","b","c","d","e","f","g","h","i","j","k","l","m","n","o","p","q","r")
REG=read.table("../../Data/WGS/chr20.size4Mb.txt", head=FALSE)
setwd('~/Dropbox/SHAPEIT5/Figures/Figure2/')
load(file = "data.Rdata")
SHPser0
###### COLORS ######
setwd('~/Dropbox/SHAPEIT5/Figures/suppFigure7_singletons/')
library(RColorBrewer)
COLpair = brewer.pal(12,"Paired")
COLdiff = brewer.pal(8,"Set1")
COLlabel= c(rgb(0.3,0.9,0.4,0.6), rgb(0.3,0.5,0.4,0.6), rgb(0.3,0.1,0.4,0.6))
Ntrio = read.table("dataWGS_trios.txt", head=FALSE)
Nduo = read.table("dataWGS_duos.txt", head=FALSE)
SER = read.table("dataWGS_ser.txt", head=FALSE)
par(mfrow=c(2,2))
Y = Nduo$V1*100/sum(Nduo$V1)
myBar = barplot(Y,space = 0.5, border=F , xlim=c(0,5), ylim=c(0,60), col=COLlabel, ylab="Percentage (%)", names.arg=c("p=0/0", "p=0/1", "p=1/1"), xlab="Parental genotype when kid is 0/1", main="a. Singleton phasing using Duos\nWGS data - N=147,754")
text(myBar, Y+2 , paste("n=", Nduo$V1, sep="") ,cex=1)
Y = Ntrio$V1*100/sum(Ntrio$V1)
myBar = barplot(Y,space = 0.5, border=F , xlim=c(0,5), ylim=c(0,60), col=COLlabel, ylab="Percentage (%)", names.arg=c("f=0/1\nm=0/0", "f=0/0\nm=0/1", "f=0/0\nm=0/0"), xlab="Parental genotypes when kid is 0/1", main="b. Singleton phasing using Trios\nWGS data - N=147,754")
text(myBar, Y+2 , paste("n=", Ntrio$V1, sep="") ,cex=1)
Y=as.vector(c(SER[1,2], SER[1,3]-SER[1,2]))
Y = Y*100/sum(Y)
myBar = barplot(Y,space = 0.5, border=F , xlim=c(0,3.5), ylim=c(0,80), col=COLlabel, ylab="Switch error rate (%)", names.arg=c("Correct", "Incorrect"), xlab="Phasing of the singleton", main="c. Singleton phasing using SHAPEIT5\nValidation from Duos\nWGS data - N=147,754")
abline(h=50)
text(myBar, Y+2 , paste("n=", c(SER[1, 2], SER[1,3]-SER[1,2]), sep="") ,cex=1)
legend("topleft", bty="n", legend="binomial p-value\n< 2.2e-16")
Y=as.vector(c(SER[2,2], SER[2,3]-SER[2,2]))
Y = Y*100/sum(Y)
myBar = barplot(Y,space = 0.5, border=F , xlim=c(0,3.5), ylim=c(0,80), col=COLlabel, ylab="Switch error rate (%)", names.arg=c("Correct", "Incorrect"), xlab="Phasing of the singleton", main="d. Singleton phasing using SHAPEIT5\nValidation from Trios\nWGS data - N=147,754")
abline(h=50)
text(myBar, Y+2 , paste("n=", c(SER[2, 2], SER[2,3]-SER[2,2]), sep="") ,cex=1)
legend("topleft", bty="n", legend="binomial p-value\n< 2.2e-16")
pdf("figure.pdf", 8,8)
par(mfrow=c(2,2))
Y = Nduo$V1*100/sum(Nduo$V1)
myBar = barplot(Y,space = 0.5, border=F , xlim=c(0,5), ylim=c(0,60), col=COLlabel, ylab="Percentage (%)", names.arg=c("p=0/0", "p=0/1", "p=1/1"), xlab="Parental genotype when kid is 0/1", main="a. Singleton phasing using Duos\nWGS data - N=147,754")
text(myBar, Y+2 , paste("n=", Nduo$V1, sep="") ,cex=1)
Y = Ntrio$V1*100/sum(Ntrio$V1)
myBar = barplot(Y,space = 0.5, border=F , xlim=c(0,5), ylim=c(0,60), col=COLlabel, ylab="Percentage (%)", names.arg=c("f=0/1\nm=0/0", "f=0/0\nm=0/1", "f=0/0\nm=0/0"), xlab="Parental genotypes when kid is 0/1", main="b. Singleton phasing using Trios\nWGS data - N=147,754")
text(myBar, Y+2 , paste("n=", Ntrio$V1, sep="") ,cex=1)
Y=as.vector(c(SER[1,2], SER[1,3]-SER[1,2]))
Y = Y*100/sum(Y)
myBar = barplot(Y,space = 0.5, border=F , xlim=c(0,3.5), ylim=c(0,80), col=COLlabel, ylab="Switch error rate (%)", names.arg=c("Incorrect", "Correct"), xlab="Phasing of the singleton", main="c. Singleton phasing using SHAPEIT5\nValidation from Duos\nWGS data - N=147,754")
abline(h=50)
text(myBar, Y+2 , paste("n=", c(SER[1, 2], SER[1,3]-SER[1,2]), sep="") ,cex=1)
legend("topleft", bty="n", legend="binomial p-value\n< 2.2e-16")
Y=as.vector(c(SER[2,2], SER[2,3]-SER[2,2]))
Y = Y*100/sum(Y)
myBar = barplot(Y,space = 0.5, border=F , xlim=c(0,3.5), ylim=c(0,80), col=COLlabel, ylab="Switch error rate (%)", names.arg=c("Incorrect", "Correct"), xlab="Phasing of the singleton", main="d. Singleton phasing using SHAPEIT5\nValidation from Trios\nWGS data - N=147,754")
abline(h=50)
text(myBar, Y+2 , paste("n=", c(SER[2, 2], SER[2,3]-SER[2,2]), sep="") ,cex=1)
legend("topleft", bty="n", legend="binomial p-value\n< 2.2e-16")
dev.off()
###### COLORS ######
setwd('~/Dropbox/SHAPEIT5/Figures/suppFigure5_switchTriosOnly/')
library(RColorBrewer)
COLpair = brewer.pal(12,"Paired")
COLdiff = brewer.pal(8,"Set1")
REG=read.table("../../Data/WGS/chr20.size4Mb.txt", head=FALSE)
BIN=c(0,1,5,10,20,50,100,200,500,1000,2000,5000,10000,20000,50000,100000, 100000000)
lBIN=c("singleton","2-5","6-10","11-20","21-50","51-100","101-200","201-500","501-1k","1k-2k","2k-5k","5k-10k","10k-20k","20k-50k","50k-100k", "100k+")
nREG=length(REG)
nBIN=length(BIN)
options(scipen=999)
freqSER0 <- function (prefix, suffix, n) {
A = rep(0, n)
B = rep(0, n)
C = 1:n
for (r in 1:nrow(REG)) {
fname=paste(prefix, REG$V4[r], suffix, sep="")
DATA = read.table(fname, head=FALSE)
cat (fname, "\n")
for (l in 1:nrow(DATA)) {
A[DATA$V1[l]] = A[DATA$V1[l]] + DATA$V2[l]
B[DATA$V1[l]] = B[DATA$V1[l]] + DATA$V3[l]
}
}
bins=cut(C, breaks=BIN, labels=1:(nBIN-1))
merged = cbind(A, B, bins)
aggrE = as.vector(by(merged[, 1], merged[, 3], sum))
aggrT = as.vector(by(merged[, 2], merged[, 3], sum))
ser = aggrE*100/aggrT
return (ser);
}
#LOAD SER BETWEEN ALL VARIANTS [ALWAYS *.fqt.* files here!!!!]
prefix="../../Data/WGS/Beagle5.4/N147754/benchmark_ukb23352_c20_qc_v1.subset.N147754.fullchr.";
suffix=".fqt.frequency.switch.txt.gz"
BGLser0=freqSER0(prefix, suffix, 147754);
prefix="../../Data/WGS/Shapeit5/N147754/benchmark_ukb23352_c20_qc_v1.subset.N147754.fullchr.shapeit5.ligated.";
suffix=".fqt.frequency.switch.txt.gz"
SHPser0=freqSER0(prefix, suffix, 147754);
X=1:(nBIN-1)
ser_bin=c(0.1, 0.2,0.5,1.0,2.0,5.0,10.0,20.0,50.0)
par(mfrow=c(1,2))
plot(X, log10(BGLser0), type="n", pch=20, xlab="Minor Allele Count", ylab="Switch Error Rate (%)", col="black", lwd=2, xaxt='n', yaxt='n', main=paste("A. Switch Error Rate\n[N=147,754]", sep=""))
abline(h=log10(ser_bin), col="lightgrey", lty=2)
abline(v=X, col="lightgrey", lty=2)
text(X, par("usr")[3], labels = lBIN[X], srt = 45, adj = c(1.1,1.1), xpd = TRUE, cex=0.75)
axis(2, at=log10(ser_bin), label=ser_bin, las=2)
points(X, log10(BGLser0), type="o", pch=20, col="black", lwd=2, xaxt='n', yaxt='n')
points(X, log10(SHPser0), type="o", pch=20, col=COLdiff[2], lwd=2)
legend("topright", fill=c(COLdiff[2], "black"), legend=c("SHAPEIT5", "Beagle5.4"), bg="white")
plot(X, (SHPser0 - BGLser0) * 100.0 / BGLser0, type="n", pch=20, xlab="Minor Allele Count", ylab = "Switch Error Rate Reduction (%)", col="black", lwd=2, xaxt='n', ylim=c(-65, 10), main=paste("B. SER reduction\n[N=147,754]", sep=""), yaxt='n')
abline(h=seq(-50, 50, 5), col="lightgrey", lty=2)
abline(v=X, col="lightgrey", lty=2)
text(X, par("usr")[3], labels = lBIN[X], srt = 45, adj = c(1.1,1.1), xpd = TRUE, cex=0.75)
axis(2, at=seq(-70,10,10), label=seq(-70,10,10), las=2)
points(X, (SHPser0 - BGLser0) * 100.0 / BGLser0, type="o", pch=20, col=COLdiff[2], lwd=2)
abline(h=0, col="black")
abline(h=seq(-70, 50, 5), col="lightgrey", lty=2)
pdf("figure.pdf", 10,5)
X=1:(nBIN-1)
ser_bin=c(0.1, 0.2,0.5,1.0,2.0,5.0,10.0,20.0,50.0)
par(mfrow=c(1,2))
plot(X, log10(BGLser0), type="n", pch=20, xlab="Minor Allele Count", ylab="Switch Error Rate (%)", col="black", lwd=2, xaxt='n', yaxt='n', main=paste("A. Switch Error Rate\n[N=147,754]", sep=""))
abline(h=log10(ser_bin), col="lightgrey", lty=2)
abline(v=X, col="lightgrey", lty=2)
text(X, par("usr")[3], labels = lBIN[X], srt = 45, adj = c(1.1,1.1), xpd = TRUE, cex=0.75)
axis(2, at=log10(ser_bin), label=ser_bin, las=2)
points(X, log10(BGLser0), type="o", pch=20, col="black", lwd=2, xaxt='n', yaxt='n')
points(X, log10(SHPser0), type="o", pch=20, col=COLdiff[2], lwd=2)
legend("topright", fill=c(COLdiff[2], "black"), legend=c("SHAPEIT5", "Beagle5.4"), bg="white")
plot(X, (SHPser0 - BGLser0) * 100.0 / BGLser0, type="n", pch=20, xlab="Minor Allele Count", ylab = "Switch Error Rate Reduction (%)", col="black", lwd=2, xaxt='n', ylim=c(-65, 10), main=paste("B. SER reduction\n[N=147,754]", sep=""), yaxt='n')
abline(h=seq(-70, 50, 5), col="lightgrey", lty=2)
abline(v=X, col="lightgrey", lty=2)
text(X, par("usr")[3], labels = lBIN[X], srt = 45, adj = c(1.1,1.1), xpd = TRUE, cex=0.75)
axis(2, at=seq(-70,10,10), label=seq(-70,10,10), las=2)
points(X, (SHPser0 - BGLser0) * 100.0 / BGLser0, type="o", pch=20, col=COLdiff[2], lwd=2)
abline(h=0, col="black")
dev.off()
