#!/usr/bin/env Rscript # QC_practical_BoulderWorkshop2023.R # Visualise and summarize PLINK output in the Quality Control practical. # Check your working directory getwd() # If not there already, set your working directory to be the folder for this practical setwd('/home/[INSERT USERNAME HERE]/2023/QC') ## Edit [INSERT USERNAME HERE] to the workshop login dir() ## check the contents of the directory ## ==== Practical - R plotting Outline ==== # 2.3 Plotting X chromosome heterozygosity in R # 3.2 Plotting sample call rate in R # 4.2 Plot SNP call rate in R # 4.6 Plot the sample allele frequency spectrum in R # 5.2 Plotting pre and post QC call rates in R # 12.3 Plot PCA eigenvectors in R ## ==== END Practical - R plotting Outline ==== # 2.3 Plotting X chromosome heterozygosity in R # Check the heterozygosity of chromosome X # read in PLINK sexcheck file sex <- read.table("sex.sexcheck", header=T) # view contents head(sex) # Histogram hist(sex$F, xlab="Probablity chrX alleles are from a single ancestor", main = "chrX Heterozygosity") # Boxplot split by reported sex boxplot(split(sex$F, sex$PEDSEX)) # 3.2 Plotting sample call rate in R # Sample Call Rate: individuals missing genotyped data # read in PLINK imiss file missI <- read.table(file="miss.imiss", header=TRUE) # view contents head(missI) # Histogram of F_MISS h1 = hist(missI$F_MISS, breaks=seq(0,.1,.005)) c1 = ifelse(h1$breaks<0.05, "grey", "red")[-length(h1$breaks)] plot(h1, col=c1, xlab="Proportion of missing data: individuals", main ='Sample Call Rate') # look at highest missing SNP frequency max(missI$F_MISS) # Look at top 20 samples with the highest missing SNP frequency head(missI[order(missI$F_MISS,decreasing=T),],20) # Check how many individuals have a call rate less than .95 (or 5% missing genetic data) sum(missI$F_MISS > .05) # 4.2 Plot SNP call rate in R # Genotype Call Rate: SNPs missing information from too many people # read in PLINK lmiss file missL<-read.table(file="miss.lmiss", header=TRUE) # view contents head(missL) # Histogram of F_MISS h2 = hist(missL$F_MISS, breaks=seq(0,.4,.005)) c2 = ifelse(h2$breaks<0.05, "grey", "red")[-length(h2$breaks)] plot(h2, col=c2, xlab="Proportion of missing data: SNPs", main ='Genotype Call Rate') # How many SNPs are missing more than 5% of data sum(missL$F_MISS > .05) # NOTE: It is best to be conservative with SNP call rate filtering, # as imputation will provide many more SNPs than those lost here. # Our imputation will be better quality if we restrict to the highest quality SNPs for imputation # We want to maximize our sample size at this point # So we will clean out the poor quality SNPs first, and then individuals later # 4.6 Plot the sample allele frequency spectrum in R # read in PLINK .frq file maf <- read.table("maf.frq", header =TRUE) # view contents head(maf) ## Histogram h3 = hist(maf$MAF, breaks=seq(0,.5,.005)) c3 = ifelse(h3$breaks>0.01, "grey", "red")[-length(h3$breaks)] plot(h3, col=c3, xlab="Minor allele frequency", main ='MAF') # Number of variants with a MAF < 0.01 sum(maf$MAF < 0.01) # NOTE this number differs from the cc.qc2 output because we are looking at pre-QC SNPs # 5.2 Plotting pre and post QC call rates in R # read in PLINK .imiss file miss2I<-read.table(file="miss2.imiss", header=TRUE) # view contents head(miss2I) ## Histogram h4 = hist(miss2I$F_MISS, breaks=seq(0,.1,.005)) c4=ifelse(h4$breaks<0.05, "grey", "red")[-length(h4$breaks)] par(mfrow=c(2,1)) ## Plot both graphs plot(h4, col=c4, xlab="Proportion of missing data: individuals", main ='AFTER QC') # Compare with the plot from before cleaning the variants plot(h1, col=c1, xlab="Proportion of missing data: individuals", main ='BEFORE QC') par(mfrow=c(1,1)) ## reset plotting # Check how many individuals have a call rate less than .95 after variant QC (or 5% missing genetic data) sum(miss2I$F_MISS > .05) # 12.3 Plot PCA eigenvectors in R pca.cluster = read.table("pca.eigenvec", header=F, stringsAsFactors = F) names(pca.cluster) <- c('FID','IID','PC1','PC2','PC3','PC4') # The script assumes that the ancestry labels in the FID column are always the same. # ASW CEU CHB CHD GIH JPT LWK MEX MKK TSI YRI # Since our FID is all the same for our sample we can check this table(pca.cluster$FID) # Assign colours pca.cluster$color<-"red" pca.cluster$color[which(pca.cluster$FID=="ASW")]<-"darkolivegreen" pca.cluster$color[which(pca.cluster$FID=="CEU")]<-"lightblue" pca.cluster$color[which(pca.cluster$FID=="CHB")]<-"brown" pca.cluster$color[which(pca.cluster$FID=="CHD")]<-"orange" pca.cluster$color[which(pca.cluster$FID=="GIH")]<-"black" pca.cluster$color[which(pca.cluster$FID=="JPT")]<-"purple" pca.cluster$color[which(pca.cluster$FID=="LWK")]<-"magenta" pca.cluster$color[which(pca.cluster$FID=="MEX")]<-"grey50" pca.cluster$color[which(pca.cluster$FID=="MKK")]<-"darkblue" pca.cluster$color[which(pca.cluster$FID=="TSI")]<-"green" pca.cluster$color[which(pca.cluster$FID=="YRI")]<-"yellow" # Different shapes for the points pca.cluster$pchtype<-16 pca.cluster$pchtype[which(pca.cluster$color=="red")]<-19 pca.cluster.sample<-pca.cluster[which(pca.cluster$color=="red"),] # Plot points plot(pca.cluster$PC1, pca.cluster$PC2, col=pca.cluster$color, xlab="Dimension 1", ylab="Dimension 2",pch=pca.cluster$pchtype,cex=0.5) # Add legend legend("topleft", c("CC dataset", "CEU", "CHB", "YRI", "TSI", "JPT", "CHD", "MEX", "GIH", "ASW","LWK", "MKK"), fill=c("red", "lightblue", "brown", "yellow", "green", "purple", "orange", "grey50", "black", "darkolivegreen", "magenta", "darkblue"),cex=0.5) ## Now try plotting PCs 3 and 4!