#!/usr/bin/env Rscript

# QC_practical_BoulderWorkshop2023.R

# Visualise and summarize PLINK output in the Quality Control practical.


# Check your working directory
getwd()

# If not there already, set your working directory to be the folder for this practical

setwd('/home/[INSERT USERNAME HERE]/2023/QC') ## Edit [INSERT USERNAME HERE] to the workshop login

dir() ## check the contents of the directory


## ==== Practical - R plotting Outline ====

#  2.3 Plotting X chromosome heterozygosity in R

#  3.2 Plotting sample call rate in R

#  4.2 Plot SNP call rate in R 

#  4.6 Plot the sample allele frequency spectrum in R

#  5.2 Plotting pre and post QC call rates in R

#  12.3 Plot PCA eigenvectors in R

## ==== END Practical - R plotting Outline ====


#  2.3 Plotting X chromosome heterozygosity in R

# Check the heterozygosity of chromosome X

# read in PLINK sexcheck file
sex <- read.table("sex.sexcheck", header=T)

# view contents
head(sex)

# Histogram
hist(sex$F, xlab="Probablity chrX alleles are from a single ancestor", main = "chrX Heterozygosity")

# Boxplot split by reported sex
boxplot(split(sex$F, sex$PEDSEX))


#  3.2 Plotting sample call rate in R

# Sample Call Rate: individuals missing genotyped data

# read in PLINK imiss file
missI <- read.table(file="miss.imiss", header=TRUE)

# view contents
head(missI)

# Histogram of F_MISS
h1 = hist(missI$F_MISS, breaks=seq(0,.1,.005))
c1 = ifelse(h1$breaks<0.05, "grey", "red")[-length(h1$breaks)]
plot(h1, col=c1, xlab="Proportion of missing data: individuals", main ='Sample Call Rate')

# look at highest missing SNP frequency
max(missI$F_MISS)

# Look at top 20 samples with the highest missing SNP frequency
head(missI[order(missI$F_MISS,decreasing=T),],20)

# Check how many individuals have a call rate less than .95 (or 5% missing genetic data)
sum(missI$F_MISS > .05)


#  4.2 Plot SNP call rate in R 

# Genotype Call Rate: SNPs missing information from too many people

# read in PLINK lmiss file
missL<-read.table(file="miss.lmiss", header=TRUE)

# view contents
head(missL)

# Histogram of F_MISS
h2 = hist(missL$F_MISS, breaks=seq(0,.4,.005))
c2 = ifelse(h2$breaks<0.05, "grey", "red")[-length(h2$breaks)]
plot(h2, col=c2, xlab="Proportion of missing data: SNPs", main ='Genotype Call Rate')

# How many SNPs are missing more than 5% of data
sum(missL$F_MISS > .05)

# NOTE: It is best to be conservative with SNP call rate filtering, 
# as imputation will provide many more SNPs than those lost here.
# Our imputation will be better quality if we restrict to the highest quality SNPs for imputation
# We want to maximize our sample size at this point
# So we will clean out the poor quality SNPs first, and then individuals later


#  4.6 Plot the sample allele frequency spectrum in R

# read in PLINK .frq file
maf <- read.table("maf.frq", header =TRUE)

# view contents
head(maf)

## Histogram
h3 = hist(maf$MAF, breaks=seq(0,.5,.005))
c3 = ifelse(h3$breaks>0.01, "grey", "red")[-length(h3$breaks)]
plot(h3, col=c3, xlab="Minor allele frequency", main ='MAF')

# Number of variants with a MAF < 0.01
sum(maf$MAF < 0.01)
# NOTE this number differs from the cc.qc2 output because we are looking at pre-QC SNPs


#  5.2 Plotting pre and post QC call rates in R

# read in PLINK .imiss file
miss2I<-read.table(file="miss2.imiss", header=TRUE)

# view contents
head(miss2I)

## Histogram
h4 = hist(miss2I$F_MISS, breaks=seq(0,.1,.005))
c4=ifelse(h4$breaks<0.05, "grey", "red")[-length(h4$breaks)]

par(mfrow=c(2,1)) ## Plot both graphs

plot(h4, col=c4, xlab="Proportion of missing data: individuals", main ='AFTER QC')

# Compare with the plot from before cleaning the variants
plot(h1, col=c1, xlab="Proportion of missing data: individuals", main ='BEFORE QC')

par(mfrow=c(1,1)) ## reset plotting

# Check how many individuals have a call rate less than .95 after variant QC (or 5% missing genetic data)
sum(miss2I$F_MISS > .05)


#  12.3 Plot PCA eigenvectors in R

pca.cluster = read.table("pca.eigenvec", header=F, stringsAsFactors = F)
names(pca.cluster) <- c('FID','IID','PC1','PC2','PC3','PC4')

# The script assumes that the ancestry labels in the FID column are always the same.
# ASW   CEU   CHB   CHD   GIH   JPT   LWK  MEX   MKK   TSI   YRI

# Since our FID is all the same for our sample we can check this
table(pca.cluster$FID)

# Assign colours
pca.cluster$color<-"red"
pca.cluster$color[which(pca.cluster$FID=="ASW")]<-"darkolivegreen"
pca.cluster$color[which(pca.cluster$FID=="CEU")]<-"lightblue"
pca.cluster$color[which(pca.cluster$FID=="CHB")]<-"brown"
pca.cluster$color[which(pca.cluster$FID=="CHD")]<-"orange"
pca.cluster$color[which(pca.cluster$FID=="GIH")]<-"black"
pca.cluster$color[which(pca.cluster$FID=="JPT")]<-"purple"
pca.cluster$color[which(pca.cluster$FID=="LWK")]<-"magenta"
pca.cluster$color[which(pca.cluster$FID=="MEX")]<-"grey50"
pca.cluster$color[which(pca.cluster$FID=="MKK")]<-"darkblue"
pca.cluster$color[which(pca.cluster$FID=="TSI")]<-"green"
pca.cluster$color[which(pca.cluster$FID=="YRI")]<-"yellow"

# Different shapes for the points
pca.cluster$pchtype<-16
pca.cluster$pchtype[which(pca.cluster$color=="red")]<-19

pca.cluster.sample<-pca.cluster[which(pca.cluster$color=="red"),]

# Plot points
plot(pca.cluster$PC1, pca.cluster$PC2, col=pca.cluster$color, xlab="Dimension 1", ylab="Dimension 2",pch=pca.cluster$pchtype,cex=0.5)

# Add legend
legend("topleft", c("CC dataset", "CEU", "CHB", "YRI", "TSI", "JPT", "CHD", "MEX", "GIH", "ASW","LWK", "MKK"), fill=c("red", "lightblue", "brown", "yellow", "green", "purple", "orange", "grey50", "black", "darkolivegreen", "magenta", "darkblue"),cex=0.5)


## Now try plotting PCs 3 and 4!