## PLINK manual editing tutorial script

# manual editing: manipulating PLINK files outside of PLINK
#  - Best avoided if PLINK can run the operation

## Contents:
## 1 - Manually updating .fam file
## 2 - Manually updating .bim file

## Files available at: /web/personal/howrigan/tutorial


## 1 - Manually updating .fam file


## Creating a typical VCF ID from the genotyping platform

## creating hypothetical chipwell identifiers from a genotyping plate
## using seq() to create a sequence, sprintf() to have leading zeros, and paste0() to combine into a single character
rows <- paste0('R',sprintf("%02d",seq(1,12,1)))
cols <- paste0('C',sprintf("%02d",seq(1,8,1)))

## create empty individual ID variable
IID <- c()
## create counter to accumulate through multiple for() loops
counter <- 1

## loop through rows object
for (a in 1:length(rows)) {
    ## loop through column object
    for (b in 1:length(cols)) {
      ##  combine row and column character
      IID[counter] <- paste0(rows[a],cols[b])
      ## add to counter
      counter <- counter + 1
  } # b loop
} # a loop

## add an additional barcode identifier to the "chipwell"
IID <- paste0(IID,'_1000',seq(1,length(IID),1))


## Now create a .fam file that PLINK would create from a VCF
## create zero vector the length of IID
zeros <- rep(0,length(IID))
## create -9 vector (PLINK recognizes -9 as missing) the length of IID
negnine <- rep(-9,length(IID))

## combine vectors into a data frame resembling a PLINK .fam file 
fam <- cbind.data.frame(zeros,IID,zeros,zeros,zeros,negnine)
## create default column names when R reads in a file with no header (V1, V2, etc..)
names(fam) <- paste0('V',seq(1,6,1))


## create phenotype file with linked IDs

chipwell_barcode <- IID
PT_ID <- paste0('PT-000',seq(1,length(IID),1))
SM_ID <- paste0('SM-000',seq(1,length(IID),1))
collaborator_sample_ID <- paste0('sample',seq(1,length(IID),1))

indx <- rbinom(length(IID),1,0.5)
disease <- rep('case',length(IID))
disease[indx==0] <- 'control'

indx <- rbinom(length(IID),1,0.5)
sex <- rep('male',length(IID))
sex[indx==0] <- 'female'

phe <- cbind.data.frame(PT_ID,SM_ID,collaborator_sample_ID,chipwell_barcode,disease,sex)


## write fam and phe data frames to file
write.table(fam,"test.fam",col=F,row=F,quo=F,sep='\t')
write.table(phe,"test.phe",col=T,row=F,quo=F,sep='\t')

## make a copy of original fam file using system() to run on command line
system('cp test.fam test.vcf_conversion.fam')

## read in fam and phe data frames
fam <- read.table('test.fam',stringsAsFactors = F)
phe <- read.table('test.phe',h=T,stringsAsFactors = F)

## Update fam file

## IMPORTANT: create order column to preserve order of .fam file
fam$order <- seq(1,nrow(fam),1)

## merge by chipwell_barcode column (V2 in .fam file, chipwell_barcode in .phe file)
mrg <- merge(fam,phe,by.x='V2',by.y='chipwell_barcode',all=T)

## here is a good spot to check if any IDs are present only in one file and not the other

## update sex (1=male; 2=female; other=unknown)
mrg$sex2 <- 0
mrg$sex2[mrg$sex=='male'] <- 1
mrg$sex2[mrg$sex=='female'] <- 2

## update affection status (0=missing,1=unaffected, 2=affected, -9=missing)
mrg$aff <- 0
mrg$aff[mrg$disease=='control'] <- 1
mrg$aff[mrg$disease=='case'] <- 2

## re-order with order() to maintain original .fam file order
mrg <- mrg[order(mrg$order),]

## select columns to create updated .fam file
fam2 <- mrg[,c('collaborator_sample_ID','collaborator_sample_ID','V3','V4','sex2','aff')]

## write updated .fam file
write.table(fam2,"test2.fam",col=F,row=F,quo=F,sep='\t')


## 2 - Manually updating .bim file

## GOAL:
## - find shared SNPs based on position
## - change SNP IDs to successfully merge data
## - write out shared SNP IDs

## On home computer:
setwd('/Users/daniel/partners_dropbox/pgc_stage1_analysis/howrigan/files')
## On Broad:
setwd('/web/personal/howrigan/tutorial')

## Load data.table package to read in larger files faster
## use install.packages(”data.table”) to install
library(data.table)

## Read in .map files to compare
kg <- fread('pop_4pop_mix_SEQ_hg19_first50k.bim')
ff <- fread('affy_exomechip_hg19_first5k.bim')

## create positional IDs (without SNP ID)
kga <- paste0(kg$V1,'_',kg$V4,'_',kg$V5,'_',kg$V6)

## Use both A1/A2 and A2/A1 IDs
ffa <- paste0(ff$V1,'_',ff$V4,'_',ff$V5,'_',ff$V6)
ffb <- paste0(ff$V1,'_',ff$V4,'_',ff$V6,'_',ff$V5)
## combine both IDs
ffc <- c(ffa,ffb)

## See how many positional matches we can find
sum(kga %in% ffa) ## 873
sum(kga %in% ffb) ## 49
sum(kga %in% ffc) ## 922

## add positional IDs to .bim files (faster than appending to data frame)
kg$kg_pos <- kga
ff$ff_pos1 <- ffa
ff$ff_pos2 <- ffb

## match positional IDs of 1KG to affy and provide rsID
indx1 <- kg$V2[match(ff$ff_pos1,kg$kg_pos)]
indx2 <- kg$V2[match(ff$ff_pos2,kg$kg_pos)]
## start all as '.'
ff$rsID <- '.'
## insert only non-NA values from indx to affy rsID
ff$rsID[!is.na(indx1)] <- indx1[!is.na(indx1)]
ff$rsID[!is.na(indx2)] <- indx2[!is.na(indx2)]

## create new .bim file
ff2 <- ff[,c('V1','rsID','V3','V4','V5','V6')]

head(ff2)

## create SNP Map Key
overlap_key <- subset(ff[,c('V2','rsID')],ff$rsID != '.')
head(overlap_key)

## Options moving forward:
## - --keep and --update-id in PLINK
## - overwrite .bim file directly