## PLINK manual editing tutorial script # manual editing: manipulating PLINK files outside of PLINK # - Best avoided if PLINK can run the operation ## Contents: ## 1 - Manually updating .fam file ## 2 - Manually updating .bim file ## Files available at: /web/personal/howrigan/tutorial ## 1 - Manually updating .fam file ## Creating a typical VCF ID from the genotyping platform ## creating hypothetical chipwell identifiers from a genotyping plate ## using seq() to create a sequence, sprintf() to have leading zeros, and paste0() to combine into a single character rows <- paste0('R',sprintf("%02d",seq(1,12,1))) cols <- paste0('C',sprintf("%02d",seq(1,8,1))) ## create empty individual ID variable IID <- c() ## create counter to accumulate through multiple for() loops counter <- 1 ## loop through rows object for (a in 1:length(rows)) { ## loop through column object for (b in 1:length(cols)) { ## combine row and column character IID[counter] <- paste0(rows[a],cols[b]) ## add to counter counter <- counter + 1 } # b loop } # a loop ## add an additional barcode identifier to the "chipwell" IID <- paste0(IID,'_1000',seq(1,length(IID),1)) ## Now create a .fam file that PLINK would create from a VCF ## create zero vector the length of IID zeros <- rep(0,length(IID)) ## create -9 vector (PLINK recognizes -9 as missing) the length of IID negnine <- rep(-9,length(IID)) ## combine vectors into a data frame resembling a PLINK .fam file fam <- cbind.data.frame(zeros,IID,zeros,zeros,zeros,negnine) ## create default column names when R reads in a file with no header (V1, V2, etc..) names(fam) <- paste0('V',seq(1,6,1)) ## create phenotype file with linked IDs chipwell_barcode <- IID PT_ID <- paste0('PT-000',seq(1,length(IID),1)) SM_ID <- paste0('SM-000',seq(1,length(IID),1)) collaborator_sample_ID <- paste0('sample',seq(1,length(IID),1)) indx <- rbinom(length(IID),1,0.5) disease <- rep('case',length(IID)) disease[indx==0] <- 'control' indx <- rbinom(length(IID),1,0.5) sex <- rep('male',length(IID)) sex[indx==0] <- 'female' phe <- cbind.data.frame(PT_ID,SM_ID,collaborator_sample_ID,chipwell_barcode,disease,sex) ## write fam and phe data frames to file write.table(fam,"test.fam",col=F,row=F,quo=F,sep='\t') write.table(phe,"test.phe",col=T,row=F,quo=F,sep='\t') ## make a copy of original fam file using system() to run on command line system('cp test.fam test.vcf_conversion.fam') ## read in fam and phe data frames fam <- read.table('test.fam',stringsAsFactors = F) phe <- read.table('test.phe',h=T,stringsAsFactors = F) ## Update fam file ## IMPORTANT: create order column to preserve order of .fam file fam$order <- seq(1,nrow(fam),1) ## merge by chipwell_barcode column (V2 in .fam file, chipwell_barcode in .phe file) mrg <- merge(fam,phe,by.x='V2',by.y='chipwell_barcode',all=T) ## here is a good spot to check if any IDs are present only in one file and not the other ## update sex (1=male; 2=female; other=unknown) mrg$sex2 <- 0 mrg$sex2[mrg$sex=='male'] <- 1 mrg$sex2[mrg$sex=='female'] <- 2 ## update affection status (0=missing,1=unaffected, 2=affected, -9=missing) mrg$aff <- 0 mrg$aff[mrg$disease=='control'] <- 1 mrg$aff[mrg$disease=='case'] <- 2 ## re-order with order() to maintain original .fam file order mrg <- mrg[order(mrg$order),] ## select columns to create updated .fam file fam2 <- mrg[,c('collaborator_sample_ID','collaborator_sample_ID','V3','V4','sex2','aff')] ## write updated .fam file write.table(fam2,"test2.fam",col=F,row=F,quo=F,sep='\t') ## 2 - Manually updating .bim file ## GOAL: ## - find shared SNPs based on position ## - change SNP IDs to successfully merge data ## - write out shared SNP IDs ## On home computer: setwd('/Users/daniel/partners_dropbox/pgc_stage1_analysis/howrigan/files') ## On Broad: setwd('/web/personal/howrigan/tutorial') ## Load data.table package to read in larger files faster ## use install.packages(”data.table”) to install library(data.table) ## Read in .map files to compare kg <- fread('pop_4pop_mix_SEQ_hg19_first50k.bim') ff <- fread('affy_exomechip_hg19_first5k.bim') ## create positional IDs (without SNP ID) kga <- paste0(kg$V1,'_',kg$V4,'_',kg$V5,'_',kg$V6) ## Use both A1/A2 and A2/A1 IDs ffa <- paste0(ff$V1,'_',ff$V4,'_',ff$V5,'_',ff$V6) ffb <- paste0(ff$V1,'_',ff$V4,'_',ff$V6,'_',ff$V5) ## combine both IDs ffc <- c(ffa,ffb) ## See how many positional matches we can find sum(kga %in% ffa) ## 873 sum(kga %in% ffb) ## 49 sum(kga %in% ffc) ## 922 ## add positional IDs to .bim files (faster than appending to data frame) kg$kg_pos <- kga ff$ff_pos1 <- ffa ff$ff_pos2 <- ffb ## match positional IDs of 1KG to affy and provide rsID indx1 <- kg$V2[match(ff$ff_pos1,kg$kg_pos)] indx2 <- kg$V2[match(ff$ff_pos2,kg$kg_pos)] ## start all as '.' ff$rsID <- '.' ## insert only non-NA values from indx to affy rsID ff$rsID[!is.na(indx1)] <- indx1[!is.na(indx1)] ff$rsID[!is.na(indx2)] <- indx2[!is.na(indx2)] ## create new .bim file ff2 <- ff[,c('V1','rsID','V3','V4','V5','V6')] head(ff2) ## create SNP Map Key overlap_key <- subset(ff[,c('V2','rsID')],ff$rsID != '.') head(overlap_key) ## Options moving forward: ## - --keep and --update-id in PLINK ## - overwrite .bim file directly