#!/usr/bin/perl use strict; ################################################### # # july 1st 2017: links updated # # this script was created to allow checking of overlap individuals of GWAS chips # without sharing of actual genotypes # the original script was written from Sarah Medland (ENIMGMA and Ben Neale (PGC) # after an idea from Stephan Ripke # see also http://www.ncbi.nlm.nih.gov/pubmed/22302573?dopt=Abstract # # for internal checks, see /home/radon01/sripke/bakker_ripke/hapmap_ref/cksum_files/0114a/README # ################################################ #################################### ## todos ## ## ## check if all SNPs are present in dataset -> done ## check if cksum is in path -> done ## ## check if SNP names can be recovered: e.g. exm-rs12182351, also newrs ## list the missing SNPs in a separate file if missing batches my $batcher_name = ""; my $ref_file = ""; #my $ref_file = "check_scz.bip.mdd.300K.migr.3ver.32eni.0314a.ref.bim"; my $batch_templ = ""; #my $batch_templ = "check_scz.bip.mdd.300K.migr.3ver.32eni.0314a.full_overlap.bXXX"; ## todo: maybe include dup check in positional translation, so that dups don't get translated ################################################### ### system call with test if successfull ################################################### sub mysystem(){ my ($systemstr)="@_"; system($systemstr); my $status = ($? >> 8); die "Error($status): system call failed:\n$systemstr\n" if ($status != 0); } ################################################### ### system call with test if successfull ################################################### sub mysystem_nodie(){ my ($systemstr)="@_"; system($systemstr); my $status = ($? >> 8); $status; } ########################################## # subroutine to split a plink-output-line with references ########################################## sub split_line_ref { my ($line)=${$_[0]}; chomp($line); $line =~ s/^[\s]+//g; my @cols= split /\s+/, $line; \@cols; } #my $sourcedir = ""; my $outname = ""; my $error = 0; my $nbatch = 10; use Getopt::Long; GetOptions( "bfile=s"=> \my $bfile, # "source=s"=> \$sourcedir, "name=s"=> \$outname, "keep=s"=> \my $keepf, "ploc=s"=> \my $ploc, "force"=> \my $force, "ref_file=s"=> \$ref_file, # "batch_templ=s"=> \$batch_templ, "batcher_name=s"=> \$batcher_name, # "nbatch=i"=> \$nbatch, "help"=> \my $help, ); if ($batcher_name eq "") { print "------------------------------------------------------------------------\n"; print "Error: option --batcher_name is mandatory\n"; print " check online resources or contact sripke\@broadinstitute.org\n"; print " see --help \n"; print "------------------------------------------------------------------------\n"; unless ($help) { die; } $error = 1; } my $batcher_filename = "$batcher_name.tar.gz"; my $plink_bin = "plink"; ### try to start plink #my $test_plink = "plink --noweb >/dev/null 2>&1"; my $test_plink = "plink --help >/dev/null 2>&1"; system($test_plink); my $status = ($? >> 8); if ($status != 0) { print "...... cannot find plink in path ($status), test options...\n"; my $test_plink = "$ploc/plink --help >/dev/null 2>&1"; system($test_plink); my $status = ($? >> 8); if ($status != 0) { print "------------------------------------------------------------------------\n"; print "Error: no plink binary found please give location via --ploc, see --help\n"; print " here is a plink binary at broad: --ploc /home/unix/sripke/plink_src/src/\n"; print " plink2 has been succesfully tested here: --ploc /home/unix/sripke/plink_src/plink_1.9_newest/\n"; print "------------------------------------------------------------------------\n"; unless ($help) { die; } $error = 1; } else { print "...... plink binary found in specified location\n"; if (-e "plink.log") { &mysystem("rm plink.log"); } $plink_bin = "$ploc/plink"; } } my $test_cksum = "echo l | cksum > /dev/null 2>&1"; system($test_cksum); my $status = ($? >> 8); if ($status != 0) { print "------------------------------------------------------------------------\n"; print "Error: program not found\n"; print " this is highly unusual on Unix platforms, please contact sripke\@broadinstitute.org \n"; print " see --help \n"; print "------------------------------------------------------------------------\n"; unless ($help) { die; } $error = 1; } my $test_wget = "wget --help > /dev/null 2>&1"; system($test_wget); my $status = ($? >> 8); if ($status != 0) { print "------------------------------------------------------------------------\n"; print "Error: program not found\n"; print " this is highly unusual on Unix platforms, please contact sripke\@broadinstitute.org \n"; print " see --help \n"; print "------------------------------------------------------------------------\n"; unless ($help) { die; } $error = 1; } my $test_tar = "tar --help > /dev/null 2>&1"; system($test_tar); my $status = ($? >> 8); if ($status != 0) { print "------------------------------------------------------------------------\n"; print "Error: program not found\n"; print " this is highly unusual on Unix platforms, plesase contact sripke\@broadinstitute.org \n"; print " see --help \n"; print "------------------------------------------------------------------------\n"; unless ($help) { die; } $error = 1; } #print "deb\n"; #exit; if ($help || $error){ print " ###################################################################################### # # id_geno_checksum # # this script allows checking of individuals of GWAS chips without sharing of actual genotypes # the precursor shell script was written from Sarah Medland (ENIMGMA) and Ben Neale (PGC) # after an idea from Stephan Ripke # # this perl script has been written by Stephan Ripke, # many thanks to Menachem Fromer for intellectual help # # it uses 10 btaches with 50 SNPs each, which means 3^50 = 7.1e23 different outcomes with genotypes 0,1,2 at each SNP # the somehow outdated cksum command is used on purpose since it restricts to 32 bit differnt cksums: 2^32 = 4.3e10 # this means for each checksum there are ~1.0e23 differnt genotype configurations, so the cksum does not provide a single key to the genotype configuration # still the number of different cksums is high enough that a random false positive overlap seems unlikely # # the script takes care of: # - strandflips # - LD duplication # - positional duplication # # as a published alternative consider also: # see also http://www.ncbi.nlm.nih.gov/pubmed/22302573?dopt=Abstract ###################################################################################### usage: $0 --bfile BFILE options: mandatory: --bfile STRING plink binary to check (root without bed/bim/fam) optional --source STRING directory for the location of the helper-files, default is current working directory if the files are not found they will be downloaded from the web --name STRING name for this run of checksums, be default the bfile --keep STRING specify keep file for individuals to keep --ploc STRING directory that contains plink-binary on broad: /home/unix/sripke/plink_src/src/ script has been successfully tested with plink2 (https://www.cog-genomics.org/plink2/) on broad: /home/unix/sripke/plink_src/plink_1.9_newest/ --help this help message options for other batch-versions (pro-use): --ref_file STRING reference file, default: $ref_file --batch_templ STRING batch_template, default: $batch_templ --batcher_name STRING batcher name, default: $batcher_name --nbatch INT number of batches, default: $nbatch for download of helper files use these two commands (will be done automatically by the script) - wget https://personal.broadinstitute.org/sripke/share_links/checksums_download/$batcher_filename - tar -xvf $batcher_filename --------------------------------------------------- created by Stephan Ripke 2014 at MGH, Boston, MA feel free to contact me: sripke (at) broadinstitute (dot) org -------------------------------------------------- \n"; exit 2; } use Cwd; use File::Path; print "checking presence of all necessary files\n"; unless ($bfile) { print "------------------------------------------------------------------------\n"; print "Error: no plink dataset specified, see --help\n"; print "------------------------------------------------------------------------\n"; die; } unless (-e "$bfile.bed" || -e "$bfile.bim" || -e "$bfile.fam") { print "------------------------------------------------------------------------\n"; print "Error: plink dataset (bed/bim/fam) not found, see --help\n"; print "------------------------------------------------------------------------\n"; die; } my $rootdir = &Cwd::cwd(); #if ($sourcedir eq "") { # $sourcedir = "$rootdir"; #} ################################################## ### create refdir #################################################### my $refdir = "$rootdir/ref.$batcher_name"; unless (-d $refdir) { print "------------------------------\n"; print "refdir not existing: $refdir\n"; print "--- assuming reference data ($batcher_filename) being here: https://personal.broadinstitute.org/sripke/share_links/checksums_download/$batcher_filename\n"; print "--- please check in your browser, that this is indeed the case\n"; print "--- do you want the subdirectory created and the reference files being downloaded (Y/N)? ---\n"; # print ""; while (1) { my $answer = lc <>; chomp $answer; if ($answer eq "y") { print "ok, will do\n"; my @created = mkpath( ## $created ? "$refdir", {verbose => 0, mode => 0750}, ); chdir ($refdir); &mysystem ("wget https://personal.broadinstitute.org/sripke/share_links/checksums_download/$batcher_filename "); &mysystem ("tar -xvf $batcher_filename "); chdir ($rootdir); last; } else { print "exiting now...\n"; exit; } } # print "debug stop\n"; # exit; } ## read naming file die "$refdir/cs.names: ".$! unless open NA, "< $refdir/cs.names"; my %names_hash; while (my $line = ){ my @cells = @{&split_line_ref(\$line)}; $names_hash{$cells[0]} = $cells[1]; } close NA; $batch_templ = $names_hash{"bat"}.".bXXX"; $nbatch = $names_hash{"nba"}; $ref_file = $names_hash{"ref"}; my $du_file = "$refdir/".$names_hash{"dup"}; #my $batch_templ = "XXX_SNPs"; foreach my $i (0..$nbatch-1) { my $batch_file = "$refdir/$batch_templ"; my $i_str = sprintf "%02d",$i; $batch_file =~ s/XXX/$i_str/; unless (-e "$batch_file") { print "------------------------------------------------------------------------\n"; print "$batch_file not found, see --help for download instructions\n"; print " - wget https://personal.broadinstitute.org/sripke/share_links/checksums_download/$batcher_filename \n"; print " - tar -xvf $batcher_filename \n"; print " - then rerun \n"; print "------------------------------------------------------------------------\n"; die; } # unless (-e "$sourcedir/$i"."_Alleles") { # print "------------------------------------------------------------------------\n"; # print "$sourcedir/$i"."_Alleles not found, see --help for download instructions\n"; # print "------------------------------------------------------------------------\n"; # die; # } } #unless (-e "$sourcedir/SNP.list") { # print "------------------------------------------------------------------------\n"; # print "$sourcedir/SNP.list not found, see --help for download instructions\n"; # print "------------------------------------------------------------------------\n"; # die; #} my $ra_file = "$refdir/$ref_file.recode_allele"; unless (-e "$ra_file") { print "------------------------------------------------------------------------\n"; print "Error: $ra_file not found, see --help for download instructions\n"; print "------------------------------------------------------------------------\n"; die; } my $ua_file = "$refdir/$ref_file.update_alleles"; unless (-e "$ua_file") { print "------------------------------------------------------------------------\n"; print "Error: $ua_file not found, see --help for download instructions\n"; print "------------------------------------------------------------------------\n"; die; } #my $du_file = "$refdir/$ref_file"; #$du_file =~ s/.ref.bim$//; #$du_file .= ".dup.short"; unless (-e "$du_file") { print "------------------------------------------------------------------------\n"; print "Error: $du_file not found, see --help for download instructions\n"; print "------------------------------------------------------------------------\n"; die; } my $bfile_short = $bfile; $bfile_short =~ s!.*/!!; if ($bfile ne $bfile_short) { print "------------------------------------------------------------------------\n"; print "Error: --bfile points to different directory, please make sure it is \n"; print " present in current directory (symbolic link is fine)\n"; print "------------------------------------------------------------------------\n"; die; } if ($outname eq "") { $outname = $bfile; } my $workdir = "checkSum_$outname"; while (-e $workdir) { print "-----------------------------------------------------------------------------------------\n"; print "Warning: directory $workdir is already existing, trying extended name $workdir.c, see --help\n"; print "-----------------------------------------------------------------------------------------\n"; $workdir .= ".c"; # unless ($force) { # die; # } } while (-e "CheckSum.$outname.cs") { print "-----------------------------------------------------------------------------------------\n"; print "Warning: result file CheckSum.$outname.cs is already existing, trying extended outname $outname.o\n"; print "-----------------------------------------------------------------------------------------\n"; $outname .= ".o"; # die; } my @created = mkpath( ## $created ? "$workdir", {verbose => 0, mode => 0750}, ); ################################################## ### read snp list into hash #################################################### my $str = 0; my $str_0 = 0; my $ntot = 0; my $nright = 0; #my $sin = 0; if (-e "CheckSum.$bfile.cs") { print "output (CheckSum.$bfile.cs) already existing\n"; print "exit\n"; exit; } unless (-e "$workdir/$bfile.bim.potr") { print "testing dataset format...\n"; ## read ref file die "$refdir/$ref_file: ".$! unless open RF, "< $refdir/$ref_file"; my %ref_hash; my %pos_hash; while (my $line = ){ my @cells = @{&split_line_ref(\$line)}; my $pos = $cells[0]."_".$cells[3]; $pos_hash{$pos} = $cells[1]; $ref_hash{$cells[1]} = 1; } close RF; my %bim_hash; my @bim_arr; die "$bfile.bim: ".$! unless open BIM, "< $bfile.bim"; while (my $line = ){ my @cells = @{&split_line_ref(\$line)}; $bim_hash{$cells[1]} = 1; push @bim_arr, $line; } close BIM; ### pos trans of bim-file #die "$bfile.bim: ".$! unless open BIM, "< $bfile.bim"; die "$workdir/$bfile.bim.potr: ".$! unless open PO, "> $workdir/$bfile.bim.potr"; die "$workdir/$bfile.bim.potr: ".$! unless open PI, "> $workdir/$bfile.bim.potr.info"; print PI "orig\tref\tpos\ttr\n"; foreach my $line (@bim_arr) { #while (my $line = ){ my @cells = @{&split_line_ref(\$line)}; my $pos = $cells[0]."_".$cells[3]; $ntot++; if (exists $pos_hash{$pos}) { if ($pos_hash{$pos} ne $cells[1]) { unless (exists $bim_hash{$pos_hash{$pos}}) { ## test if translated already exists print PI "$cells[1]\t$pos_hash{$pos}\t$pos\t1\n"; $cells[1] = $pos_hash{$pos}; $str++; $bim_hash{$cells[1]} = 1; } else { print PI "$cells[1]\t$pos_hash{$pos}\t$pos\t0\n"; $str_0++; } } else { $nright++; } } # if (exists $ref_hash{$cells[1]}) {# # $sin++; # } print PO "@cells\n"; } #close BIM; close PO; close PI; print "positional translated $str SNPs\n"; ### into log-file print "positional not_translated $str_0 SNPs\n"; ### into log-file print "$nright SNPs with matching position\n"; ### into log-file print "out of a total of $ntot SNPs\n"; ### into log-file } my $take_potr = 0; if ($nright > $ntot/2) { print "keep positional translated SNPs\n"; $take_potr = 1; } else { print "will not keep positional translated SNPs, positions are not trustworthy\n"; } #exit; ## read recode_reference die "$ra_file: ".$! unless open RA, "< $ra_file"; my %recode_hash; while (my $line = ){ my @cells = @{&split_line_ref(\$line)}; $recode_hash{$cells[0]} = $cells[1]; } close RA; ## test how many SNPs are found in bim_file my $sin = 0; die "$workdir/$bfile.bim.potr: ".$! unless open BIM, "< $workdir/$bfile.bim.potr"; while (my $line = ){ my @cells = @{&split_line_ref(\$line)}; if (exists $recode_hash{$cells[1]}) { $sin++; } } close BIM; if ($sin < 10) { print "-----------------------------------------------------------------------------------------------\n"; print "Error: less than 10 SNPs found being overlapping with dataset, please make sure:\n"; print "1) snp names: rs-names\n"; print "2) whole genome data required, this script will not work on a handful of SNPs\n"; print "-----------------------------------------------------------------------------------------------\n"; exit; } print "Extracting batch SNPs....\n..."; #my $plinksys = "$plink_bin --noweb --bfile $bfile --extract $ra_file --make-bed --out $workdir/snps_for_checksum.$outname"; my $bimfile = "$workdir/$bfile.bim.potr"; if ($take_potr == 0) { $bimfile = "$bfile.bim"; } my $plinksys = "$plink_bin --silent --noweb --bed $bfile.bed --fam $bfile.fam --bim $bimfile --extract $ra_file --make-bed --out $workdir/snps_for_checksum.$outname"; if ($keepf) { $plinksys .= " --keep $keepf"; } unless (-e "$workdir/snps_for_checksum.$outname.bed") { my $status = &mysystem_nodie($plinksys); if ($status !=0) { print "-----------------------------------------------------------------------------------------\n"; print "Error: this plink command failed: $plinksys\n\n"; print " the Error message above the dotted line does come from plink and does not suggest a change to id_genohecksum\n"; print " some suggestions:\n"; print " maybe you should work on a machine with more RAM, e.g. ish on Broad?\n"; print " as an alternative try using plink2 \n"; print " (on Broad: --ploc /home/unix/sripke/plink_src/plink_1.9_newest/)\n"; print "------------------------------------------------------------------------------------------\n"; print "removing workdir $workdir\n"; &mysystem ("rm -r $workdir"); exit; } } print "...done\n"; #exit; print "Align batch SNPs....\n..."; my $plinksys = "$plink_bin --silent --noweb --bfile $workdir/snps_for_checksum.$outname --update-alleles $ua_file --make-bed --out $workdir/snps_for_checksum.$outname.sa"; unless (-e "$workdir/snps_for_checksum.$outname.sa.bed") { &mysystem($plinksys); } #else { # print "existing: $workdir/snps_for_checksum.$outname.sa.bed\n"; #} print "...done\n"; #exit; ########################################################### ### check if recode-allele is present in all SNPs (flips before alignment) ######################################################### my $preflips = 0; die "$workdir/snps_for_checksum.$outname.bim: ".$! unless open BIM, "< $workdir/snps_for_checksum.$outname.bim"; while (my $line = ){ my @cells = @{&split_line_ref(\$line)}; if ($cells[4] ne $recode_hash{$cells[1]} && $cells[5] ne $recode_hash{$cells[1]}) { $preflips++; } } close BIM; #print "$preflips preflips\n"; #exit; ########################################################### ### check if recode-allele is present in all SNPs (no flips left) after alignment ######################################################### my $strandflips = 0; my $strand_ex = "SNP\tA1\tA2\tref_Allele\n"; die "$workdir/snps_for_checksum.$outname.sa.bim: ".$! unless open BIM, "< $workdir/snps_for_checksum.$outname.sa.bim"; while (my $line = ){ my @cells = @{&split_line_ref(\$line)}; if ($cells[4] ne $recode_hash{$cells[1]} && $cells[5] ne $recode_hash{$cells[1]}) { $strandflips++; if ($strandflips < 6) { $strand_ex .= "$cells[1]\t$cells[4]\t$cells[5]\t$recode_hash{$cells[1]}\n"; } } } close BIM; if ($strandflips > 0) { print "-----------------------------------------------------------------------------------------------------------\n"; print "Error: $strandflips SNPs have mismatching allele names (e.g. CheckSum.$outname.cs.error), please make sure:\n"; print "1) alleles being A,C,G,T\n"; print "-----------------------------------------------------------------------------------------------------------\n"; die "CheckSum.$outname.cs.error: ".$! unless open ERR, "> CheckSum.$outname.cs.error"; print ERR "###### only listing the first 6 entries #########\n"; print ERR "$strand_ex\n"; close ERR; die; } else { if (-e "CheckSum.$outname.cs.error") { &mysystem ("rm CheckSum.$outname.cs.error"); } } ##################################################################### ### check SNP missingness in resulting dataset #################################################################### my $plinksys = "$plink_bin --silent --noweb --bfile $workdir/snps_for_checksum.$outname.sa --out $workdir/snps_for_checksum.$outname.sa.miss --missing"; unless (-e "$workdir/snps_for_checksum.$outname.sa.miss.lmiss") { &mysystem($plinksys); } my $missn = 0; die "$workdir/snps_for_checksum.$outname.sa.miss.lmiss: ".$! unless open MI, "< $workdir/snps_for_checksum.$outname.sa.miss.lmiss"; die "$workdir/snps_for_checksum.$outname.sa.miss.lmiss.in: ".$! unless open MO, "> $workdir/snps_for_checksum.$outname.sa.miss.lmiss.in"; while (my $line = ){ my @cells = @{&split_line_ref(\$line)}; if ($cells[4] < 0.1) { print MO "$cells[1]\n"; $missn++; } } close MI; close MO; my $plinksys = "$plink_bin --silent --noweb --bfile $workdir/snps_for_checksum.$outname.sa --extract $workdir/snps_for_checksum.$outname.sa.miss.lmiss.in --make-bed --out $workdir/snps_for_checksum.$outname.sa.qc"; unless (-e "$workdir/snps_for_checksum.$outname.sa.qc.bed") { &mysystem($plinksys); } ##################################################################### ### translate with strong LD friends (LD-R2 =1.0 over all 1KG pops) #################################################################### ### read dup file die "$du_file: ".$! unless open DU, "< $du_file"; my %du_hash; while (my $line = ){ my @cells = @{&split_line_ref(\$line)}; my $haps = $cells[4]; my $a1 = substr $haps,0,1; my $ta1 = substr $haps,1,1; my $a2 = substr $haps,3,1; my $ta2 = substr $haps,4,1; my $outline = "$cells[2]\t$ta1\t$a1\t$ta2\t$a2\n"; $du_hash{$cells[3]} = $outline; # print $cells[3]." ".$outline; } close DU; my %bimsa_hash; die "$workdir/snps_for_checksum.$outname.sa.qc.bim: ".$! unless open BSA, "< $workdir/snps_for_checksum.$outname.sa.qc.bim"; while (my $line = ){ my @cells = @{&split_line_ref(\$line)}; $bimsa_hash{$cells[1]} = 1; } close BSA; my $dupn = 0; ### translate bim file die "$workdir/snps_for_checksum.$outname.sa.qc.bim: ".$! unless open BI, "< $workdir/snps_for_checksum.$outname.sa.qc.bim"; die "$workdir/snps_for_checksum.$outname.sa.qc.dt.bim: ".$! unless open BO, "> $workdir/snps_for_checksum.$outname.sa.qc.dt.bim"; die "$workdir/snps_for_checksum.$outname.sa.qc.dt.bim.warnings: ".$! unless open BOW, "> $workdir/snps_for_checksum.$outname.sa.qc.dt.bim.warnings"; while (my $line = ){ my @cells = @{&split_line_ref(\$line)}; if (exists $du_hash{$cells[1]}) { # print "would dup-translate $cells[1]\n"; # print "$line"; my @tce = @{&split_line_ref(\$du_hash{$cells[1]})}; my %tal = (); $tal{$tce[1]} = $tce[2]; $tal{$tce[3]} = $tce[4]; unless (exists $tal{$cells[4]}) { print "--------------------------------------------------------------\n"; print "Error: alleles $cells[1] and $tce[0] do not match with dup-file\n"; exit; } unless (exists $tal{$cells[5]}) { print "--------------------------------------------------------------\n"; print "Error: alleles $cells[1] and $tce[0] do not match with dup-file\n"; exit; } if (exists $bimsa_hash{$tce[0]}) { # print "--------------------------------------------------------------\n"; print BOW "Warning: target SNP $tce[0], $cells[1] in dup translation is already present in dataset\n"; # exit; } else { $cells[1] = $tce[0]; $cells[4] = $tal{$cells[4]}; $cells[5] = $tal{$cells[5]}; $bimsa_hash{$cells[1]} = 1; } # print "@cells\n"; $dupn++; } print BO "@cells\n"; } close BI; close BO; close BOW; print "translated $dupn SNPs with strong LD friends\n"; #exit; #exit; print "looks good, starting with checksums now.....\n"; #exit; ########################################################### ### creating checksums ######################################################### print "...working on the checksums\n"; my %id_cs; my $failed_batches = 0; my %fb ; foreach my $i (0..$nbatch-1) { my $batch_file = "$refdir/$batch_templ"; my $i_str = sprintf "%02d",$i; $batch_file =~ s/XXX/$i_str/; #### extract subset my $plinksys = "$plink_bin --silent --noweb --bed $workdir/snps_for_checksum.$outname.sa.qc.bed --fam $workdir/snps_for_checksum.$outname.sa.qc.fam --bim $workdir/snps_for_checksum.$outname.sa.qc.dt.bim --recodeA --extract $batch_file --recode-allele $ra_file --out $workdir/CheckSum.$i.$outname.sa"; unless (-e "$workdir/CheckSum.$i.$outname.sa.raw") { &mysystem($plinksys); } my $i1 = $i + 1; print "--------------------------------------------------------------------------\n"; print "read $workdir/CheckSum.$i.$outname.sa.raw: batch $i1 out of $nbatch\n"; print "--------------------------------------------------------------------------\n"; die "$workdir/CheckSum.$i.$outname.raw: ".$! unless open RAW, "< $workdir/CheckSum.$i.$outname.sa.raw"; my $ll=0; my $head = ; while (my $line = ){ my $geno_str = ""; my @cells = @{&split_line_ref(\$line)}; my $cc = 0; foreach my $g (6..$#cells) { if ($cells[$g] eq "NA") { $cells[$g] = 9; } $geno_str .= "$cells[$g]"; $cc++; } my $cks = `echo $geno_str | cksum`; if ($cc != 50) { $cks = "---"; unless (exists $fb{$i}){ $failed_batches++; } $fb{$i} = 1; } my @cells_cs = @{&split_line_ref(\$cks)}; my $id_tmp = $cells[0]." ". $cells[1]; $id_cs{$id_tmp} .= " ".$cells_cs[0]; $ll++; } close RAW; } die "$workdir/CheckSum.$outname.cs: ".$! unless open CS, "> $workdir/CheckSum.$outname.cs"; foreach my $k (keys %id_cs) { print CS $k.$id_cs{$k}."\n"; } close CS; die "CheckSum.$outname.cs.log: ".$! unless open LOG, "> CheckSum.$outname.cs.log"; print LOG "successfully created checksums for $nbatch batches with 50 SNPs each\n"; print LOG "here some numbers\n"; print LOG "------------------\n"; print LOG "\nnumber of failed batches (not all SNPs found): $failed_batches\n"; print LOG "\nnumber of strand flipped aligned SNPs: $preflips\n"; print LOG "\nnumber of LD translated SNPs: $dupn\n"; print LOG "\nnumber of positional translated SNPs: $str\n"; print LOG "\nnumber of positional non-translated SNPs (position already taken): $str_0\n"; print LOG "\nthese files were used: $batch_templ\n"; close LOG; &mysystem ("cp $workdir/CheckSum.$outname.cs ."); print "###################################################################################\n"; print "#################### SUCCESS ###############################################\n"; print "###################################################################################\n"; print "## please share file CheckSum.$outname.cs with your collaborator \n"; print "## have a look at CheckSum.$outname.cs.log for more details \n"; print "## use id_geno_checktest for convenient overlap test / summary \n"; print "###################################################################################\n"; exit;