#!/usr/bin/perl
use strict;


###################################################
#
# july 1st 2017: links updated
#
# this script was created to allow checking of overlap individuals of GWAS chips
#  without sharing of actual genotypes
#    the original script was written from Sarah Medland (ENIMGMA and Ben Neale (PGC)
#       after an idea from Stephan Ripke 
#         see also http://www.ncbi.nlm.nih.gov/pubmed/22302573?dopt=Abstract
#
# for internal checks, see /home/radon01/sripke/bakker_ripke/hapmap_ref/cksum_files/0114a/README          
#
################################################

####################################
## todos
##
##
##  check if all SNPs are present in dataset -> done
##  check if cksum is in path -> done
##
##  check if SNP names can be recovered: e.g. exm-rs12182351, also newrs
##  list the missing SNPs in a separate file if missing batches
 
my $batcher_name = "";


my $ref_file = "";
#my $ref_file = "check_scz.bip.mdd.300K.migr.3ver.32eni.0314a.ref.bim";
my $batch_templ =  "";
#my $batch_templ =  "check_scz.bip.mdd.300K.migr.3ver.32eni.0314a.full_overlap.bXXX";


## todo: maybe include dup check in positional translation, so that dups don't get translated


###################################################
###  system call with test if successfull
###################################################

sub mysystem(){
    my ($systemstr)="@_";
    system($systemstr);
    my $status = ($? >> 8);
    die "Error($status): system call failed:\n$systemstr\n" if ($status != 0);
}


###################################################
###  system call with test if successfull
###################################################

sub mysystem_nodie(){
    my ($systemstr)="@_";
    system($systemstr);
    my $status = ($? >> 8);
    $status;

}


##########################################
# subroutine to split a plink-output-line with references
##########################################

sub split_line_ref {
    my ($line)=${$_[0]};
    chomp($line);
    $line =~ s/^[\s]+//g;
    my @cols=  split /\s+/, $line;
    \@cols;
}


#my $sourcedir = "";
my $outname = "";
my $error = 0;
my $nbatch = 10;

use Getopt::Long;
GetOptions( 
    "bfile=s"=> \my $bfile,
#    "source=s"=> \$sourcedir,
    "name=s"=> \$outname,
    "keep=s"=> \my $keepf,
    "ploc=s"=> \my $ploc,
    "force"=> \my $force,

    "ref_file=s"=> \$ref_file,
#    "batch_templ=s"=> \$batch_templ,
    "batcher_name=s"=> \$batcher_name,
#    "nbatch=i"=> \$nbatch,
 
    "help"=> \my $help,
    );


if ($batcher_name eq "") {

    print "------------------------------------------------------------------------\n";
    print "Error: option --batcher_name is mandatory\n";
    print "    check online resources or contact sripke\@broadinstitute.org\n";
    print "    see --help \n";
    print "------------------------------------------------------------------------\n";
    unless ($help) {
	die;
    }
    $error = 1;
    
}


my $batcher_filename = "$batcher_name.tar.gz";

my $plink_bin = "plink";
### try to start plink
#my $test_plink = "plink --noweb >/dev/null 2>&1";
my $test_plink = "plink --help >/dev/null 2>&1";
system($test_plink);
my $status = ($? >> 8);


if ($status != 0) {
    print "...... cannot find plink in path ($status), test options...\n";

    my $test_plink = "$ploc/plink --help >/dev/null 2>&1";
    system($test_plink);
    my $status = ($? >> 8);
    if ($status != 0) {
	print "------------------------------------------------------------------------\n";
	print "Error: no plink binary found please give location via --ploc, see --help\n";
    	print "    here is a plink binary at broad: --ploc /home/unix/sripke/plink_src/src/\n";
    	print "    plink2 has been succesfully tested here: --ploc /home/unix/sripke/plink_src/plink_1.9_newest/\n";
	print "------------------------------------------------------------------------\n";
	unless ($help) {
	    die;
	}
	$error = 1;
    }
    else {
	print "...... plink binary found in specified location\n";
	if (-e "plink.log") {
	    &mysystem("rm plink.log");
	}
	$plink_bin = "$ploc/plink";
    }
    
}


my $test_cksum = "echo l | cksum > /dev/null 2>&1";
system($test_cksum);
my $status = ($? >> 8);


if ($status != 0) {

    print "------------------------------------------------------------------------\n";
    print "Error: program <cksum> not found\n";
    print "    this is highly unusual on Unix platforms, please contact sripke\@broadinstitute.org \n";
    print "    see --help \n";
    print "------------------------------------------------------------------------\n";
    unless ($help) {
	die;
    }
    $error = 1;
    
}


my $test_wget = "wget --help > /dev/null 2>&1";
system($test_wget);
my $status = ($? >> 8);


if ($status != 0) {

    print "------------------------------------------------------------------------\n";
    print "Error: program <wget> not found\n";
    print "    this is highly unusual on Unix platforms, please contact sripke\@broadinstitute.org \n";
    print "    see --help \n";
    print "------------------------------------------------------------------------\n";
    unless ($help) {
	die;
    }
    $error = 1;
    
}

my $test_tar = "tar --help > /dev/null 2>&1";
system($test_tar);
my $status = ($? >> 8);


if ($status != 0) {

    print "------------------------------------------------------------------------\n";
    print "Error: program <tar> not found\n";
    print "    this is highly unusual on Unix platforms, plesase contact sripke\@broadinstitute.org \n";
    print "    see --help \n";
    print "------------------------------------------------------------------------\n";
    unless ($help) {
	die;
    }
    $error = 1;
    
}
#print "deb\n";
#exit;


if ($help || $error){
    print "

######################################################################################
#
#  id_geno_checksum
#
# this script allows checking of individuals of GWAS chips without sharing of actual genotypes
#    the precursor shell script was written from Sarah Medland (ENIMGMA) and Ben Neale (PGC)
#    after an idea from Stephan Ripke 
#
#    this perl script has been written by Stephan Ripke,
#            many thanks to Menachem Fromer for intellectual help
#
#  it uses 10 btaches with 50 SNPs each, which means 3^50 = 7.1e23 different outcomes with genotypes 0,1,2 at each SNP
#  the somehow outdated cksum command is used on purpose since it restricts to 32 bit differnt cksums: 2^32 = 4.3e10
#  this means for each checksum there are ~1.0e23 differnt genotype configurations, so the cksum does not provide a single key to the genotype configuration
#  still the number of different cksums is high enough that a random false positive overlap seems unlikely
#
# the script takes care of:
#     - strandflips
#     - LD duplication
#     - positional duplication
#
#   as a published alternative consider also:
#     see also http://www.ncbi.nlm.nih.gov/pubmed/22302573?dopt=Abstract
######################################################################################

  usage: $0 --bfile BFILE

      options:
       
       mandatory:
        --bfile STRING   plink binary to check (root without bed/bim/fam)


       optional
        --source STRING directory for the location of the helper-files,
                            default is current working directory
                            if the files are not found they will be downloaded from the web
        --name STRING   name for this run of checksums, be default the bfile
        --keep STRING   specify keep file for individuals to keep
        --ploc STRING   directory that contains plink-binary
                           on broad: /home/unix/sripke/plink_src/src/
                        script has been successfully tested with plink2 
                           (https://www.cog-genomics.org/plink2/)
                           on broad: /home/unix/sripke/plink_src/plink_1.9_newest/

        --help          this help message


       options for other batch-versions (pro-use):
        --ref_file STRING       reference file, default: $ref_file
        --batch_templ STRING    batch_template, default: $batch_templ
        --batcher_name STRING   batcher name, default: $batcher_name
        --nbatch INT            number of batches, default: $nbatch
         

 for download of helper files use these two commands (will be done automatically by the script)

   - wget https://personal.broadinstitute.org/sripke/share_links/checksums_download/$batcher_filename 
   - tar -xvf $batcher_filename 

---------------------------------------------------
 created by Stephan Ripke 2014 at MGH, Boston, MA
 feel free to contact me:
    sripke (at) broadinstitute (dot) org
--------------------------------------------------
 \n";
    exit 2;
}


use Cwd;
use File::Path;


print "checking presence of all necessary files\n";
unless ($bfile) {
    print "------------------------------------------------------------------------\n";
    print "Error: no plink dataset specified, see --help\n";
    print "------------------------------------------------------------------------\n";
    die;
}

unless (-e "$bfile.bed" || -e "$bfile.bim" || -e "$bfile.fam") {
    print "------------------------------------------------------------------------\n";
    print "Error: plink dataset (bed/bim/fam) not found, see --help\n";
    print "------------------------------------------------------------------------\n";
    die;
}

my $rootdir = &Cwd::cwd();

#if ($sourcedir eq "") {
#    $sourcedir = "$rootdir";
#}


##################################################
### create refdir
####################################################


my $refdir = "$rootdir/ref.$batcher_name";


unless (-d $refdir) {
    print "------------------------------\n";
    print "refdir not existing: $refdir\n";
    print "--- assuming reference data ($batcher_filename) being here: https://personal.broadinstitute.org/sripke/share_links/checksums_download/$batcher_filename\n";
    print "--- please check in your browser, that this is indeed the case\n";
    print "--- do you want the subdirectory created and the reference files being downloaded (Y/N)? ---\n";
#    print "";


    while (1) {
	my $answer = lc <>;
	chomp $answer;
	if ($answer eq "y") {

	    print "ok, will do\n";
	    my @created = mkpath(   ## $created ?
				    "$refdir",
				    {verbose => 0, mode => 0750},
		);
	    
	    chdir ($refdir);
	    &mysystem ("wget https://personal.broadinstitute.org/sripke/share_links/checksums_download/$batcher_filename ");
	    &mysystem ("tar -xvf $batcher_filename ");
	    chdir ($rootdir);
	    last;
	}
	else {
	    print "exiting now...\n";
	    exit;
	}
    }

#    print "debug stop\n";
#    exit;

}

## read naming file
die "$refdir/cs.names: ".$! unless open NA, "< $refdir/cs.names";
my %names_hash;
while (my $line = <NA>){
    my @cells = @{&split_line_ref(\$line)};
    $names_hash{$cells[0]} = $cells[1];
}
close NA;


$batch_templ = $names_hash{"bat"}.".bXXX";
$nbatch = $names_hash{"nba"};
$ref_file = $names_hash{"ref"};
my $du_file = "$refdir/".$names_hash{"dup"};


#my $batch_templ =  "XXX_SNPs";

foreach my $i (0..$nbatch-1) {
    my $batch_file = "$refdir/$batch_templ";
    my $i_str = sprintf "%02d",$i;
    $batch_file =~ s/XXX/$i_str/;
    unless (-e "$batch_file") {
	print "------------------------------------------------------------------------\n";
	print "$batch_file not found, see --help for download instructions\n";


	print " - wget https://personal.broadinstitute.org/sripke/share_links/checksums_download/$batcher_filename \n";
	print " - tar -xvf $batcher_filename \n";
	print " - then rerun \n";


	print "------------------------------------------------------------------------\n";
	die;
    }
#    unless (-e "$sourcedir/$i"."_Alleles") {
#	print "------------------------------------------------------------------------\n";
#	print "$sourcedir/$i"."_Alleles not found, see --help for download instructions\n";
#	print "------------------------------------------------------------------------\n";
#	die;
#   }
}


#unless (-e "$sourcedir/SNP.list") {
#    print "------------------------------------------------------------------------\n";
#    print "$sourcedir/SNP.list not found, see --help for download instructions\n";
#    print "------------------------------------------------------------------------\n";
#    die;
#}


my $ra_file = "$refdir/$ref_file.recode_allele";

unless (-e "$ra_file") {
    print "------------------------------------------------------------------------\n";
    print "Error: $ra_file not found, see --help for download instructions\n";
    print "------------------------------------------------------------------------\n";
    die;
}


my $ua_file = "$refdir/$ref_file.update_alleles";

unless (-e "$ua_file") {
    print "------------------------------------------------------------------------\n";
    print "Error: $ua_file not found, see --help for download instructions\n";
    print "------------------------------------------------------------------------\n";
    die;
}

#my $du_file = "$refdir/$ref_file";
#$du_file =~ s/.ref.bim$//;
#$du_file .= ".dup.short";


unless (-e "$du_file") {
    print "------------------------------------------------------------------------\n";
    print "Error: $du_file not found, see --help for download instructions\n";
    print "------------------------------------------------------------------------\n";
    die;
}


my $bfile_short = $bfile;
$bfile_short =~ s!.*/!!;
if ($bfile ne $bfile_short) {
    print "------------------------------------------------------------------------\n";
    print "Error: --bfile points to different directory, please make sure it is \n";
    print "                present in current directory (symbolic link is fine)\n";
    print "------------------------------------------------------------------------\n";
    die;
}

if ($outname eq "") {
    $outname = $bfile;
}


my $workdir = "checkSum_$outname";

while (-e $workdir) {
    print "-----------------------------------------------------------------------------------------\n";
    print "Warning: directory $workdir is already existing, trying extended name $workdir.c, see --help\n";
    print "-----------------------------------------------------------------------------------------\n";
    $workdir .= ".c";
#    unless ($force) {
#	die;
#    }
}
while (-e "CheckSum.$outname.cs") {
    print "-----------------------------------------------------------------------------------------\n";
    print "Warning: result file CheckSum.$outname.cs is already existing, trying extended outname $outname.o\n";
    print "-----------------------------------------------------------------------------------------\n";
    $outname .= ".o";

#    die;

}


my @created = mkpath(   ## $created ?
			"$workdir",
			{verbose => 0, mode => 0750},
    );


##################################################
### read snp list into hash
####################################################


my $str = 0;
my $str_0 = 0;
my $ntot = 0;
my $nright = 0;


#my $sin = 0;

if (-e "CheckSum.$bfile.cs") {
    print "output (CheckSum.$bfile.cs) already existing\n";
    print "exit\n";
    exit;

}
unless (-e "$workdir/$bfile.bim.potr") {
    print "testing dataset format...\n";

## read ref file
    die "$refdir/$ref_file: ".$! unless open RF, "< $refdir/$ref_file";
    my %ref_hash;
    my %pos_hash;
    while (my $line = <RF>){
	my @cells = @{&split_line_ref(\$line)};
	my $pos = $cells[0]."_".$cells[3];
	$pos_hash{$pos} = $cells[1];
	$ref_hash{$cells[1]} = 1;
    }
    close RF;

    my %bim_hash;
    my @bim_arr;
    die "$bfile.bim: ".$! unless open BIM, "< $bfile.bim";
    while (my $line = <BIM>){
	my @cells = @{&split_line_ref(\$line)};
	$bim_hash{$cells[1]} = 1;
	push @bim_arr, $line;
    }
    
    close BIM;

### pos trans of bim-file

#die "$bfile.bim: ".$! unless open BIM, "< $bfile.bim";
    die "$workdir/$bfile.bim.potr: ".$! unless open PO, "> $workdir/$bfile.bim.potr";
    die "$workdir/$bfile.bim.potr: ".$! unless open PI, "> $workdir/$bfile.bim.potr.info";
    print PI "orig\tref\tpos\ttr\n";
    foreach my $line (@bim_arr) {
#while (my $line = <BIM>){
	my @cells = @{&split_line_ref(\$line)};
	my $pos = $cells[0]."_".$cells[3];
	$ntot++;
	if (exists $pos_hash{$pos}) {
	    if ($pos_hash{$pos} ne $cells[1]) {
		unless (exists $bim_hash{$pos_hash{$pos}}) { ## test if translated already exists
		    print PI "$cells[1]\t$pos_hash{$pos}\t$pos\t1\n";
		    $cells[1] = $pos_hash{$pos};
		    $str++;

		    $bim_hash{$cells[1]} = 1;
		}
		else {
		    print PI "$cells[1]\t$pos_hash{$pos}\t$pos\t0\n";
		    $str_0++;

		}
	    }
	    else {
		$nright++;
	    }
	}
#    if (exists $ref_hash{$cells[1]}) {#
#	$sin++;
#    }
	print PO "@cells\n";
    }
#close BIM;
    close PO;
    close PI;


    print "positional translated $str SNPs\n"; ### into log-file
    print "positional not_translated $str_0 SNPs\n"; ### into log-file
    print "$nright SNPs with matching position\n"; ### into log-file
    print "out of a total of $ntot SNPs\n"; ### into log-file
}
my $take_potr = 0;
if ($nright > $ntot/2) {
    print "keep positional translated SNPs\n";
    $take_potr = 1;
}
else {
    print "will not keep positional translated SNPs, positions are not trustworthy\n";
}
#exit;

## read recode_reference
die "$ra_file: ".$! unless open RA, "< $ra_file";
my %recode_hash;
while (my $line = <RA>){
    my @cells = @{&split_line_ref(\$line)};
    $recode_hash{$cells[0]} = $cells[1];
}
close RA;

## test how many SNPs are found in bim_file
my $sin = 0;
die "$workdir/$bfile.bim.potr: ".$! unless open BIM, "< $workdir/$bfile.bim.potr";
while (my $line = <BIM>){
    my @cells = @{&split_line_ref(\$line)};
    if (exists $recode_hash{$cells[1]}) {
	$sin++;
    }
}
close BIM;


if ($sin < 10) {
    print "-----------------------------------------------------------------------------------------------\n";
    print "Error: less than 10 SNPs found being overlapping with dataset, please make sure:\n";
    print "1) snp names: rs-names\n";
    print "2) whole genome data required, this script will not work on a handful of SNPs\n";
    print "-----------------------------------------------------------------------------------------------\n";
    exit;
}


print "Extracting batch SNPs....\n...";

#my $plinksys = "$plink_bin --noweb --bfile $bfile --extract $ra_file --make-bed --out $workdir/snps_for_checksum.$outname";
my $bimfile = "$workdir/$bfile.bim.potr";
if ($take_potr == 0) {
    $bimfile = "$bfile.bim";
}
my $plinksys = "$plink_bin --silent --noweb --bed $bfile.bed --fam $bfile.fam --bim $bimfile --extract $ra_file --make-bed --out $workdir/snps_for_checksum.$outname";

if ($keepf) {
    $plinksys .= " --keep $keepf";
}

unless (-e "$workdir/snps_for_checksum.$outname.bed") {
    my $status = &mysystem_nodie($plinksys);
    if ($status !=0) {
	print "-----------------------------------------------------------------------------------------\n";
	print "Error: this plink command failed: $plinksys\n\n";
	print "  the Error message above the dotted line does come from plink and does not suggest a change to id_genohecksum\n";
	print "  some suggestions:\n";
	print "       maybe you should work on a machine with more RAM, e.g. ish on Broad?\n";
	print "       as an alternative try using plink2 \n";
	print "                (on Broad: --ploc /home/unix/sripke/plink_src/plink_1.9_newest/)\n";
	
	print "------------------------------------------------------------------------------------------\n";
	print "removing workdir $workdir\n";
	&mysystem ("rm -r $workdir");
    exit;
    }
}


print "...done\n";

#exit;


print "Align batch SNPs....\n...";

my $plinksys = "$plink_bin --silent --noweb --bfile $workdir/snps_for_checksum.$outname --update-alleles $ua_file --make-bed --out $workdir/snps_for_checksum.$outname.sa";

unless (-e "$workdir/snps_for_checksum.$outname.sa.bed") {
    &mysystem($plinksys);
}
#else {
#    print "existing: $workdir/snps_for_checksum.$outname.sa.bed\n";
#}

print "...done\n";

#exit;


###########################################################
### check if recode-allele is present in all SNPs (flips before alignment)
#########################################################


my $preflips = 0;
die "$workdir/snps_for_checksum.$outname.bim: ".$! unless open BIM, "< $workdir/snps_for_checksum.$outname.bim";

while (my $line = <BIM>){
    my @cells = @{&split_line_ref(\$line)};
    if ($cells[4] ne $recode_hash{$cells[1]} && $cells[5] ne $recode_hash{$cells[1]}) {
	$preflips++;
    }
}
close BIM;

#print "$preflips preflips\n";
#exit;

###########################################################
### check if recode-allele is present in all SNPs (no flips left) after alignment
#########################################################


my $strandflips = 0;
my $strand_ex = "SNP\tA1\tA2\tref_Allele\n";
die "$workdir/snps_for_checksum.$outname.sa.bim: ".$! unless open BIM, "< $workdir/snps_for_checksum.$outname.sa.bim";

while (my $line = <BIM>){
    my @cells = @{&split_line_ref(\$line)};
    if ($cells[4] ne $recode_hash{$cells[1]} && $cells[5] ne $recode_hash{$cells[1]}) {
	$strandflips++;
	if ($strandflips < 6) {
	    $strand_ex .= "$cells[1]\t$cells[4]\t$cells[5]\t$recode_hash{$cells[1]}\n";
	}
    }
}
close BIM;

if ($strandflips > 0) {
    print "-----------------------------------------------------------------------------------------------------------\n";
    print "Error: $strandflips SNPs have mismatching allele names (e.g. CheckSum.$outname.cs.error), please make sure:\n";
    print "1) alleles being A,C,G,T\n";
    print "-----------------------------------------------------------------------------------------------------------\n";


    die "CheckSum.$outname.cs.error: ".$! unless open ERR, "> CheckSum.$outname.cs.error";
    
    print ERR "###### only listing the first 6 entries #########\n";    
    print ERR "$strand_ex\n";
    
    close ERR;
    die;
}
else {
    if (-e "CheckSum.$outname.cs.error") {
	&mysystem ("rm CheckSum.$outname.cs.error");
    }
}


#####################################################################
### check SNP missingness in resulting dataset
####################################################################

my $plinksys = "$plink_bin --silent --noweb --bfile $workdir/snps_for_checksum.$outname.sa --out $workdir/snps_for_checksum.$outname.sa.miss --missing";

unless (-e "$workdir/snps_for_checksum.$outname.sa.miss.lmiss") {
    &mysystem($plinksys);
}

my $missn = 0;

die "$workdir/snps_for_checksum.$outname.sa.miss.lmiss: ".$! unless open MI, "< $workdir/snps_for_checksum.$outname.sa.miss.lmiss";
die "$workdir/snps_for_checksum.$outname.sa.miss.lmiss.in: ".$! unless open MO, "> $workdir/snps_for_checksum.$outname.sa.miss.lmiss.in";
while (my $line = <MI>){
    my @cells = @{&split_line_ref(\$line)};
    if ($cells[4] < 0.1) {
	print MO "$cells[1]\n";
	$missn++;
    }
}
close MI;
close MO;

my $plinksys = "$plink_bin --silent --noweb --bfile $workdir/snps_for_checksum.$outname.sa --extract $workdir/snps_for_checksum.$outname.sa.miss.lmiss.in --make-bed --out $workdir/snps_for_checksum.$outname.sa.qc";

unless (-e "$workdir/snps_for_checksum.$outname.sa.qc.bed") {
    &mysystem($plinksys);
}


#####################################################################
### translate with strong LD friends (LD-R2 =1.0 over all 1KG pops)
####################################################################

### read dup file

die "$du_file: ".$! unless open DU, "< $du_file";
my %du_hash;
while (my $line = <DU>){
    my @cells = @{&split_line_ref(\$line)};

    my $haps = $cells[4];
    my $a1 = substr $haps,0,1;
    my $ta1 = substr $haps,1,1;
    my $a2 = substr $haps,3,1;
    my $ta2 = substr $haps,4,1;

    my $outline =  "$cells[2]\t$ta1\t$a1\t$ta2\t$a2\n";
    $du_hash{$cells[3]} = $outline;
#    print $cells[3]." ".$outline;

}
close DU;


my %bimsa_hash;
die "$workdir/snps_for_checksum.$outname.sa.qc.bim: ".$! unless open BSA, "< $workdir/snps_for_checksum.$outname.sa.qc.bim";
while (my $line = <BSA>){
    my @cells = @{&split_line_ref(\$line)};
    $bimsa_hash{$cells[1]} = 1;
}
close BSA;

my $dupn = 0;
### translate bim file
die "$workdir/snps_for_checksum.$outname.sa.qc.bim: ".$! unless open BI, "< $workdir/snps_for_checksum.$outname.sa.qc.bim";
die "$workdir/snps_for_checksum.$outname.sa.qc.dt.bim: ".$! unless open BO, "> $workdir/snps_for_checksum.$outname.sa.qc.dt.bim";
die "$workdir/snps_for_checksum.$outname.sa.qc.dt.bim.warnings: ".$! unless open BOW, "> $workdir/snps_for_checksum.$outname.sa.qc.dt.bim.warnings";

while (my $line = <BI>){
    my @cells = @{&split_line_ref(\$line)};
    if (exists $du_hash{$cells[1]}) {
#	print "would dup-translate $cells[1]\n";
#	print "$line";
	my @tce = @{&split_line_ref(\$du_hash{$cells[1]})};
	my %tal = ();
	$tal{$tce[1]} = $tce[2];
	$tal{$tce[3]} = $tce[4];

	
	unless (exists $tal{$cells[4]}) {
	    print "--------------------------------------------------------------\n";
	    print "Error: alleles $cells[1] and $tce[0] do not match with dup-file\n";
	    exit;
	}
	
	unless (exists $tal{$cells[5]}) {
	    print "--------------------------------------------------------------\n";
	    print "Error: alleles $cells[1] and $tce[0] do not match with dup-file\n";
	    exit;
	}

	if (exists $bimsa_hash{$tce[0]}) {
#	    print "--------------------------------------------------------------\n";
	    print BOW "Warning: target SNP $tce[0], $cells[1] in dup translation is already present in dataset\n";

#	    exit;
	}
	else {
	    $cells[1] = $tce[0];	
	    $cells[4] = $tal{$cells[4]};
	    $cells[5] = $tal{$cells[5]};
	    $bimsa_hash{$cells[1]} = 1;
	}		  
#	print "@cells\n";
	$dupn++;
    }
    print BO "@cells\n";
}
close BI;
close BO;
close BOW;

print "translated $dupn SNPs with strong LD friends\n";
#exit;


#exit;


print "looks good, starting with checksums now.....\n";

#exit;

###########################################################
### creating checksums
#########################################################

print "...working on the checksums\n";


my %id_cs;
my $failed_batches = 0;
my %fb ;

foreach my $i (0..$nbatch-1) {

    my $batch_file = "$refdir/$batch_templ";
    my $i_str = sprintf "%02d",$i;
    $batch_file =~ s/XXX/$i_str/;

   #### extract subset
    my $plinksys = "$plink_bin --silent --noweb --bed $workdir/snps_for_checksum.$outname.sa.qc.bed --fam $workdir/snps_for_checksum.$outname.sa.qc.fam --bim  $workdir/snps_for_checksum.$outname.sa.qc.dt.bim --recodeA --extract $batch_file --recode-allele $ra_file --out $workdir/CheckSum.$i.$outname.sa";
    
    unless (-e "$workdir/CheckSum.$i.$outname.sa.raw") {
	&mysystem($plinksys);
    }

    my $i1 = $i + 1;
    print "--------------------------------------------------------------------------\n";
    print "read $workdir/CheckSum.$i.$outname.sa.raw: batch $i1 out of $nbatch\n";
    print "--------------------------------------------------------------------------\n";

    die "$workdir/CheckSum.$i.$outname.raw: ".$! unless open RAW, "< $workdir/CheckSum.$i.$outname.sa.raw";

    my $ll=0;
    my $head = <RAW>;
    while (my $line = <RAW>){
	my $geno_str = "";
	my @cells = @{&split_line_ref(\$line)};
	my $cc = 0;
	foreach my $g (6..$#cells) {
	    if ($cells[$g] eq "NA") {
		$cells[$g] = 9;
	    }
	    $geno_str .= "$cells[$g]";
	    $cc++;
	}
	my $cks = `echo $geno_str | cksum`;
	if ($cc != 50) {
	    $cks = "---";
	    unless (exists $fb{$i}){
		$failed_batches++;
	    }
	    $fb{$i} = 1;
	}
	my @cells_cs = @{&split_line_ref(\$cks)};
	my $id_tmp = $cells[0]." ". $cells[1];
	$id_cs{$id_tmp} .= " ".$cells_cs[0];

	$ll++;


    }
    close RAW;

}

die "$workdir/CheckSum.$outname.cs: ".$! unless open CS, "> $workdir/CheckSum.$outname.cs";

foreach my $k (keys %id_cs) {
    print CS $k.$id_cs{$k}."\n";
}

close CS;


die "CheckSum.$outname.cs.log: ".$! unless open LOG, "> CheckSum.$outname.cs.log";

print LOG "successfully created checksums for $nbatch batches with 50 SNPs each\n";
print LOG "here some numbers\n";
print LOG "------------------\n";
print LOG "\nnumber of failed batches (not all SNPs found): $failed_batches\n";
print LOG "\nnumber of strand flipped aligned SNPs: $preflips\n";
print LOG "\nnumber of LD translated SNPs: $dupn\n";
print LOG "\nnumber of positional translated SNPs: $str\n";
print LOG "\nnumber of positional non-translated SNPs (position already taken): $str_0\n";


print LOG "\nthese files were used: $batch_templ\n";

close LOG;

&mysystem ("cp $workdir/CheckSum.$outname.cs .");
    
    print "###################################################################################\n";
    print "####################     SUCCESS    ###############################################\n";
    print "###################################################################################\n";
    print "## please share file CheckSum.$outname.cs with your collaborator    \n";
    print "## have a look at CheckSum.$outname.cs.log for more details         \n";
    print "## use id_geno_checktest for convenient overlap test / summary      \n";
    print "###################################################################################\n";

exit;