#!/usr/bin/env perl
use strict;

#############################
# load utility functions
#############################

use FindBin;
use lib "$FindBin::Bin";
use Ricopili::Utils qw(trans);

#############################
# read config file
#############################

#my $ploc = &trans("ploc");
my $p2loc = &trans("p2loc");
my $sloc = &trans("sloc");


srand(0);
my $nsout = 0;
my $nidout = 0;

##############################################
#### plink2 works now with this here
###########################################################################################################


my $version = "2.0.0";
my $progname = $0;
$progname =~ s!^.*/!!;

my $usage = "
Usage : $progname bprefix1 bprefix2 ....

version: $version

  --out STRING        outname, mandatory
  --help              print help massage and exits
  --nsout INT         number of SNPs in merged file, default = $nsout
                         if overlapping set is bigger, 
                         then random cutting down.
  --nidout INT        number of IDs in merged file, if not named, take all
  --sfile STRING      SNPs to do the merge with, first column
  --iefile STRING     Ids to exclude, fam-file-style
  --autos             only autosomal SNPs

  --flip              flip automatically, exclude ambiguous SNPs

  --ref STRING        based on this reference-file

  --b1ref             take first one as reference

  --debug             extended output


  here HM3 reference


   lisa: /home/gwas/pgc-samples/hapmap_ref/subchr/infosum_pos.sorted
   broad: /psych/genetics_data/ripke/references_outdated/hapmap_ref/subchr/infosum_pos.sorted

  here 1KG:
   /psych/genetics_data/ripke/references_outdated/hapmap_ref/impute2_ref/1KG_Mar12/ALL_1000G_phase1integrated_feb2012_impute/subchr/sumfrq.eur


 created by Stephan Ripke 2008 at MGH, Boston, MA
 
";




##########################################
# subroutine to split a plink-output-line
##########################################

sub split_line {
    my ($line)=@_;
    chomp($line);
    $line =~ s/^[\s]+//g;
    my @cols=  split /\s+/, $line;
}

##########################################
# subroutine to split a plink-output-line with references
##########################################

sub split_line_ref {
    my ($line)=${$_[0]};
    chomp($line);
    $line =~ s/^[\s]+//g;
    my @cols=  split /\s+/, $line;
    \@cols;
}


###################################################
###  system call with test if successfull
###################################################
sub mysystem(){
    my ($systemstr)="@_";
    system($systemstr);
    my $status = ($? >> 8);
    die "$systemstr\n->system call failed: $status" if ($status != 0);
}




########################################
# test plink and beagle
###################################


#unless (-e "$p2loc/plink" ){
#    print "\n***** Error: couldn't find the following:\n";
#    print "$p2loc/plink\n";
#    exit;
#}

my $reffile = "NOFILE";

use Getopt::Long;
GetOptions( "out=s"=> \my $out,
	    "nsout=i"=> \my $nsout,
	    "autos"=> \my $autos,
	    "flip"=> \my $flip_sw,
	    "b1ref"=> \my $b1ref,
	    "sfile=s"=> \my $sfile,
	    "iefile=s"=> \my $iefile,
	    "ref=s"=> \$reffile,
	    "nidout=i"=> \$nidout,
	    "debug"=> \my $debug,
	    "help!"=> \my $help );


die "$usage" if $help;
die "$usage" unless $out;


my @bfile_list= ();



if (@ARGV < 1){
    opendir(DIR, ".") || die "can't opendir .: $!";
    my @files = readdir(DIR);
    closedir DIR;
    @bfile_list = grep {/.bim$/} @files;
    print "take every bim file in this directory\n";

}
else {
    @bfile_list=@ARGV;
}


foreach (0..$#bfile_list){
    $bfile_list[$_]=~ s/.bim//;
}


if ($reffile ne "NOFILE") {
    
    foreach my $bele (@bfile_list) {
	print "commands for $bele:\n";
	print "checkpos5 --dbcol 1,8,9 --dbsnp $reffile $bele.bim\n";
	print "checkflip3 --sfh 0.2 --info $reffile $bele.ch.bim\n";
    }
    print "\n\ninvoke the above commands (copy first to README) and restart without --ref\n";
    exit;
    
}


use File::Path;
use Cwd;


my $rootdir = &Cwd::cwd();
my $workdir="$rootdir/bcomp_".$out;

if (1) {
while (-e $workdir) {
    $workdir .= ".b";
}
}




print "workdir: $workdir\n"  if ($debug);


my @created = mkpath(
    "$workdir",
    {verbose => 0, mode => 0750},
    );



chdir($workdir);


my @idarr = ();
my %count = ();


my %excl_ids ;

if ($iefile) {
    open INFILE, "$rootdir/$iefile" or die "($rootdir/$iefile) $!";
    while (<INFILE>) {
	chomp;
	my @cells = @{&split_line_ref(\$_)};
	$excl_ids{"$cells[0]\t$cells[1]"} = 1;
    }
    close (INFILE);
}




open INTERID, ">", "IDs.keep" or die $!;

#my %id_hash;

my $fc = 1;
my %am;
$am{"AT"} = 1;
$am{"CG"} = 1;
my %flip;
$flip{"AC"} = "GT";
$flip{"AG"} = "CT";
$flip{"CT"} = "AG";
$flip{"GT"} = "AC";
my %gts;

foreach my $bele (@bfile_list) {
    print "processing $bele.bim\n"  if ($debug);
    my $inbele = "$rootdir/$bele";
    if ($bele =~ /^\//) {
	$inbele = "$bele";
    }
    
    open INFILE, "< $inbele.bim" or die "($inbele.bim) $!";
    open FFILE, "> $inbele.bim.flip" or die "($inbele.bim.flip) $!";
    open FEFILE, "> $inbele.bim.fliperror" or die "($inbele.bim.fliperror) $!";
    while (<INFILE>) {
	chomp;

	my @cells = @{&split_line_ref(\$_)};
	if ($autos) {
	    next if ($cells[0] < 1 || $cells[0] > 22);
	}
	if ($fc > 1) {
	    next unless (exists $count{$cells[1]});
	}

	if ($flip_sw) {
	    my $a1 = $cells[4];
	    my $a2 = $cells[5];
	    if ($a2 lt $a1) {
		$a2 = $cells[4];
		$a1 = $cells[5];
	    }
	    my $gt = "$a1$a2";

	    ## exclude ambiguous
	    if (exists $am{$gt}) {
		next;
	    }

#	    if ($cells[1] eq "rs10000023") {
#		print "$inbele.bim\trs10000023\t$gt\n";
#	    }
	    
	    ### test if existing and nonidentical
	    if (exists $gts{$cells[1]}) {
		unless ( $gts{$cells[1]} eq $gt) {
		    ## test if it is the flip
		    if ( $gts{$cells[1]} eq $flip{$gt}) {
#			print "found a flipped strand at $cells[1]\n";
			print FFILE "$cells[1] $gt $gts{$cells[1]}\n";
		    }
		    else {
			print FEFILE "Error: alleles do not match on $cells[1] $gt, from file1: $gts{$cells[1]}\n";
			next;
		    }
		}
	    }
	    else {
		$gts{$cells[1]} = $gt;
	    }
	}

	$count{$cells[1]}++;
    }
    close (INFILE);
    close (FFILE);
    close (FEFILE);


    if ($nidout) {
	open INFILE, "$inbele.fam" or die "($inbele.fam) $!";
	my $first = <INFILE>;
	chomp($first);
#	push @idarr, $first;
	print INTERID "$first\n";
#	$id_hash{$first} = 1;

	while (<INFILE>) {
	    my @cells = @{&split_line_ref(\$_)};
	    next if (exists $excl_ids{"$cells[0]\t$cells[1]"});
	    chomp;
	    push @idarr, $_;
	}
	close (INFILE);
    }


    $fc++;
}

#print "debug\n";
#exit;


my %shash = ();
if ($sfile) {
    open INFILE, "$rootdir/$sfile" or die "($rootdir/$sfile) $!";
    while (<INFILE>) {
	chomp;
	my @cells = @{&split_line_ref(\$_)};
	$shash{$cells[0]} = 1 ;
    }
    close (INFILE);
}

my @common_snps;
my $n_bfiles = @bfile_list;
foreach my $element (keys %count) {
    if ($sfile){
	push @common_snps, "$element" if (exists $shash{$element});
    }
    else {
	if ($count{$element} == $n_bfiles) {
	    push @common_snps, "$element";
	}
    }
}


if (@common_snps == 0) {
    foreach my $bele (@bfile_list) {
	my $inbele = "$rootdir/$bele";
	if ($bele =~ /^\//) {
	    $inbele = "$bele";
	}
	my $tmp_out  = `wc -l $inbele.bim`;
	print "$tmp_out";
    }
    &mysystem ("touch $rootdir/$out.fini");
    print "no overlapping SNPs left\n";
    exit;
}

my $rand_th_snps = 1;
if ($sfile) {
    $rand_th_snps = 1.1;
}
else {
    $rand_th_snps = $nsout / @common_snps;
}

if ($nsout == 0) {
    $rand_th_snps = 1.1;
}

my $nids = @idarr;
$nids++;

my $rand_th_ids = 1;
if ($nidout == 0) {
    $rand_th_ids = 1.1 ;
}
else {
    $rand_th_ids = $nidout / $nids;
}



#print "n_common_snps: ".@common_snps.", rand_th_snps: $rand_th_snps"."\n";
#exit;
#	print "lkdsjlk@bfile_list\n";		
print "print Intersection-List\n"  if ($debug);


###############################################
### print intersecting SNP list
#################################################

my $s1_count = 0;
open INTER, ">", "SNPs.intersec" or die $!;
foreach my $sname (@common_snps) {
    if (rand() < $rand_th_snps){
	print INTER "$sname\n";
	$s1_count++;
    }
}
close (INTER);

#print "debug\n";
#exit;

###############################################
### print IDs for output
#################################################

my $cc = 0;
foreach my $iname (@idarr) {
    if (rand() < $rand_th_ids){
	print INTERID "$iname\n" ;
	$cc++;
    }
}
close (INTERID);
print "$cc in final dataset\n"  if ($debug);


my $keep_str = "";
$keep_str = "--keep IDs.keep" if ($nidout > 0);
my $remove_str = "";
$remove_str = "--remove $rootdir/$iefile" if ($iefile);



my $fbfile = $bfile_list[0];

#################################################
#### create reference
##################################################


if ($b1ref) {

    print "create reference\n";
    print "create freq\n";

    my $cmd="$p2loc/plink --silent --memory 2000 --bfile $rootdir/$fbfile --out $fbfile.f --freq --extract SNPs.intersec" ;
    &mysystem ("$cmd");

    my $header= <INFILE>;

    print "reformat freq\n";

    open INFILE, "< $fbfile.f.frq" or die "($fbfile.f.frq) $!";
    open OFILE, "> $fbfile.frq.ref" or die "($fbfile.frq.ref) $!";
    while (<INFILE>) {
	chomp;
	my @cells = @{&split_line_ref(\$_)};
	print OFILE "$cells[1]";
	print OFILE "\t$cells[5]";
	print OFILE "\t$cells[2]";
	print OFILE "\t$cells[4]";
	print OFILE "\t$cells[3]";
	printf OFILE "\t%f",1-$cells[4];
	printf OFILE "\t%f",$cells[5] * $cells[4];
	print OFILE "\t$cells[0]";
	print OFILE "\n";

    }
    close (INFILE);
    close (OFILE);

    print "sort freq\n";

    my $cmd="sort -b -k 1,1 $fbfile.frq.ref > $fbfile.frq.ref.sorted" ;
    &mysystem ("$cmd");


    print "reformat bim\n";

    open INFILE, "< $rootdir/$fbfile.bim" or die "($rootdir/$fbfile.bim) $!";
    open OFILE, "> $fbfile.bp" or die "($fbfile.bp) $!";
    while (<INFILE>) {
	chomp;
	my @cells = @{&split_line_ref(\$_)};
	print OFILE "$cells[1]";
	print OFILE "\t$cells[3]";
	print OFILE "\n";
    }
    close (INFILE);
    close (OFILE);

    print "sort bim\n";

    my $cmd="sort -b -k 1,1 $fbfile.bp > $fbfile.bp.sorted" ;
    &mysystem ("$cmd");

    print "join to new reference\n";

    &mysystem("join $fbfile.frq.ref.sorted $fbfile.bp.sorted > $fbfile.ref.bp.sorted");
    
#    sleep(10);

    $reffile = "$workdir/$fbfile.ref.bp.sorted";
    &mysystem ("cp $workdir/$fbfile.ref.bp.sorted $rootdir");

}

my @bfile_short;
foreach my $bele (@bfile_list) {
    my $btemp = $bele;
    $btemp =~ s/.*\///;
    push @bfile_short, $btemp;
    print "$bele, $btemp\n"  if ($debug);
}
#exit;



#################################################
#### create single study-subsets
##################################################

my @s2_bfiles;

my $merge_list_name="MERGE-LIST-$out";
open MERGE, ">", "$merge_list_name" or die $!;
my $bfile_count=0;
my $bs_count=0;
foreach my $bele (@bfile_list) {
    print "create single-sub of $bele"."\n"  if ($debug);

    my $b_short = $bfile_short[$bs_count];

    my $inbele = "$rootdir/$bele";
    if ($bele =~ /^\//) {
	$inbele = "$bele";
    }



    if ($flip_sw) {
    
	my $cmd = "$p2loc/plink --silent --memory 2000 --bfile $inbele --out $b_short.tmp --make-bed $remove_str $keep_str --extract SNPs.intersec";
	print "1: $cmd\n"  if ($debug);
	&mysystem ($cmd);

	$cmd = "$p2loc/plink --silent --memory 2000 --bfile $b_short.tmp --make-bed --out $b_short --flip $rootdir/$bele.bim.flip";
	print "2: $cmd\n"  if ($debug);
	&mysystem ($cmd);

	&mysystem ("rm $b_short.tmp.bed");
	&mysystem ("rm $b_short.tmp.bim");
	&mysystem ("rm $b_short.tmp.fam");


	

    }
    else {

	my $cmd = "$p2loc/plink --silent --memory 2000 --bfile $inbele --out $b_short --make-bed $remove_str $keep_str --extract SNPs.intersec";
	&mysystem ($cmd);
	
    }
    
#    my $testdir = &Cwd::cwd();
#    print "$testdir\n";
#    print "$cmd\n";





    


#    exit;
    if ($reffile ne "NOFILE") {
	print "commands for $bele:\n";
	print "checkpos5 --dbcol 1,8,9 --dbsnp $reffile $bele.bim\n";
	print "checkflip3 --sfh 0.2 --info $reffile $bele.ch.bim\n";

	if (0) {
	    &mysystem ("checkpos5 --dbcol 1,8,9 --dbsnp $reffile $bele.bim");
	    &mysystem ("mv $bele.ch.report $bele.ch.tar.gz $rootdir");
	    &mysystem ("checkflip3 --sfh 0.2 --info $reffile $bele.ch.bim");
	    &mysystem ("mv $bele.ch.fl.report $bele.ch.fl.tar.gz $rootdir");
	    print MERGE "$bele.ch.fl.bed $bele.ch.fl.bim $bele.ch.fl.fam\n" if $bfile_count++ > 0 ;
	    push @s2_bfiles, "$bele.ch.fl";
	}
    }
#    elsif ($b1ref) {
#	my $cmd = "checkflip2 --sfh 0.2 --info $workdir/$fbfile.ref.bp.sorted $bele.bim";
#	print "$cmd\n";
#	&mysystem($cmd);
#	&mysystem ("mv $bele.fl.report $bele.fl.tar.gz $rootdir");
#	print MERGE "$bele.fl.bed $bele.fl.bim $bele.fl.fam\n" if $bfile_count++ > 0 ;
#    }
    else {
	print MERGE "$b_short.bed $b_short.bim $b_short.fam\n" if $bfile_count++ > 0 ;
    }
    $bs_count++;
}
close (MERGE);

#exit 2;


print "Merging now\n"  if ($debug);


if ($reffile ne "NOFILE") {
    $fbfile .= ".ch.fl"
}


my $fbfile_short = $fbfile;
$fbfile_short =~ s/.*\///;

my $merge_txt = "--merge-list $merge_list_name";
if (@ARGV == 1) {
    $merge_txt = "";
}

my $system="$p2loc/plink --silent --memory 2000 --allow-no-sex --bed $fbfile_short.bed --bim $fbfile_short.bim --fam $fbfile_short.fam $merge_txt --out $out.s1 --make-bed" ;
#    print "$system\n";
&mysystem ("$system");


###############################################
### renew interesecting list
#################################################

my $s2_count = 0;
if ($reffile ne "NOFILE") {
    print "evaluate intersection of S2\n";
    my %count_s2;
    my $fc = 1;
    foreach my $bele_s2 (@s2_bfiles) {
	print "processing $bele_s2.bim\n";
	
	open INFILE, "$bele_s2.bim" or die "($bele_s2.bim) $!";
	while (<INFILE>) {
	    chomp;
	    my @cells = @{&split_line_ref(\$_)};
	    if ($fc > 1) {
		next unless (exists $count{$cells[1]});
	    }
	    $count_s2{$cells[1]}++;
	}
	close (INFILE);
	
	$fc++;
    }
    
    my @common_snps_s2;
    my $n_bfiles_s2 = @s2_bfiles;
    foreach my $element (keys %count_s2) {
	if ($count_s2{$element} == $n_bfiles) {
	    push @common_snps_s2, "$element";
	}
    }
    
    
    open INTER, ">", "SNPs.intersec.s2" or die $!;
    foreach my $sname (@common_snps_s2) {
	print INTER "$sname\n";
	$s2_count++;
    }
    close (INTER);

}
else {
    $s2_count = $s1_count;
}

if ($s2_count != $s1_count) {
    print "create stage2\n";
    my $system="$p2loc/plink --silent --memory 2000 --bfile $out.s1 --out $out --extract SNPs.intersec.s2 --make-bed" ;
    &mysystem ($system);
}
else {
    print "no further adjustment on overlap\n"  if ($debug);
    &mysystem ("mv $out.s1.bed $out.bed");
    &mysystem ("mv $out.s1.bim $out.bim");
    &mysystem ("mv $out.s1.fam $out.fam");
}
#    print "$system\n";



print "cleaning up\n"  if ($debug);

&mysystem ("mv $out.bim $rootdir");
&mysystem ("mv $out.bed $rootdir");
&mysystem ("mv $out.fam $rootdir");
&mysystem ("touch $rootdir/$out.fini");


#exit;

&mysystem ("rm -rf $workdir");

exit 0;

