#!/usr/bin/env perl
use strict;

#############################
# load utility functions
#############################

use FindBin;
use lib "$FindBin::Bin";
use Ricopili::Utils qw(trans);

# Version string shown in the usage text.
my $version = "1.0.0";

# Name this script was invoked under, with any leading directory part
# stripped (copy $0 first, then edit the copy in place).
(my $progname = $0) =~ s!^.*/!!;


#############################
# read config file
#############################

# Directory prefix for the reference files used further down (the SNP
# position tables and the liftover chain files are all addressed as
# "$liloc/<name>"). The value is looked up under the key "liloc" via trans()
# imported from Ricopili::Utils -- presumably read from the ricopili config
# file; confirm against that module.
my $liloc = &trans("liloc");




#####################################################
use lib $ENV{rp_perlpackages};



# SNP position tables, one per candidate build. The bim-file is compared
# against each of them in this order.
my @bu_files = (
    "$liloc/snp.txt.pos.scz49.gz",
    "$liloc/snp125.txt.pos.scz49.gz",
    "$liloc/snp130.txt.pos.scz49.gz",
    "$liloc/snp138.txt.pos.scz49.gz",
);

# Liftover chain files, index-for-index parallel to @bu_files: the index of
# the best matching SNP table is used to pick the chain file.
my @li_files = (
    "$liloc/hg16ToHg19.over.chain.gz",
    "$liloc/hg17ToHg19.over.chain.gz",
    "$liloc/hg18ToHg19.over.chain.gz",
    "$liloc/hg19ToHg19.over.chain.gz",    ## fake: placeholder that keeps the two lists aligned
);


##### help message
# Usage text; printed for --help and, as part of the error message, when the
# number of positional arguments is wrong.
my $usage = "
Usage : $progname bim-file

version: $version

  -help            print this message and exit

  --lift19         lift dataset to hg19
  --noclean        do not remove temporary files

  --debug          extended ouput



 guesses the build of a bim file out of ucsc snp file

  find here the helping files:
    $liloc

 created by Stephan Ripke 2014 at MGH, Boston, MA
 in the frame of the PGC

";

use Getopt::Long;

# Command line switches; each one stays undef unless given.
#   $help    - print the usage text and exit
#   $debug   - print progress messages
#   $lift19  - lift the dataset to hg19 once the build is known
#   $noclean - keep the temporary .noma files
my ($help, $debug, $lift19, $noclean);
GetOptions(
    "help"    => \$help,
    "debug"   => \$debug,
    "lift19"  => \$lift19,
    "noclean" => \$noclean,
);

# Asking for help is not an error: show the usage text on STDOUT and leave
# with exit status 0 instead of dying with a "wrong:" message.
if ($help) {
    print $usage;
    exit 0;
}

# Exactly one positional argument (the bim-file) is expected. The message
# lists what was actually left on the command line: that is @ARGV, whereas
# the scalar $ARGV is the name of the file currently read through <> and is
# undefined at this point.
die("wrong: @ARGV\n$usage") if (@ARGV != 1);



##########################################
# subroutine to split a plink-output-line with references
##########################################

# split_line_ref(\$line)
#
# Takes a reference to a line of text and returns a reference to an array
# holding its whitespace-separated columns. The line ending and any leading
# or trailing whitespace do not produce columns. The caller's string is not
# modified because all work happens on a copy.
sub split_line_ref {
    my $text = ${ $_[0] };
    chomp $text;

    # split ' ' skips leading whitespace and then splits on runs of
    # whitespace, so no separate trimming step is required.
    return [ split ' ', $text ];
}

###################################################
###  system call with test if successful
###################################################

# mysystem(@command_parts)
#
# Joins all arguments with single spaces into one command string, runs it
# with system() and dies unless the command finished cleanly. It dies when
# the command could not be started, was terminated by a signal, or returned
# a non-zero exit status. The return value carries no information.
#
# The sub is declared without a prototype: it takes arguments, and an empty
# "()" prototype would reject them at every call site not written as
# &mysystem(...).
sub mysystem {
    my ($systemstr) = "@_";
    system($systemstr);

    # $? is the raw wait status: -1 means the command could not be started,
    # the low seven bits hold the terminating signal and the high byte the
    # exit status. Looking only at ($? >> 8) would report a command killed
    # by a signal as a success.
    return if ($? == 0);

    if ($? == -1) {
        die "$systemstr\n->system call failed: could not execute command: $!";
    }

    my $signal = $? & 127;
    if ($signal) {
        die "$systemstr\n->system call failed: terminated by signal $signal";
    }

    my $status = ($? >> 8);
    die "$systemstr\n->system call failed: $status";
}




# Name of the bim-file to analyse. The ".bim" extension may be left off on
# the command line; it is appended here when missing.
my $bim_file = $ARGV[0];

# The dot has to be escaped: an unescaped "." matches any character, so a
# prefix that merely ends in "bim" (e.g. "studybim") would not get its
# extension appended and would then lose its last four characters below.
unless ($bim_file =~ /\.bim$/) {
    $bim_file .= ".bim";
}

# Dataset prefix, i.e. the bim-file name without ".bim"; used further down
# to address the matching .bed/.bim/.fam files.
my $bfile = $bim_file;
$bfile =~ s/\.bim$//;

# Set of positions found in the bim-file. Each key is "<column 1> <column 4>"
# of one line (presumably chromosome and base-pair position -- confirm
# against the bim format); the value is always 1.
my %bpos=();

use Compress::Zlib ;

## read bim-file
print "read bim-file positons\n" if ($debug);

# Three-argument open with a lexical handle: the file name is taken
# literally, so characters such as ">" or "|" or surrounding blanks in the
# name cannot change the open mode or start a command.
open(my $bim_fh, '<', $bim_file) or die "$!:$bim_file";
while (my $bim_line = <$bim_fh>) {
    my @cells = @{ split_line_ref(\$bim_line) };

    # Blank or truncated lines have no fourth column; storing them would add
    # a bogus key such as " " that can never match and inflates the counts.
    next if (@cells < 4);

    $bpos{ $cells[0]." ".$cells[3] } = 1;
}
close $bim_fh;

# Number of distinct positions (duplicate positions share one key).
my $nbim=keys(%bpos);
print "size of bim-file: $nbim\n\n" if ($debug);




# Highest number of matched positions seen over all reference files so far.
my $max_ma = 0;
# Label of the best matching reference file (derived from its file name).
my $buig;

print "start comparison\n" if ($debug);

# Summary table: one row per reference file with the number of bim
# positions found / not found in it.
my $nc_file = "$bim_file.buigue.noma_comp";
open(my $nc_fh, '>', $nc_file) or die $!."($bim_file).buigue.noma_comp";
print {$nc_fh} "build\tN_match\tN_nomatch\n";

my $bucount = 0;    # index of the reference file currently being compared
my $licount = 0;    # index of the best match so far; later selects the chain file

# Names of the per-reference files listing unmatched positions; they are
# temporary and get removed at the end of the script unless --noclean is set.
my @noma_files;

foreach my $bufile (@bu_files) {
    ## compare with snp-collection

    # Working copy of the bim positions: every position found in the
    # reference is deleted from it, so each position is counted at most once
    # and whatever remains after the scan is the unmatched set.
    my %test_bpos = %bpos;

    # Short label for this reference: the file name from "snp" onwards.
    my $bush = $bufile;
    $bush =~ s/.*snp/snp/;
    # NOTE(review): with the names listed in @bu_files (ending in
    # ".txt.pos.scz49.gz") this pattern does not match, so the label keeps
    # its full suffix. It is left unchanged because the .noma file names and
    # the content of the .buigue file depend on it -- confirm the intent.
    $bush =~ s/.txt.pos.gz//;
    print "comparing with $bufile\n" if ($debug);

    my $sc = gzopen($bufile, "rb")  or die "Cannot open $bufile: $gzerrno\n" ;

    my $noma_file = "$bim_file.$bush.noma";
    open(my $noma_fh, '>', $noma_file) or die $!."($bim_file).$bush.noma";
    push @noma_files, $noma_file;

    my $nmatch = 0;
    while ($sc->gzreadline(my $line) > 0) {
        my @cells = @{ split_line_ref(\$line) };

        # A key needs columns 2 and 4 of the reference line; shorter lines
        # cannot describe a position.
        next if (@cells < 4);

        my $pos = $cells[1]." ".$cells[3];
        if (exists $test_bpos{$pos}) {
            $nmatch++;
            delete $test_bpos{$pos};
        }
    }
    $sc->gzclose();

    # Everything still in the working copy was not found in this reference.
    my $nnomatch = 0;
    foreach my $posx (keys %test_bpos) {
        print {$noma_fh} "$posx\n";
        $nnomatch++;
    }
    # Buffered write errors only surface at close, so check it.
    close($noma_fh) or die "$!:$noma_file";

    print {$nc_fh} "$bufile\t$nmatch\t$nnomatch\n";

    # Strictly greater: on a tie the earlier reference file stays the winner.
    if ($nmatch > $max_ma) {
        $buig    = $bush;
        $licount = $bucount;
        $max_ma  = $nmatch;
    }

    $bucount++;
}
close($nc_fh) or die "$!:$nc_file";

# Without a single matched position no build can be guessed. Report it and
# leave with a non-zero exit status so that calling scripts can detect the
# failure (a bare "exit" would signal success).
if ($max_ma == 0) {
    print "Error: maximum of matched positions is 0, plese check format of $bim_file\n";
    exit 1;
}


# Text written to the .liftover_script file: either the liftover command
# that was run or the note that none was needed.
my $cmd_out = "no liftover necessary to get to hg19\n";
if ($lift19){
    # The last entry of @li_files is the hg19-to-hg19 placeholder, so only
    # the indices before it call for a real liftover.
    if ($licount < $#li_files) {
        my $sys_str = "lift18219 --noex --lilofile $li_files[$licount] $bim_file";
        $cmd_out = $sys_str;
        &mysystem($sys_str);
    }
    else {
        # Already hg19: provide the ".hg19" names as symbolic links to the
        # existing files. symlink() does this without starting a shell, so
        # file names need no quoting.
        # NOTE(review): the link target is stored exactly as given; if the
        # prefix contains a directory part the link is resolved relative to
        # the link's own directory (same as with "ln -s") -- confirm callers
        # run this inside the data directory.
        foreach my $ext (qw(bed bim fam)) {
            symlink("$bfile.$ext", "$bfile.hg19.$ext")
                or die "$!:$bfile.hg19.$ext";
        }
    }
}

my $script_file = "$bim_file.buigue.liftover_script";
open(my $script_fh, '>', $script_file) or die $!."($bim_file).buigue.liftover_script";
print {$script_fh} "$cmd_out\n";
# Buffered write errors only surface at close, so check it.
close($script_fh) or die "$!:$script_file";



# Result file: the label of the best matching reference build.
my $build_file = "$bim_file.buigue";
open(my $build_fh, '>', $build_file) or die $!."($bim_file).buigue";
print {$build_fh} "$buig\n";
# Buffered write errors only surface at close, so check it.
close($build_fh) or die "$!:$build_file";
print "success: $bim_file.buigue\n" if ($debug);

# Marker file telling the caller that this run has finished.
&mysystem("touch $bim_file.fini");


# Remove the per-reference lists of unmatched positions unless --noclean was
# given. unlink() deletes the files directly; starting one "rm" shell
# command per file would break on names containing blanks or shell
# metacharacters.
unless ($noclean) {
    foreach my $nomafile (@noma_files) {
        unlink($nomafile) or die "$!:$nomafile";
    }
}

