#!/usr/bin/perl
use strict;

use Compress::Zlib ;

my $out_name = "metaber";

my $info_th = "0.1";

#############################
# read config file
#############################

my $conf_file = $ENV{HOME}."/ricopili.conf";
my %conf = ();

die $!."($conf_file)" unless open FILE, "< $conf_file";
while (my $line = <FILE>){
    my @cells = split /\s+/, $line;
    $conf{$cells[0]} = $cells[1];
}
close FILE;

sub trans {
    my ($expr)=@_;
    unless (exists $conf{$expr}) {
	die "config file without entry: $expr\n";
    }
    $conf{$expr};
}

#my $ploc = &trans("ploc");
my $sloc = &trans("sloc");
my $meloc = &trans("meloc");
my $rloc = &trans("rloc");

#################################################
my $r_sys = "$rloc/R";
if ($rloc eq "broadinstitute") {
    $r_sys = "source /broad/software/scripts/useuse; use R-2.14; R";
}


use Getopt::Long;
GetOptions( 
   "out=s"=> \$out_name,
   "info_th=s"=> \$info_th,
#   "beta"=> \my $beta,

   "help"=> \my $help,
   "nofilter"=> \my $nofilter,
   "detout"=> \my $detout,
   "xdf"=> \my $xdf,

 );

if ($help || $out_name eq "metaber"){
    print "usage: $0 danerfile1 danerfile2 ....

      options:

	--help             print this message and exit
        --info_th STRING   threshold for info-score to get into met-analysis
                             defaut $info_th
                             !!! use zero before period  !!!
        --nofilter         no filter at all
#        --beta             if there is BETA instead of OR present

         --detout          also include a detailed out-file with all P, OR, SE

        --xdf              do a xdf-P instead of weighted meta
        --out STRING       outname


 works with files coming out of daner (incl. ngt)
 uses metal

 --out is mandatory

 created by Stephan Ripke 2010 at MGH, Boston, MA
 in the frame of the PGC
\n";
    exit 2;
}

#my $out_name = "metaber_".$out_name;


###################################################
###  system call with test if successfull
###################################################

sub mysystem(){
    my ($systemstr)="@_";
    system($systemstr);
    my $status = ($? >> 8);
    die "$systemstr\n->system call failed: $status" if ($status != 0);
}


#####################################
# subroutine to count lines of a file
#####################################

sub count_lines {
    my ($file)=@_;
    my $lc=0;
    die "$file: ".$! unless open FILE, "< $file";
    while (<FILE>){
	$lc++;
    }
    close FILE;
    $lc;
}


##########################################
# subroutine to split a plink-output-line
##########################################

sub split_line {
    my ($line)=@_;
    chomp($line);
    $line =~ s/^[\s]+//g;
    my @cols=  split /\s+/, $line;
}


#####################################
# print array to file
####################################

sub a2file {
    my ($file, @lines)=@_;
    die "$!: $file" unless open FILE, "> $file";
    foreach (@lines){
	print FILE $_;
    }
    close FILE;
}



########################################
# test plink and beagle
###################################


#unless (-e "$ploc/plink" ){
 #   print "\n***** Error: couldn't find the following:\n";
  #  print "$ploc/plink\n";
   # exit;
#}



#####################################
### BEGIN
#####################################

my @forest_files=();

#####################################
### perform meta, so we see, which SNPs come out of it.
#####################################


#print "$ploc/plink --out $out_name.comb --meta-analysis @ARGV\n";
#exit;
my $metal_script_name = "$out_name.metal";
my $metal_script = "SCHEME STDERR\n";

if (@ARGV == 1) {
    my $file = $ARGV[0];
    if ($file =~ /.gz$/) {
	&mysystem ("cp $file $out_name.metadaner.gz");
    }
    else {
	&mysystem ("gzip -c $file > $file.gz");
	&mysystem ("cp $file.gz $out_name.metadaner.gz");
    }
    exit;
}


my $or_txt = "";
my $die = 0;

foreach my $file (@ARGV) {

    my $df = gzopen("$file", "rb")  or die "Cannot open $file: $gzerrno\n" ;
    $df->gzreadline(my $line);
    my @cells = &split_line($line);

    my $sollc = @cells;
    my $ld_sw = 0;

# CHR         SNP          BP  A1  A2     FRQ    INFO      OR      SE       P
# CHR         SNP          BP  A1  A2   FRQ_A_276   FRQ_U_521    INFO      OR      SE       P   ngt

    foreach (@cells) {
	### check for OR column, but also for the first file
	if ($_ eq "BETA") {
	    die "OR and BETA in header, or file-inconsistency: $file" if ($or_txt eq "log(OR)");
	    $or_txt = "BETA";
	}
	if ($_ eq "OR") {
	    die "OR and BETA in header, or file-inconsistency: $file" if ($or_txt eq "BETA");
	    $or_txt = "log(OR)";
	}
	if ($_ eq "R2-LD") {
	    $ld_sw = 1;
	}
    }
    die "no OR or BETA in header\n@cells\n" if ($or_txt eq "");

    my $cc = 0;
    my $line = "ha";
#    print "new gfil\n";
    while ($df->gzreadline(my $line_loc)){
	$cc++;
	chomp($line_loc);
#	print "rig: ->$line_loc<-\n";
	unless ($line_loc eq "") {
	    $line = $line_loc;
	}
    }
#    print "file: $file: $cc\n";
#    print "line: <$line>: $cc\n";

    my @cells = &split_line($line);
    if ($sollc != @cells && $line ne "ha") {
#	print @cells." ncells\n";
#	print "->".$line."<- line\n";
#	exit;
	my $cc_loc = @cells;
	&mysystem ("rm $file");
	print "column count problem in $file, $sollc, $cc_loc,\n@cells\n" ;
	$die = 1;

    }
    
    $df -> gzclose();

    my $filtertxt = "ADDFILTER INFO > 0$info_th";
    if ($ld_sw == 1){
	$filtertxt = "ADDFILTER R2-LD > 0$info_th";
    }
    if ($nofilter) {
	$filtertxt = "";
    }

my $process_templ ="
MARKER   SNP
WEIGHT   N
ALLELE   A1 A2
FREQ     FRQ_A_218
EFFECT   $or_txt
STDERR   SE
PVAL     P
$filtertxt

PROCESS $file
";

$metal_script .= $process_templ;
}

if ($die == 1){
    die "column count problem\n" ;
}


#exit;
$metal_script .= "OUTFILE $out_name.meta .tbl\n";
$metal_script .= "ANALYZE HETEROGENEITY";
#ANALYZE HETEROGENEITY

#print "$metal_script\n";

&a2file ($metal_script_name,$metal_script);
&mysystem ("$meloc/metal < $metal_script_name");

#&mysystem ("$ploc/plink --out $out_name.comb --meta-analysis @ARGV");
#exit;
my $nca = 0;
my $nco = 0;

my %fca ;
my %nca ;
my %fco ;
my %nco ;
my %ngt ;
my %nmet ;
my %info ;
my %a1 ;
my %a2 ;
my %chr ;
my %pos ;
my %p_str ;
my %det_str ;



# CHR         SNP          BP  A1  A2   FRQ_A_218   FRQ_U_323    INFO      OR      SE       P   ngt

#####################################
### read all single files 
#####################################

my $fcount = 0;
my %col = ();
foreach my $dfile (@ARGV) {

    my $df = gzopen("$dfile", "rb")  or die "Cannot open $dfile: $gzerrno\n" ;
    $df->gzreadline(my $line);
    my @cells = &split_line($line);


    my $nca_loc;
    my $nco_loc;
 #   my $nf_loc;


# CHR         SNP          BP  A1  A2     FRQ    INFO      OR      SE       P
# CHR         SNP          BP  A1  A2   FRQ_A_276   FRQ_U_521    INFO      OR      SE       P   ngt
    %col = ();
    my $cc =0;
    foreach (@cells) {
	$col{$_} = $cc;
	$cc++;
    }

#    my $chr_col = 0;
#    my $snp_col = 1;
#    my $pos_col = 2;
#    my $a1_col = 3;
#    my $a2_col = 4;
#    my $fa_col = 5;
#    my $fu_col = 6;
#    my $info_col = 7;
#    my $or_col = 8;
#    my $se_col = 9;
#    my $p_col = 10;
#    my $ngt_col = 11;


#FRQ_Cases FRQ_Controls

    my $cc =0;
    foreach (@cells) {

	if ($_ =~ /FRQ_Cases_/) {
	    $nca_loc = $_;
	    $nca_loc =~ s/FRQ_Cases_//;
	    $col{"FRQA"} = $cc;
	}
	if ($_ =~ /FRQ_Controls_/) {
	    $nco_loc = $_;
	    $nco_loc =~ s/FRQ_Controls_//;
	    $col{"FRQU"} = $cc;
	}

	if ($_ =~ /FRQ_A_/) {
	    $nca_loc = $_;
	    $nca_loc =~ s/FRQ_A_//;
	    $col{"FRQA"} = $cc;
	}
	if ($_ =~ /FRQ_U_/) {
	    $nco_loc = $_;
	    $nco_loc =~ s/FRQ_U_//;
	    $col{"FRQU"} = $cc;
	}

	if ($_ = /MAF/) {
	    $col{"FRQ"} = $cc;
	}

#	if ($_ eq "FRQ") {
#	    $nf_loc = $_;
#	    $nco_loc =~ s/FRQ_U_//;
#	}
	$cc++;
    }

    unless (exists $col{"FRQA"}){
	print "$dfile; no FRQ_A, so I take FRQ\n";


	if (exists $col{"FRQ"}){
	    print "found FRQ\n" ;
	    $col{"FRQA"} = $col{"FRQ"};
	}
	elsif (exists $col{"FRQU"}) {	    
	    print "found FRQU\n" ;
	    $col{"FRQA"} = $col{"FRQU"};
	}
	else {
	    die "no FRQ column" ;
	}


    }
    unless (exists $col{"FRQU"}){
	print "$dfile: no FRQ_U, so I take FRQ\n";
	die "no FRQ column" unless (exists $col{"FRQ"});
	$col{"FRQU"} = $col{"FRQ"};
    }

    unless (exists $col{"FRQU"}){
	print "no frequency (unaff.) column in $dfile\n";
	die;
    }

    unless (exists $col{"FRQA"}){
	print "no frequency (aff.) column in $dfile\n";
	die;
    }

    my $no_fca = 0;
    if ($nca_loc == 0){
	$nca_loc = 100;
	$no_fca = 1;
    }
    $nco_loc = 100 if ($nco_loc == 0);


    $nca += $nca_loc;
    $nco += $nco_loc;


    my $ngt_sw = 1;
    $ngt_sw = 0 unless (exists $col{"ngt"});

    while ($df->gzreadline(my $line)){
	chomp($line);
	my @cells = &split_line($line);
#	print $cells[1]."\n";
	## read basic information
#	if ($fc == 1) {
	my $flip = 0;
	unless (exists $chr{$cells[$col{"SNP"}]}){
	    $a1{$cells[$col{"SNP"}]} = $cells[$col{"A1"}];
	    $a2{$cells[$col{"SNP"}]} = $cells[$col{"A2"}];
	    $chr{$cells[$col{"SNP"}]} = $cells[$col{"CHR"}];
	    if (exists $col{"POS"}) {
		$pos{$cells[$col{"SNP"}]} = $cells[$col{"POS"}];
	    }
	    else {
		$pos{$cells[$col{"SNP"}]} = $cells[$col{"BP"}];
	    }

	}
	else {
	    unless ($cells[$col{"A1"}] eq $a1{$cells[$col{"SNP"}]}){
		$flip = 1;
	    }
	}
#	}

	my $info_loc = $cells[$col{"INFO"}];

#	print $info_loc."\t";
#	print $info_th."\n";
	if ($info_loc < $info_th){
#	    $p_str{$cells[$col{"SNP"}]} .= "\tNA";

	    if ($detout) {
		$det_str{$cells[$col{"SNP"}]} .= "\t-";
		$det_str{$cells[$col{"SNP"}]} .= "\t-";
		$det_str{$cells[$col{"SNP"}]} .= "\t-";
	    }
	    next ;
	}

	my $tmp = $cells[$col{"FRQA"}];
#	print "freq: $tmp\n";

	my $fca_loc = $cells[$col{"FRQA"}];
	my $fco_loc = $cells[$col{"FRQU"}];
#	print $cells[$col{"SNP"}].": $fca_loc\t$fco_loc\n";
	if ($no_fca){
	    $fca_loc = $cells[$col{"FRQU"}] * $cells[$col{"OR"}];
	}
#	print $cells[$col{"SNP"}].": $fca_loc\t$fco_loc\n\n";
	if ($flip == 1){
	    $fca_loc = 1 - $fca_loc;
	    $fco_loc = 1 - $fco_loc;
#	    print "FLIP\n";
	}

	$nca{$cells[$col{"SNP"}]} += $nca_loc;
	$nco{$cells[$col{"SNP"}]} += $nco_loc;
	$fca{$cells[$col{"SNP"}]} += $fca_loc * $nca_loc;
	$fco{$cells[$col{"SNP"}]} += $fco_loc * $nco_loc;
	$info{$cells[$col{"SNP"}]} += $cells[$col{"INFO"}] * $nco_loc;
	if ($ngt_sw ==1) {
	    $ngt{$cells[$col{"SNP"}]} += $cells[$col{"ngt"}] ;
	}
	$nmet{$cells[$col{"SNP"}]}++;

	$p_str{$cells[$col{"SNP"}]} .= "\t".$cells[$col{"P"}];

	if ($detout) {
	    $det_str{$cells[$col{"SNP"}]} .= "\t".$cells[$col{"P"}];
	    $det_str{$cells[$col{"SNP"}]} .= "\t".$cells[$col{"OR"}];
	    $det_str{$cells[$col{"SNP"}]} .= "\t".$cells[$col{"SE"}];
	}

    }

    $df -> gzclose();
    $fcount++;
}

print "$nca\n";
print "$nco\n";
#print "debug\n";
#sleep(10);


################################################
#############################################
### DETOUT
################################################
###############################################

my %pxdf;

################################################
### write p-values for xdf test
###############################################

if ($xdf) {
    die "$out_name.detout: $!" unless open PSTR, "> $out_name.detout"; 
    foreach (keys %p_str) {
	
#    print PSTR $_.$p_str{$_};
	print PSTR $p_str{$_};
	if ($nmet{$_} != $fcount) {
	    foreach my $cloc (1..($fcount - $nmet{$_})){
		print PSTR "\tNA";
	    }
	}
	print PSTR "\n";
    }
    
    close PSTR;
    
    my $nxdf = keys %p_str;
}
    
################################################
#############################################
### XDF
################################################
###############################################

my %pxdf;

################################################
### write p-values for xdf test
###############################################

if ($xdf) {
    die "$out_name.p_str: $!" unless open PSTR, "> $out_name.p_str"; 
    foreach (keys %p_str) {
	
#    print PSTR $_.$p_str{$_};
	print PSTR $p_str{$_};
	if ($nmet{$_} != $fcount) {
	    foreach my $cloc (1..($fcount - $nmet{$_})){
		print PSTR "\tNA";
	    }
	}
	print PSTR "\n";
    }
    
    close PSTR;
    
    my $nxdf = keys %p_str;
    
    
################################################
### perform xdf test in R
###############################################
    
    my $R_templ = '

matrix(scan("INNAME"),NSNPS,NSTUDIES,byrow=T)-> p5m

xdf <- function(x){ 
       1-pchisq (sum(qchisq(1-x,1),na.rm=T),length(na.omit(x))) 
   }

apply(p5m, 1, xdf)-> xdf_result
write.table(xdf_result,"OUTNAME",col.names=F,row.names=F)
';
    
    
    $R_templ =~ s/INNAME/$out_name.p_str/g ;
    $R_templ =~ s/OUTNAME/$out_name.xdf/g ;
    $R_templ =~ s/NSTUDIES/$fcount/g ;
    $R_templ =~ s/NSNPS/$nxdf/g ;
    
    
    &a2file ( "R_$out_name.xdf" , $R_templ);
    &mysystem("$r_sys < R_$out_name.xdf --vanilla ");
    
    
    
    
    
################################################
### XDF out
###############################################
    
    
    die "$out_name.xdf: $!" unless open XDF, "< $out_name.xdf"; 
    
    foreach (keys %p_str) {
	my $line = <XDF>;
	chomp($line);
	$pxdf{$_} = $line; 
    }
    close XDF;
    
}

#exit;

################################################
### read combined meta, write new daner_file
###############################################

#MarkerName  Allele1  Allele2  Effect   StdErr   P-value  Direction  HetISq  HetChiSq  HetDf  HetPVal

die "$out_name.meta: $!" unless open META, "< $out_name.meta1.tbl"; 
die "$out_name.metadaner.tmp: $!" unless open DANER, "> $out_name.metadaner.tmp"; 
my $header = <META>;
print DANER "CHR\tSNP\tBP\tA1\tA2\tFRQ_A_$nca\tFRQ_U_$nco\tINFO\tOR\tSE\tP\tngt\tDirection\tHetISqt\tHetChiSq\tHetDf\tHetPVa";

if ($detout) {
    foreach my $dfile (@ARGV) {
	print DANER "\tP-$dfile\tOR-$dfile\tSE-$dfile";
    }
}
print DANER "\n";

#print DANER "CHR\tSNP\tBP\tA1\tA2\tFRQ_A_$nca\tFRQ_U_$nco\tINFO\tOR\tSE\tP\tngt\tDirection\tHetISqt\tHetChiSq\tHetDf\tHetPVa\txdfPVa\n";

while (<META>){
    chomp;
    my @cells = &split_line($_);
#    my $snp_name = $cells[2];
    my $snp_name = $cells[0];
    my $a1_name = $cells[1];
    my $a2_name = $cells[2];
    my $effect = $cells[3];


    unless ($a1_name =~ m/\+[0-9]+a/ || $a1_name =~ m/\+[0-9]+b/) {    
	$a1_name = uc($a1_name);
    }
    else {
	my @st = split '\+', $a1_name;
	$st[0] = uc($st[0]);
	$a1_name = $st[0]."+".$st[1];
    }
    unless ($a2_name =~ m/\+[0-9]+a/ || $a2_name =~ m/\+[0-9]+b/) {    
	$a2_name = uc($a2_name);
    }
    else {
	my @st = split '\+', $a2_name;
	$st[0] = uc($st[0]);
	$a2_name = $st[0]."+".$st[1];
    }
#    $a2_name = uc($a2_name);


    if ($a1_name ne $a1{$snp_name} || $a2_name ne $a2{$snp_name}){
	if ($a1_name eq $a2{$snp_name} && $a2_name eq $a1{$snp_name}) {
	    $effect = $effect * (-1);
	}
	else {
	    print "allelenames at $snp_name changed, $a1_name, $a1{$snp_name}, $a2_name, $a2{$snp_name} -> ERROR\n";
	    die;
	}
    }

#    if ($cells[12] <= 0 || $cells[12] > 1) {
#	die "unvalid p $cells[12] at $snp_name";
 #   }

    die "$snp_name" unless (exists $chr{$snp_name});

    my $out_row = "";

    $out_row .= $chr{$snp_name};
    $out_row .= "\t".$snp_name;
    $out_row .= "\t".$pos{$snp_name};
    $out_row .= "\t".$a1{$snp_name};
    $out_row .= "\t".$a2{$snp_name};

#    printf DANER "\t%.3g",$fca{$snp_name}/$nca;
#    printf DANER "\t%.3g",$fco{$snp_name}/$nco;
#    printf DANER "\t%.3g",$info{$snp_name}/$nco;
    if ($nca{$snp_name} == 0) {
	print "***warning: no meta case-number for $snp_name\n";
	$nca{$snp_name} = 1;
	$fca{$snp_name} = 0;
    }
    if ($nco{$snp_name} == 0) {
	print "***warning: no meta control-number for $snp_name\n";
	$nco{$snp_name} = 1;
	$fco{$snp_name} = 0;
	$info{$snp_name} = 0;
    }

#    die "$nca ($snp_name); fca: $fca{$snp_name}, fco $fca{$snp_name}" if ($nca{$snp_name} == 0);
#    die "$nco ($snp_name)" if ($nco{$snp_name} == 0);

    $out_row .= sprintf  "\t%.3g",$fca{$snp_name}/$nca{$snp_name};
#    print "fca:".$fca{$snp_name};
#    print "\tnca:".$nca{$snp_name};
#    print "\n";
    $out_row .= sprintf  "\t%.3g",$fco{$snp_name}/$nco{$snp_name};
    $out_row .= sprintf  "\t%.3g",$info{$snp_name}/$nco{$snp_name};


    $out_row .= sprintf  "\t%.5f",exp($effect);
    $out_row .= sprintf  "\t%.4g",$cells[4];
    if ($xdf){
	$out_row .= sprintf  "\t%.3g",$pxdf{$snp_name};
    }
    else {
	$out_row .= sprintf  "\t%.4g",$cells[5];
    }
    if (exists $ngt{$snp_name}) {
	$out_row .= "\t".$ngt{$snp_name};
    }
    else {
	$out_row .= "\t-";
    }
    for my $i (6..10) {
	$out_row .= "\t".$cells[$i];
    }
#    $out_row .= sprintf  "\t%.3g",$pxdf{$snp_name};
#    $out_row .= "\t".$nmet{$snp_name};


    if ($detout){
	$out_row .= "\t".$det_str{$snp_name};
    }
    $out_row .= "\n";
    print DANER "$out_row";


}
close META;
close DANER;
&mysystem ("gzip -f $out_name.metadaner.tmp");
&mysystem ("mv $out_name.metadaner.tmp.gz $out_name.metadaner.gz");

