#!/usr/bin/env perl
use strict;
#use warnings;

srand(0);

#############################
# load utility functions
#############################

use FindBin;
use lib "$FindBin::Bin";
use Ricopili::Utils qw(trans);
use Ricopili::Version;

#############################
# read config file
#############################


my $p2loc = &trans("p2loc");
my $qloc = &trans("queue");
#my $hmloc = &trans("hmloc");
my $email = &trans("email");
my $loloc = &trans("loloc");
my $bcmd = &trans("batch_jobcommand");



#######################################


my $serial = 0;


if ($bcmd eq "SERIAL") {
    $serial = 1;
    print "-----------------------------------------------------\n";
    print "switched on SERIAL mode because of configuration file\n";
}



$rp_header =~ s/MODULE/clump_nav3   /;
print "$rp_header\n";






###################################
# variables
####################################

my $cp1 = 0.0001;
my $cp2 = 0.0001;
my $cr2 = 0.25;
my $hq_freq = .02;
my $hq_info = .90;
my $qrat_th = .5;
my $cwindow = 500;

my $detformat = 1;

my $gene_range = 100;

my $version = "1.0.0";
my $progname = $0;
$progname =~ s!^.*/!!;
my $command_line = "$progname @ARGV";

my $rootdir = "";
my $info_txt = "";





my $outname ="";
my $pfile ="";
my $walltime = 4;
my $job_bn_th = 1000;
my $popname = "CEU";
my $sepa = 1;



use Getopt::Long;
GetOptions( "outname=s"=> \$outname,
	    "help"=> \my $help,
	    "clu_p1=f"=> \$cp1,
	    "clu_p2=f"=> \$cp2,
	    "clu_r2=f"=> \$cr2,
	    "hq_f=f"=> \$hq_freq,
	    "hq_i=f"=> \$hq_info,
	    "qrat=f"=> \$qrat_th,
	    "pfile=s"=> \$pfile,
	    "popname=s"=> \$popname,

	    
	    "clu_window=i"=> \$cwindow,
	    "grange=i"=> \$gene_range,
	    "refdir=s"=> \my $refdir,
	    "reinvo=s"=> \my $reinvo_file,
	    "prekno=s"=> \my $prekno_file,
	    "noindel"=> \my $noindel,

	    "det=i"=> \$detformat,

	    "force1"=> \my $force1,
	    "serial"=> \my $serial_sw,
	    "sepa=i"=> \$sepa,

	    "noclean"=> \my $noclean,
	    "bn_job=i"=> \$job_bn_th,
	    "debug"=> \my $debug,
	    
    );


if ($serial_sw) {
    $serial = 1;
}

if ($help || $outname eq ""){
    print "usage: $progname 

version: $version

      options:

        --refdir       reference dir, only one bed/bim/fam per chromosome in there, please 

        --out  STRING   outname
        --help          print this message then quit
        --clu_p1 FLOAT  p1 for clumping, default $cp1
        --clu_p2 FLOAT  p2 for clumping, default $cp2
        --clu_r2 FLOAT  r2 for clumping, default $cr2
        --clu_window INT window size, default $cwindow

        --det INT       detail format, default = $detformat
                          2: with R2 and genes

        --hq_f FLOAT    frequency cutoff, default $hq_freq
        --hq_i FLOAT    info cutoff, default $hq_info
        --qrat FLOAT    max ratio of question marks in the direction column
        --pfile STRING   pfile, containing SNP and P column header

        --prekno STRING  prekno_file


        --reinvo STRING reinvoke script
        --noindel       only SNPs with A,C,G,T as alleles are allowed

        --grange INT    for clumping in kb

        --noclean       do not remove temp subdir

  --force1           do not exit if same fail, but do this only once


  --serial          no sending jobs to queue all in one run
                           -> usually only used for testing 
  --sepa INT        use INT number of parallel jobs during serial


  --bn_job INT        submit INT jobs at a time

  --popname STRING    population for clumping, right now: eur, eas, sas, amr, afr

  --debug             extended output


 --outname is mandatory


example of prekno: 
SNP        Chr    start   end     Pmeta     meta
rs6426833   1     19.93   20.18   3.93e-35  anderson
rs11209026  1     67.30   67.54   5.12e-28  anderson
rs1801274   1     159.54  159.91  2.16e-20  anderson



 created by Stephan Ripke 2011 at MGH, Boston, MA
 in the frame of the PGC
\n";
    exit 2;
}


$popname = uc $popname;


my $oub_txt = '

We have used the dataset to carry out an additional test of the score analysis method described by the International Schizophrenia Consortium (ISC) in detail in their Supplementary Information file (REFERENCE).  The ISC used this method to test if aggregate effects of common SNPs could be replicated across GWAS datasets, and interpreted their  results as supporting the hypothesis that many common SNPs with small effects on risk contribute to schizophrenia risk (polygenic inheritance).     Briefly, this method involves using the association test results (log of the odds ratio) for   each of a set of SNPs from a training dataset to form quantitative scores whose ability to predict   case-control status in a test dataset is then evaluated:    
 
(1) Common SNPs in approximate linkage equilibrium were selected.  In the ISC report,   ~74,000 SNPs were selected which had MAF>2%, high call rate (>0.99) and had no   pairwise r2 value (LD) >0.25 in any 200-SNP sliding window.  Here, because our Stage 1   GWAS were performed on diverse genotyping platforms, we decided to perform   analyses in a slightly altered procedure: we used only SNPs with MAF>10%, then we   restricted to SNPs with very high imputation quality (info R2 score >0.9). This SNP set   underwent a p-value informed LD clumping procedure with the above parameters   (pairwise R2 <0.1, 500 SNP window). As opposed to the p-value blind LD pruning   procedure from the ISC, we could here examine SNP groups with far lower p-values.   Because of the complicated LD structure and the widespread association signal in the   MHC, we decided to exclude this region (chr.6, 25Mb-35Mb) and we were left with 85K   LD independent SNPs.    
(2) In the training dataset, association tests are computed for each SNP (correcting for   ancestry-based covariates) and expressed as the log of the odds ratio for a test allele.     
(3) Several sets of quantitative scores are then computed for each case and control in   the test dataset, based on the pT (p-value threshold) proportion of SNPs with p-values in   the training dataset -- here, we varied pT from 0.0001 to 1.0.    
(4) For each set of SNPs as defined by pT, the score for each subject in the test dataset   is computed as the sum (across all selected SNPs) of the individual’s dosage of the test   allele multiplied by the training dataset log(OR) for that allele.    
(5) For each SNP set two outcome variables are reported: 1) The significance of the   case-control score difference was analyzed by standard logistic regression in R (REFERENCE), including ancestry based principal component scores and a study indicator as   covariates. 2) The proportion of variance explained (R2) was computed by subtracting   the Nagelkerke’s R2 attributable to ancestry covariates alone from the R2 for polygenic   scores plus covariates. The latter analysis required the package Design (REFERENCE).    The rationale for this approach is that it is possible that many SNPs make small   contributions to risk, with ORs ranging from those which are detectable in very large samples   (e.g., ~1.1 as reported for schizophrenia in our mega-analysis) to very small ORs which could   not be detected singly in any feasible sample.  Thus, some of these latter SNPs would produce very small ORs in the training dataset, but all of these SNPs would contribute to the ability of the   quantitative score to predict risk in other datasets.    In the ISC report, the ISC sample was used as the training dataset, and the MGS and   Cardiff samples as test datasets.  P-values for prediction of disease status were 2×10-28 in the   MGS sample and 5×10-11 in the smaller Cardiff sample, with approximately 2.3-3% of the   variance explained.  The ISC supplementary file reported simulation studies demonstrating that   the observed patterns of results for schizophrenia were consistent only with results from models   that included large numbers of common SNPs each with very small effects on risk, with the   models differing primarily in the distribution of these effects across those SNPs.  A range of   models for multiple rare variants did not produce results consistent with the actual data.    Numerous sources of possible confounders were studied, and it was concluded that the   observed results were most likely due to a polygenic contribution of multiple common SNPs,  each with small effects, to schizophrenia risk.   

Here we report on a new score analysis in we used INSERT as the training dataset, consisting of   XXX cases and XXX controls. All other Stage 1 samples were combined into a test   dataset, consisting of   XXX cases and XXX controls.   

 For each individual in the target sample, we weighted its individual post- imputation dosage by the log odds ratio from the discovery sample, building SNP collections   with p-value thresholds of p<0.0001, p<0.001, p<0.01, p<0.05, p<0.1, p<0.2, p<0.3, p<0.4, p<0.5, and p≤1.0 (i.e., all SNPs).    

We used PLINK’s --score function to calculate scores, described at this URL:   pngu.mgh.harvard.edu/~purcell/plink/profile.shtml. To account for population stratification, the   training analysis was performed in the usual logistic regression framework, including study   indicator and significant multi-dimensional scaling (MDS) scores. 
In the target sample, we   estimated the variance explained in disease state by the difference in the Nagelkerke pseudo r2  (REFERENCE) of an analysis including the score and covariates such as site and ancestry principal   component scores vs. an analysis with the covariates alone.    Table XXX shows R2 and P values for each pT value. 

If we saw a biggers in controls compared to cases we gave the R2-difference a negative sign to illustrate that negative correlation.

In summary all R2 values were oscillating around the zero and could not reach nominally significant values.


';

die "please specify refdir" unless ($refdir);

###################################################
###  system call with test if successfull
###################################################

sub mysystem(){
    my ($systemstr)="@_";
    system($systemstr);
    my $status = ($? >> 8);
    die "$systemstr\n->system call failed: $status" if ($status != 0);
}



#####################################
# print array to file
####################################

sub a2file {
    my ($file, @lines)=@_;
    die $! unless open FILE, "> $file";
    foreach (@lines){
	print FILE $_;
    }
    close FILE;
}



##########################################
# split a plink-output-line
##########################################

sub split_line {
    my ($line)=@_;
    chomp($line);
    $line =~ s/^[\s]+//g;
    my @cols=  split /\s+/, $line;
}


##########################################
# subroutine to split a plink-output-line with references
##########################################

sub split_line_ref {
    my ($line)=${$_[0]};
    chomp($line);
    $line =~ s/^[\s]+//g;
    my @cols=  split /\s+/, $line;
    \@cols;
}


#####################################
# append array to file with newline
####################################

sub a2filenew_app {
    my ($file, @lines)=@_;
    die "$!: $file" unless open FILE, ">> $file";
    foreach (@lines){
	print FILE "$_\n";
    }
    close FILE;
}


#####################################
# print array to file with newline
####################################

sub a2filenew {
    my ($file, @lines)=@_;
    die $! unless open FILE, "> $file";
    foreach (@lines){
	print FILE "$_\n";
    }
    close FILE;
}





############################################################
## testing binaries
##############################################################





my @test_scripts;



my $mutt_script = "mutt";
my $blueprint_script = "blueprint";            ### my.pipeline_tar


print ".......testing email program....\n" if ($debug);

my $err_scr = 0;
my $noti = 1;
{
    my $scr_path = '';
    
    for my $path ( split /:/, $ENV{PATH} ) {
	if ( -f "$path/$mutt_script" && -x _ ) {
	    print "$mutt_script\tfound in $path\n" if ($debug);
	    $scr_path = "$path/$mutt_script";
	    last;
	}
    }
    unless ( $scr_path ) {

	print "!!Warning!! : No $mutt_script command available, trying mail\n"  if ($debug);

	$mutt_script = "mail";
	for my $path ( split /:/, $ENV{PATH} ) {
	    if ( -f "$path/$mutt_script" && -x _ ) {
		print "$mutt_script\tfound in $path\n" if ($debug);
		$scr_path = "$path/$mutt_script";
		last;
	    }
	}
	unless ( $scr_path ) {
	    print "!!Warning!! : No $mutt_script command available, no email notifications\n" ;
	    $noti = 0;

	}
    }
 
}
die if $err_scr == 1;









#####################################
# send jobs to cluster and also send navi again
#####################################

my $sjadir = "";
my $sjaweek = 0;
my $sjaname = "";
my $sjarow = "";
my @sjaarray;
my $sjamem = 0;
my $sjatime = -1;
my $sjamaxjobs = 30000;

my $sjainfofile = "$loloc/clumper_info";
unless (-e $sjainfofile) {
    print "log-file ($sjainfofile) is not existing\n";
    print "please check loloc in ~/ricopili.conf\n";
    exit;
}
my $sjainfotxt = "";
my $sjamulti = 0;


sub send_jobarray {

    die "send_jobarray with undefined variables, dir" if ($sjadir eq "");
    die "send_jobarray with undefined variables, name" if ($sjaname eq "");
    die "send_jobarray with undefined variables, array" if (@sjaarray == 0);
    die "send_jobarray with undefined variables, mem" if ($sjamem == 0);
    die "send_jobarray with undefined variables, time" if ($sjatime < 0);
    die "send_jobarray with undefined variables, info" if ($sjainfotxt eq "");



    my $now = localtime time;
    $now =~ s/ /_/g;


    if ($sjaname eq "finished") {

	my $fini_message ;
	$fini_message .= "\n\n##################################################################\n";
	$fini_message .= "##### CONGRATULATIONS: \n";
	$fini_message .= "##### rp_pipeline_clump finished successfully:\n";
	$fini_message .= "##### $sjainfotxt\n";
	$fini_message .= "##### have a look at the distributions subdir for output files\n";
	$fini_message .= "##### have a look at the wiki page for more details\n"; 
	$fini_message .= "##### https://sites.google.com/a/broadinstitute.org/ricopili/\n";
	$fini_message .= "##################################################################\n";
	print "$fini_message\n";

	
	die $! unless open SUC, "> success_file";
	print SUC $fini_message."\n";
	close SUC;

	
	if ($noti == 1) {
	    &mysystem ('cat success_file | '.$mutt_script.' -s RP_clump_finished '.$email) ;
	}

	my $sjarow      = $sjainfotxt."\t$sjaname\t$now";
	&a2filenew_app("$sjainfofile",$sjarow);


	exit;

    }



    chdir ($sjadir);
    my $jobfile = "$sjaname.job_list";
    while (-e $jobfile) {
	$jobfile .= ".s";
    }
    &a2filenew ($jobfile, @sjaarray);

    $walltime = $sjatime;
    my $nsja = @sjaarray;

    my $nsja_loc = $nsja;
    if ($nsja_loc > 30000) {
	$nsja_loc = 30000;
    }

    my $multi_txt = "";
    if ($sjamulti > 0) {
	$multi_txt = "--multi $nsja_loc,$sjamulti";
    }

    ### with array
    my $sja_week_str = "";
    if ($sjaweek > 0) {
	$sja_week_str = "--week 1";
    }



    my $old_cmd = `tail -1 $sjainfofile | head -1`;
    my $nsja_txt = sprintf "%06d",$nsja;
    my $sjacontent = "$sjaname.".$nsja_txt;
    my $sjarow_part = $sjainfotxt."\t$sjacontent";
    my $sjarow      = $sjainfotxt."\t$sjacontent\t$now";
#    $message = $info_txt."\t$message\t$now";

    &a2filenew_app("$sjainfofile",$sjarow);

    if ($old_cmd =~ /$sjarow_part/) {


	unless ($force1 ){
	    my $err_message ;
	    $err_message .= "##################################################################\n";
	    $err_message .= "##### Error: \n";
	    $err_message .= "##### step $sjaname has been done repeatedly without any progress\n";

	    $err_message .= "##### clumper pipeline stopped\n";
	    $err_message .= "##### $sjainfotxt\n";
	    $err_message .= "##### if reason does not appear obvious\n";
	    $err_message .= "##### have a look at the wiki page\n"; 
	    $err_message .= "##### https://sites.google.com/a/broadinstitute.org/ricopili/\n";
	    $err_message .= "##### or contact the developers\n";
	    $err_message .= "##################################################################\n";
	    print "$err_message\n";

	    die $! unless open ERR, "> error_file";
	    print ERR $err_message."\n";
	    close ERR;


	    if ($noti == 1) {
		&mysystem ('cat error_file | '.$mutt_script.' -s RP_clump_error '.$email) ;
	    }
#	    unless ($serial) {
		exit;
#	    }
	}



    }


    
    #################################
    ## starting the job array
    ##################################
    if ($serial) {


	print "starting step $sjaname with ".@sjaarray." jobs\n" if ($debug);
	print "running up to $sepa parallel jobs.\n" if ($debug);


	my $jc = 1;
#	my $job_str = "";
	my @job_sepa_arr;

	foreach (@sjaarray) {
	    print "running job $jc...\n" if ($debug);
	    push @job_sepa_arr, "$_ &";
#	    $job_str .= "$_ & \n";

	    if ($jc % $sepa == 0) {
		push @job_sepa_arr, "wait";
		#		$job_str .= "wait\n";
		my $sepa_file = "$sjaname.sepa.$jc";
		&a2filenew ($sepa_file,@job_sepa_arr);
		print "sepa_file: ".$sepa_file."\n" if ($debug);
		&mysystem("chmod u+x $sepa_file");
		&mysystem("./$sepa_file");
		@job_sepa_arr = ();
	    }
	    $jc++;
	    
	}

	if (@job_sepa_arr > 0) {
	    $jc--;
	    push @job_sepa_arr, "wait";
	    
	    my $sepa_file = "$sjaname.sepa.$jc";
	    &a2filenew ($sepa_file,@job_sepa_arr);
	    print "sepa_file: ".$sepa_file."\n" if ($debug);
	    &mysystem("chmod u+x $sepa_file");
	    &mysystem("./$sepa_file");
	}



    }
    else { 
	my $sys_loc = "$blueprint_script $sja_week_str --noerr --njob $nsja_loc --array $jobfile --wa $sjatime --mem $sjamem --j --na $sjaname.$outname $multi_txt";
	&mysystem ($sys_loc);
    }




    
    $command_line =~ s/--force1//;



    my $wt_file = "$sjadir/j.$sjaname.$outname.id";

#    my $wt_file = "$sjadir/j.$jobfile.id";
    chdir "$rootdir" or die "something strange";
    
    if ($serial) {
	my $sys_re = "$command_line";
	&mysystem ($sys_re);
	exit;
    }
    else {
	my $sys_re = "$blueprint_script --njob $job_bn_th -b \"$command_line\" --wa 1 --di -j --fwt $wt_file --na _cn_$outname";
	&mysystem ($sys_re);
    }


    print "------------------------------------------------------------\n";
    print "$nsja jobs successfully submitted\n";
    print "please see tail of $sjainfofile for regular updates\n";
    print "also check bjobs -w for running jobs\n";
    print "possibly different command on different computer cluster: e.g. qstat -u USER\n";
    print "you will be informed via email if errors or successes occur\n";
    print "------------------------------------------------------------\n";


    exit;


}









#####################################
# BEGIN
#####################################



use File::Copy;
use File::Path;
use Cwd;
use lib $ENV{rp_perlpackages};
use Compress::Zlib ;






$rootdir = &Cwd::cwd();
$info_txt = "command:\t\"$command_line\"\tdir:\t$rootdir";

$sjainfotxt = "$rootdir\t$command_line";

my $chr_subdir = "chr_subdir.$outname";
my @created = mkpath(   ## $created ?
			"$chr_subdir",
			{verbose => 0, mode => 0750},
    );

#####################################
# split pfile
#####################################


if (-e "$outname.clumped.xmhc.gz") {
    print "output files is already existing\n" if ($debug);
#    exit;
}

my %ccount ;


print "splitting daner files in to chromosomes...\n" if ($debug);


#if ($pfile =~ /.gz$/){
#    my $pfile_tmp = $pfile;
#    $pfile_tmp =~ s/.gz//;
#    print "$pfile_tmp\n";
#    &mysystem("gunzip -c $pfile > $pfile_tmp") unless (-e $pfile_tmp);
#    $pfile =~ s/.gz//;
#}


my $inz = gzopen("$pfile", "rb")  or die "Cannot open $pfile\n" ;

#@fd = (*STDIN, *STDOUT, *STDERR); 
my @filehandles;
push @filehandles, "nichts";
foreach my $chr (1..22) {
    my $out_loc = "$chr_subdir/$outname.chr$chr.txt";
    if (-e "$out_loc.n"){
	$ccount{$chr} = `cat $out_loc.n`;
	chomp($ccount{$chr});
	print "found $ccount{$chr} SNPs for chromosome $chr\n" if ($debug);
	next ;
    }
    local *FILE;
    open(FILE, "> $out_loc") || die "$out_loc not to open";
    push(@filehandles, *FILE);
}

#exit;

my $pvcol = 10;
my $chrcol = 0;
my $poscol = 2;
my $snpcol = 1;
my $a1col = 3;
my $a2col = 4;
my $dircol = 12;
my $orcol = 8;
my $fcol = 6;
my $icol = 7;
my $mhc_out ;
my $mhc_p = 1;



if (@filehandles > 1) {

    $inz->gzreadline(my $line);

#    die $! unless open PFI, "< $pfile";
#    my $line = <PFI>;
    my @cells = @{&split_line_ref(\$line)};
    my $pchr = $cells[$chrcol];
    my $pp = $cells[$pvcol];
    my $ps = $cells[$snpcol];
    my $a1 = $cells[$a1col];
    my $a2 = $cells[$a2col];
    my $or = $cells[$orcol];
    my $frq = $cells[$fcol];
    my $info = $cells[$icol];
    foreach my $chr (1..22) {
	my $file_loc = $filehandles[$chr];
#	print $file_loc "$ps\t$a1\t$or\t$pp\t$info\t$frq\n";
	print $file_loc "@cells\n";
    }

    while ($inz->gzreadline($line)){

#    while (my $line = <PFI>){
	my @cells = @{&split_line_ref(\$line)};
	my $pchr = $cells[$chrcol];
	my $ppos = $cells[$poscol];
	my $freq = $cells[$fcol];
	my $info = $cells[$icol];

	next if ($pchr > 22 || $pchr < 1);
	next if ($freq < $hq_freq || $freq > 1 - $hq_freq);
	next if ($info < $hq_info);
	    




	my $pp = $cells[$pvcol];
	my $ps = $cells[$snpcol];
	my $a1 = $cells[$a1col];
	my $a2 = $cells[$a2col];
	my $or = $cells[$orcol];
	my $file_loc = $filehandles[$pchr];


	if ($noindel) {
	    next if (length($a1) != 1);
	    next if (length($a2) != 1);
	    next unless ($a1 =~ /[ACGTacgt]/);
	    next unless ($a2 =~ /[ACGTacgt]/);
	}


	my $q_str = ".";
	if (@cells > 12) {
	    $q_str = $cells[$dircol];
	}
	my $numq = 0;
	$numq = $q_str =~ tr/?//;
	my $numd = length($q_str);
	my $qr = $numq / $numd;
	next if ($qr > $qrat_th);



	
	## only one per MHC
	if ($pchr == 6) {
	    if ($ppos > 25000000 && $ppos < 35000000) {
		if ($pp < $mhc_p) {
		    $mhc_p = $pp;

#		    $mhc_out = "$ps\t$a1\t$or\t$pp";
		    $mhc_out = "@cells\n";
#		    print "$mhc_out\t$ppos\n";
		}
		next;
	    }
	}



	
	

#	print $file_loc "$ps\t$a1\t$or\t$pp\t$info\t$freq\n";
	print $file_loc "@cells\n";

	$ccount{$pchr}++ ;

#	print $file_loc "$ps\t$pp\n";
#	print "$pchr\n";
    }
    my $file_loc_chr6 = $filehandles[6];
    print $file_loc_chr6 "$mhc_out\n";
    
    foreach my $chr (1..22) {
	my $file_loc = $filehandles[$chr];	
	close $file_loc;
	
	die $! unless open OUT, "> $chr_subdir/$outname.chr$chr.txt.n";
	print OUT "$ccount{$chr}\n";
	close OUT;
    }





}
$inz -> gzclose();
print "split successful\n\n" if ($debug);

#exit;


# my.daner2chr --chr 2 --daner daner_PGC_SCZ52_0513a.r1000.gz --out daner_PGC_SCZ52_0513a.r1000.chr2.gz



my $gene_list_0610_plink ;
if ($detformat == 2) {
    $gene_list_0610_plink = "$refdir/plink.refGene_0111_hg19.txt";
    unless (-e $gene_list_0610_plink){
	$gene_list_0610_plink = "$refdir/plink.refGene_0610.txt";
    }
    die "no $gene_list_0610_plink" unless (-e $gene_list_0610_plink);
}


#####################################
# clump all single chromosomes
#####################################



#my @refbfiles = ();
#opendir(DIR, "$refdir") || die "can't opendir .: $!";
#@refbfiles = readdir(DIR);
#closedir DIR;
#my @refbfiles = grep {/bim$/} @refbfiles;



my @job_arr = ();
my @clump_files;

 
#print "hier: $outname.pliclu.done\n";




#unless (-e "$outname.pliclu.done"){

   

    foreach my $chr (1..22) {
	
#	print "count.$chr: ".$ccount{$chr}."\n";
	

	next if ($ccount{$chr} <= 1);
	
	my $chr_loc = "chr$chr.";
	
	my $in_loc = "$chr_subdir/$outname.chr$chr.txt";


	
	my $bfile_sc = "";

	

	if (-e "$refdir/reference_templ") {
	    my $out_template = `grep bfile_template $refdir/reference_templ`;
	    $out_template =~ s/bfile_template[ ]+//;
	    chomp($out_template);
	    #		print "out: $out_template\n";
	    $bfile_sc = "$refdir/$out_template";
	    $bfile_sc =~ s/XXX/$chr/;
#	    $bfile_sc .= ".impute.plink.$popname.bed";
	    #		print "bf: $bfile_sc\n";
	    
	}
	else {
	    $bfile_sc = `ls $refdir/*chr$chr.*bed`;
	    if ($bfile_sc eq "") {
		my $chr_part = "$chr"."_";
		$bfile_sc = `ls $refdir/*chr$chr_part*bed`;
	    }
	    if ($bfile_sc eq "") {
		print "Error: no plink binary for this chromosome";
		exit;
	    }
	}

	chomp($bfile_sc);
	$bfile_sc =~ s/.bed$//;
	    
	    
	my $outname_loc = "$chr_subdir/$outname.chr$chr";
	    
	my $cmd;
	    
	$cmd = "$p2loc/plink --silent --memory 2000 --bfile $bfile_sc --clump $in_loc --clump-p1 $cp1 --clump-p2 $cp2 --clump-r2 $cr2 --clump-kb $cwindow --out $outname_loc";
	    
	if ($detformat == 2) {
	    $cmd = "$p2loc/plink --silent --memory 2000 --bfile $bfile_sc --clump-range $gene_list_0610_plink --clump-verbose --clump-range-border $gene_range --clump $in_loc --clump-p1 $cp1 --clump-p2 $cp2 --clump-r2 $cr2 --clump-kb $cwindow --out $outname_loc";
	}
	
	my $out_loc = "$outname_loc.clumped";
	my $log_loc = "$outname_loc.log";

	my $rerun = 1;
	if (-e $log_loc) {	    
	    my $log_loc_out = `tail -1 $log_loc`;
	    
	    if ($log_loc_out =~ "^End time") {
		$rerun = 0;
	    }
#	    else {
#		$rerun = 1;
#	    }
	}
	unless (-e $out_loc) {
	    $rerun = 1;
	}
	#    print "$cmd\n";
	    #    print "$chr\t$cr2\t$out_loc\n";
	push @job_arr, $cmd if ($rerun == 1);
	push @clump_files, $out_loc if ($rerun == 0);
#	print "$log_loc, $rerun\n"
    }
    #    print "$cmd\n";
    
    

#}
#exit;

if (@job_arr > 0 ) {
#    print "debug:\n";
#    print "$job_arr[0]\n";
#    exit;
    print "now clumping\n\n" if ($debug);    
    
    $sjadir = $rootdir;
    $sjaname = "clump";
    $sjatime = 2;
    $sjamem = 2000;
    @sjaarray = @job_arr;
    
    &send_jobarray;
    
    
    
    #    &mysystem ("cat joblist_pliclu | blueprint -b \"prefix\" --mem 1000 --wa 4 -i 8,7 -j --na pliclu_$out");
    #    &reinvo_b ("pliclu-$njobs","blueprint_joblist_file-pliclu_$out");
    
}
    
unless (-e "$outname.cfiles") {
    die $! unless open OUT, "> $outname.cfiles";
    foreach my $cf (@clump_files) {
	print OUT "$cf\n";
    }
    close OUT;
}
    
#    &mysystem ("touch $outname.pliclu.done");

print "clump successful\n\n" if ($debug);


#exit;


#####################################
# read out the original file 
#####################################




if (@clump_files == 0){
    print "read file: $outname.cfiles\n" if ($debug);
    die $! unless open LDI, "< $outname.cfiles";
    while (my $line = <LDI>){
	chomp($line);
	push @clump_files, $line;
    }
    close LDI;
}

if (@clump_files == 0){
    print "Error: no clumped files left\n";
    exit;
}

#print "clumfiles: @clump_files\n";
#exit;

#####################################
# sum up the clumps
#####################################
# chr_subdir/test_ace1.chr22.chr22_030_033_1000.clumped

unless (-e "$outname.sum.clumped"){

    die $! unless open OUT, "> $outname.sum.clumped.tmp";
    my $head_sw = 0;
    foreach my $cf (@clump_files) {
	
	
	die "$cf".$! unless open CF, "< $cf";
	my $line = <CF>;
	if ($head_sw == 0) {
	    print OUT $line;
	    $head_sw = 1;
	}
	while (my $line = <CF>){
	    #		chomp($line);
	    print OUT $line;
	}
	close CF;
	
    }
    close OUT;
    &mysystem ("mv $outname.sum.clumped.tmp $outname.sum.clumped") ;
    
}


#exit;



#####################################
# read out the original file 
#####################################

if ($detformat == 1) { 
    unless (-e "$outname.clumped"){
#	print "create $outname.$pfile.chr$chr.clumped ";
#	print "out of $outname.chr$chr.clumped and  $pfile.chr$chr \n";
	print "read $outname.sum.clumped\n" if ($debug);
	my %snp_coll = ();
	die $! unless open LDI, "< $outname.sum.clumped";
	while (my $line = <LDI>){
	    chomp($line);
	    my @cells = @{&split_line_ref(\$line)};
	    next if ($cells[0] eq "CHR");
	    my $snp_loc = $cells[2];
	    $snp_coll{$snp_loc} = 1;
	}
	close LDI;
	
	
	print "change $pfile\n" if ($debug);
	
	my $inz = gzopen("$pfile", "rb")  or die "Cannot open $pfile\n" ;
	
#	die $! unless open LDI, "< $pfile";
#	die $! unless open LDI, "< $pfile.chr$chr";
	die $! unless open LDO, "> $outname.clumped";

	$inz->gzreadline(my $header);
#	my $header = <LDI>;
	print LDO $header;
	while ($inz->gzreadline(my $line)){
#	while (my $line = <LDI>){
	    chomp($line);
	    my @cells = @{&split_line_ref(\$line)};
	    my $snp_loc = $cells[1];
	    if (exists $snp_coll{$snp_loc}){
		print LDO "$line\n";
	    }
	}
	$inz -> gzclose();
	close LDO;
    }
}

elsif ($detformat == 2) { 
    unless (-e "$outname.clumped"){
	print "read $outname.sum.clumped\n" if ($debug);
	my %ind_coll = ();
	my %ind_genes = ();
	$ind_coll{"SNP"} = "LD_friends";
	$ind_genes{"SNP"} = "genes_in_friends_$gene_range"."kb";
	my $index = "";
	my %loc_hash;
	die $! unless open LDI, "< $outname.sum.clumped";
	while (my $line = <LDI>){

	    if ($line =~ /-----/) {
		if ($index ne "") {
		    print "non empty index\n";
		    die ;
		}
	    }

#	    if ($index eq "rs2974312") {
#		print "rs2974312: $line\n";
#	    }

	    my @cells = @{&split_line_ref(\$line)};
	    next if ($cells[0] eq "CHR");
	    $cells[0] = "" if (@cells == 0);

#	    if ($cells[2] eq "rs7015630") {
#		print "found $cells[2]!!!! <$index>\n";
#	    }

	    if ($index eq "") {
#		next unless ($cells[0] eq "(INDEX)");
		next unless (@cells == 11);
		$index = $cells [2];
		%loc_hash = ();
		$ind_coll{$index} = "";
	    }
	    else {

		### get all LD friends in
		if (@cells == 6) {
		    $loc_hash{$cells[0]} = $cells[2];
#		    $ind_coll{$index} .= ",$cells[0]($cells[2])";
		}


		if ($cells[0] eq "GENES:"){

		    $ind_genes{$index} .= "$cells[1]";
		    my $stop = 0;
		    while ($stop == 0) {
			my $line_loc = <LDI>;
			if ($line_loc =~ /-----/) {
			    $stop = 1;
			}
			else {
			    my @cells = @{&split_line_ref(\$line_loc)};
			    $ind_genes{$index} .= ",$cells[0]" if (@cells > 0);
			}
		    }




		    ###############################################
		    ####  sort a hash by value!!!!
		    ################################################
		    my @sorted = sort { $loc_hash{$b} <=> $loc_hash{$a} } keys %loc_hash; 

		    foreach my $friend (@sorted) {

			$ind_coll{$index} .= ",$friend($loc_hash{$friend})";
#			print "$index\t$loc_hash{$index}\n";
		    }
		    $ind_coll{$index} =~ s/^,//;
		    if ($ind_coll{$index} eq "") {
			$ind_coll{$index} = "no_friends";
		    }
		    if ($ind_genes{$index} eq "") {
			$ind_genes{$index} = "no_genes";
		    }

		    $index = "";
		}
	    }

	}
	close LDI;


#	if (0) {	
	print "change $pfile\n" if ($debug);


	print "here is problem with pfile and pfile.chr\n" if ($debug); 
	exit;
	

	my $inz = gzopen("$pfile", "rb")  or die "Cannot open $pfile\n" ;
#	die $! unless open LDI, "< $pfile.chr$chr";
	die $! unless open LDO, "> $outname.clumped";
	while ($inz->gzreadline(my $line)){
#	while (my $line = <LDI>){
	    chomp($line);
	    my @cells = @{&split_line_ref(\$line)};
	    my $snp_loc = $cells[1];
	    if (exists $ind_coll{$snp_loc}){
		die "strange" unless (exists $ind_genes{$snp_loc});
		print LDO "$line\t$ind_coll{$snp_loc}\t$ind_genes{$snp_loc}\n";
	    }
	}
	close LDI;
	close LDO;


	die $! unless open RE, "> $outname.clumped.readme";
	print RE "p1\t".$cp1."\n";
	print RE "p2\t".$cp2."\n";
	print RE "r2\t".$cr2."\n";
	print RE "clu_window\t".$cwindow."\n";
	print RE "freq_filter\t".$hq_freq."\n";
	print RE "info_filter\t".$hq_info."\n";
	print RE "gene_range\t".$gene_range."\n";
	print RE "format\t".$detformat."\n";
	close RE;

#	}
    }
    else {
	print "$outname.clumped already existing\n";
    }

}
else {
    print "print detformat not supported\n";
    exit;
}




#####################################
# clean MHC
#####################################

unless (-e "$outname.clumped.xmhc"){
    print "clean for MHC\n" if ($debug);
    die $! unless open LDI, "< $outname.clumped";
    die $! unless open LDO, "> $outname.clumped.xmhc";
    my $best_mhc = 1;
    my $mhc_row ;
    
    while (my $line = <LDI>){
	chomp($line);
	my @cells = @{&split_line_ref(\$line)};
	if ($cells[0] == 6 && $cells[2] > 25000000 && $cells[2] < 35000000) {
	    if ($cells[10] < $best_mhc){
		$best_mhc = $cells[10];
		$mhc_row = $line;
	    }
	}
	else {
	    print LDO "$line\n";
	}
    }
    print LDO "$mhc_row\n";
    
    close LDI;
    close LDO;

}


#####################################
# prekno
#####################################
die $! unless open PRES, "> $outname.clumped.xmhc.start.prekno";
print PRES "prekno_clump --clump $outname.clumped.xmhc --refdir $refdir  --prekno PREKNOFILE\n";
print PRES "txt2xls --txt $outname.clumped.xmhc.prekno --xls $outname.xmhc.prekno.xls --pcol 10,16,20,22 --cogr 5,12,17,19,23,28,33\n";
close PRES;



if ($prekno_file){
    print "prekno_clump\n" if ($debug);
    my $cmd = "prekno_clump --clump $outname.clumped.xmhc  --prekno $prekno_file --refdir $refdir";
    unless (-e "$outname.clumped.xmhc.prekno") {
	print "$cmd\n" if ($debug);
	&mysystem ($cmd);
    }
    unless (-e "$outname.xmhc.prekno.xls") {
	my $cmd = "txt2xls --txt $outname.clumped.xmhc.prekno --xls $outname.mhc.prekno.xls --pcol 10,16 --cogr 5,8,12,17,18,19";
	&mysystem ($cmd);
    }

}


#####################################
# create excel
#####################################


unless (-e "$outname.clumped.xmhc.p2.xls"){


    die $! unless open LDI, "< $outname.clumped.xmhc";
    die $! unless open LDO, "> $outname.clumped.xmhc.p2";

    
    while (my $line = <LDI>){
	chomp($line);
	my @cells = @{&split_line_ref(\$line)};
	if ($cells[10] < 0.01) {
	    print LDO "$line\n";
	}
    }
    
    close LDI;
    close LDO;


       
    my $cmd = "txt2xls --txt $outname.clumped.xmhc.p2 --xls $outname.clumped.xmhc.p2.xls --pcol 10,16 --cogr 5,8,12,17,18";
    &mysystem ($cmd);

}


&mysystem ("gzip -c $outname.clumped.xmhc > $outname.clumped.xmhc.gz");

if ($reinvo_file) {
    die "$reinvo_file: ".$! unless open FILE, "< $reinvo_file";
    my $redir = <FILE>;
    chomp($redir);
    my $recom = <FILE>;
    chomp($recom);
    close FILE;
    
    chdir ($redir);
    &mysystem ($recom);
    
}




unless ($noclean) {
    print "remove tmp filies, if not wished use --noclean\n" if ($debug);
    &mysystem ("rm -fr $chr_subdir");
    &mysystem ("rm -f $outname.sum.clumped");
    &mysystem ("rm -f $outname.clumped.xmhc.start.prekno");

}

&mysystem ("rm -f $outname.clumped.xmhc.p2");
&mysystem ("rm -f $outname.clumped.xmhc");
&mysystem ("rm -f $outname.clumped");
&mysystem ("rm -f $outname.cfiles");



#############################################################
## SUCCESSSS
#############################################################

$sjadir = $rootdir;
$sjaname = "finished";
push @sjaarray, "tmp";
$sjatime = 2;
$sjamem = 1000;

    
&send_jobarray;







