#!/usr/bin/perl
use strict;
my $version = "1.0.0";
my $progname = $0;
$progname =~ s!^.*/!!;

my $command_line = "$progname @ARGV";


## - http://hgdownload.soe.ucsc.edu/goldenPath/hg19/database/refGene.txt.gz

#############################
## log command

my $past_file = $ENV{HOME}."/areator_info";
my $pwd_loc = $ENV{PWD};

die $! unless open FILE, ">> $past_file";
print FILE "$pwd_loc\t$command_line\n";
close FILE;

#############################
# read config file
#############################

my $conf_file = $ENV{HOME}."/ricopili.conf";
my %conf = ();

die $!."($conf_file)" unless open FILE, "< $conf_file";
while (my $line = <FILE>){
    my @cells = split /\s+/, $line;
    $conf{$cells[0]} = $cells[1];
}
close FILE;

sub trans {
    my ($expr)=@_;
    unless (exists $conf{$expr}) {
	die "config file without entry: $expr\n";
    }
    $conf{$expr};
}

my $ploc = &trans("p2loc");
#my $hvloc = &trans("hvloc");
my $hapmap_ref_root = &trans("hmloc");
my $sloc = &trans("sloc");


####################################################


my $pth =4;
my $r2th =.2;
my $window_size =3000;


my $snpcol = 1;
my $chrcol = 2;
my $poscol = 3;
my $qualicol = 4;
my $pcol = 5;
my $escol = 6;
my $secol = 7;
my $a1col = 10;
my $a2col = 10;
my $frecol = 11;
my $fre1col = 11;
my $fre2col = 12;
my $faicol = 8;
my $ngtcol = 9;
my $hmicol = 14;
my $detcol_beg = 14;
my $out_name = "areator_out";




if (1){
    $snpcol = 2;
    $chrcol = 1;
    $poscol = 3;
    $qualicol = 8;
    $pcol = 11;
    $escol = 9;
    $secol = 10;
    $a1col = 4;
    $a2col = 5;
    $frecol = 7;
    $fre1col = 6;
    $fre2col = 7;
    $faicol = 13;
    $ngtcol = 12;
    $hmicol = 12;
    $detcol_beg = 18;
}



my @th_arr = qw /.1 .01 .001 .0001 .00001 .000001 .0000001 .00000005/;


#my $hvloc = "/fg/debakkerscratch/ripke/plink";
#my $hvloc_lisa = "/home/gwas/haploview/Haploview.jar";

##### help message
my $usage = "
Usage : $progname [options] --best dan_out.gz

version: $version

  --help           print this message and exit
  --out STRING     name of outfile
  --cond STRING    name of file describing conditional analysis (directory and comand)
  --1KG            1KG positions
  --pth INT        threshold, default $pth
  --p2th INT        second threshold, only if specifically wished, no log transformation
  --nohead         no header

  --phase2        take HM Phase 2

  --ploc STRING    location (absolute path) of plink, if other than in the path
##  --hvloc STRING   location (absolute path) of Haploview.jar, if other than in the path

  --cols STRING  combined column-string, separated by commas, overwrites other options
                     SNPCOL,PCOL,CHRCOL,POSCOL
  --check        check columns with sample output

  --refdir       reference dir, only one bed/bim/fam per chromosome in there, please 

  --r2th FLOAT   r2-threshold, default $r2th
  --plink-clump  use plink-cluming

  --nogene       no gene annotation
  --window INT   window-size, default = $window_size

  --chr INT      only one chromosome

  --mhc1         only one per MHC

  --Xchr         take chrX

  --prekno STRING pre-known SNPs in first column

  --noxls        helpful if output files to long



 best-file: gzipped best-file
 

 cond-file - example (2nd row without out)
  /home/gwas/pgc-samples/scz/QC1B/imputation
  postimp_report_3 --idin SCZ17b.mds --dos dosages --pca SCZ17b.mds --coco 1,2,3,4,6


  legend:

SNP=name of variant; CHR=chromosome; BP=basepair position (hg19); P=p-value, OR= odds ratio for allele1;
SE=standard error, A1A2=allele1 and allele2; FRQ_A=frequency of allele1 in cases, FRQ_U=frequency of allele1 in controls;
INFO=imputation info score; ngt=number of studies this variant is genotyped (vs. imputed); 
friends(.1).p0.001 = list of all variants with LD-r2 > 0.1 to index SNP, in brackets LD-r2 and distance in kb, sorted by LD-r2;
range.left=left margin of region (defined by LD friends); range.right=right margin of region (defined by LD friends)
span(kb)=right margin - left margin (in kb)
friends(.6).p0.001, range.left.6, range.right.6, span.6(kb)= as before but with LD-r2 of 0.6
gwas_catalog_span.6= list of entries in gwas catalogue amongst the entries in column friends(.6), version April 2013,
genes.6.50kb(dist2index)= list of genes within the region of friends.6 (50kb buffer added on both sides), in brackets distance to index SNP in kb. RefGene, version, April 2013



## gwas catalogue, right now: /home/radon01/sripke/bakker_ripke/hapmap_ref//gwascatalog/0413/gwascatalog.txt.ow.short.new.sorted
## gene list: /home/radon01/sripke/bakker_ripke/hapmap_ref/impute2_ref/1KG_Aug12/ALL_1000G_phase1integrated_v3_impute_macGT1/eur/refGene_0413.txt.out


 created by Stephan Ripke 2009 at MGH, Boston, MA
 
";

my $bfile = "";

use Getopt::Long;
GetOptions( 
    "help"=> \my $help,
    "nohead"=> \my $nohead,
    "1KG"=> \my $my1KG,
    "best=s"=> \my $bestfile,
    "mhc1"=> \my $mhc1,
    "out=s"=> \$out_name,
    "pth=i"=> \$pth,
    "p2th=f"=> \my $p2th_val,
    "window=i"=> \$window_size,
    "cond=s"=> \my $cond_name,
    "phase2"=> \my $phase2,
    "ploc=s"=> \$ploc,
##    "hvloc=s"=> \$hvloc,
    "cols=s"=> \my $colstr,
    "check"=> \my $check,
    "Xchr"=> \my $Xchr,
    "refdir=s"=> \my $refdir,
    "plink-cump"=> \my $pliclu,
    "chr=i"=> \my $onechr,
    "nogene"=> \my $nogene,
    "r2th=f"=> \$r2th,
    "prekno=s"=> \my $prekno_file,
    "noxls"=> \my $noxls,
    );


($snpcol,$pcol,$chrcol,$poscol)= split ',', $colstr if ($colstr);


die "$usage" if ($help);
$out_name = $bestfile if ($out_name eq "areator_out");
die "$usage" unless ($bestfile);
my $pth = 10**(-$pth);
my $pth2 = 10*$pth;
$pth2 = 0.01 if ($pth2 > .01);
if ($pth > .01) {
    print "use clumper for high p-thresholds\n";
}

if ($p2th_val){
    $pth2 = $p2th_val;

}
#print "pth2: $pth2\n";
#sleep(10);

###print "first111111\n";

use Compress::Zlib ;
#use lib '/home/gwas/bin/Spreadsheet-WriteExcel-2.25/blib/lib';
use lib $ENV{rp_perlpackages}.'/Spreadsheet-WriteExcel-2.40/blib/lib';
#use lib '/psych/genetics_data/ripke/references_from_debakkerscratch/perl_modules/Spreadsheet-WriteExcel-2.37/blib/lib';
 use Spreadsheet::WriteExcel;                             # Step 0

#print "secind\n";


##########################################
# subroutine to split a plink-output-line
##########################################

sub split_line {
    my ($line)=@_;
    chomp($line);
    $line =~ s/^[\s]+//g;
    my @cols=  split /\s+/, $line;
}

##########################################
# subroutine to split a plink-output-line with references
##########################################

sub split_line_ref {
    my ($line)=${$_[0]};
    chomp($line);
    $line =~ s/^[\s]+//g;
    my @cols=  split /\s+/, $line;
    \@cols;
}


#####################################
# subroutine to count lines of a file
#####################################

sub count_lines {
    my ($file)=@_;
    my $lc=0;
    die "$file: ".$! unless open FILE, "< $file";
    while (<FILE>){
	$lc++;
    }
    close FILE;
    $lc;
}


#####################################
# print array to file
####################################

sub a2file {
    my ($file, @lines)=@_;
    die $! unless open FILE, "> $file";
    foreach (@lines){
	print FILE $_;
    }
    close FILE;
}


#####################################
# print array to file
####################################

sub a2file_new {
    my ($file, @lines)=@_;
    die $! unless open FILE, "> $file";
    foreach (@lines){
	print FILE $_."\n";
    }
    close FILE;
}

###################################################
###  system call with test if successfull
###################################################
sub mysystem(){
    my ($systemstr)="@_";
    system($systemstr);
    my $status = ($? >> 8);
    die "$systemstr\n->system call failed: $status" if ($status != 0);
}


###################################################
###  system call with test if successfull
###################################################
sub mysystem_nodie(){
    my ($systemstr)="@_";
    system($systemstr);
    my $status = ($? >> 8);
    print "$systemstr\n->system call failed: $status" if ($status != 0);
}

########################################
# test plink 
###################################


unless (-e "$ploc/plink" ){
    print "\n***** Error: couldn't find the following:\n";
    print "$ploc/plink\n";
    exit;
}



########################################
# test haploview
###################################

#unless (-e "$hvloc/Haploview.jar" ){
#    print "\n***** Error: couldn't find the following:\n";
#    print "$hvloc/Haploview.jar\n";
#    exit;
#}


########################################
# read prekno-file
###################################

my @prekno_arr;
my %prekno_hash;
my %prekno_txt;
if ($prekno_file) {
    print "start reading prekno-file\n";
    die ("$prekno_file not existing") unless open PF, "< $prekno_file";
    while (my $line =<PF>){
	my @cells = @{&split_line_ref(\$line)};
	push @prekno_arr,$cells[0];
	if ($cells[5]) {
	    unless ($prekno_txt{$cells[0]} =~ /$cells[5]/) {
		next if ($cells[4] > 5.0e-08);
		unless (exists $prekno_txt{$cells[0]}) {
		    $prekno_txt{$cells[0]} .= "$cells[0];$cells[5]($cells[4])";
		    $prekno_hash{$cells[0]} = 1;
		}
		else {
		    $prekno_txt{$cells[0]} .= ";$cells[5]($cells[4])";
		}
	    }
#	    print "$cells[0]\n";
#	    print "$line\n";
#	    sleep(1);
	}
    }
    close PF;
    print "finished reading prekno-file\n";
}



###################################
## BEGIN
####################################3

#print "start\n";

######################################
##  if cols are not sure, check them
######################################
my $count_p=0;
if ($check){
    print "$usage\n";
    print "SNPCOL\tPCOL\tCHRCOL\tPOSCOL\n";

    my $begz = gzopen("$bestfile", "rb")  or die "Cannot open $bestfile: $gzerrno\n" ;
    my $line;
    $begz->gzreadline(my $line);
    my $phead = $line;
    while ($count_p++ < 5){
	my @cells = @{&split_line_ref(\$line)};
	printf "%s\t%s\t%s\t%s\n",$cells[$snpcol-1],$cells[$pcol-1],$cells[$chrcol-1],$cells[$poscol-1];
	$begz->gzreadline($line);
    }
    $begz -> gzclose();
    print "\n";
    print "\nHeader of original File:\n";

    my @cells = @{&split_line_ref(\$phead)};
    $count_p=0;
    foreach (@cells){
	$count_p++;
	printf "$count_p\t$_\n";
    }
    exit 1;
}









#####################################
### prepare workdir
#####################################

#my $workdir = "$sloc/areator.$out_name.$bestfile";
my $workdir = "./tmp.areator.$out_name.$bestfile";

if (0) {
    while (-e $workdir) {
	$workdir .= ".a";
    }
}

print "$workdir\n";

use File::Path;
use Cwd;
my @created = mkpath(   ## $created ?
			"$workdir",
			{verbose => 0, mode => 0750},
    );

my $rootdir = &Cwd::cwd();
chdir ($workdir);

if ($phase2) {
    $bfile = "$hapmap_ref_root/plink_p2/single_chr/hapmap_CEU_r23a_filtered" if ($bfile eq "");
    $bfile = "$hapmap_ref_root/plink_p2b/single_chr/hapmap_CEU_r23a" if ($bfile eq "");
}
else {
    $bfile = "$hapmap_ref_root/plink_p3/hapmap3_r2_b36_fwd.CEU.TSI.FOUNDERS.qc.poly" if ($bfile eq "");
}






######################################
##  other reGene file if found in directory
######################################

my $gene_list_0610_plink = "$hapmap_ref_root/debakker/0610/refGene_0610.txt";

my @refgene_files = `ls $refdir/refGene*`;
if (@refgene_files > 0) {
    $gene_list_0610_plink = $refgene_files[0];
    chomp ($gene_list_0610_plink);
}

die "no gene file: $gene_list_0610_plink" unless (-e $gene_list_0610_plink);

#print "found $gene_list_0610_plink\n";
#sleep(4);



#####################################
### read out of best file.
#####################################


my $bestbasic = "$out_name.top";

my $ns = 0;
my %det_str;

my $bestfile_path = "$rootdir/$bestfile";

#if (-e $bestbasic) {
if (0) {
    print "take local bestbasic file ($bestbasic)\n";
    $bestfile_path = "bestbasic.gz";
    &mysystem("gzip -c $bestbasic > $bestfile_path");
}

print "create best_basic out of: $bestfile_path\n";

my %chr_hash ;
my %bb_hash ;
my $snpfile = "$bestfile.snps";
my $bhead;
my $bb_head;

while ($ns == 0) {
    my $begz = gzopen("$bestfile_path", "rb")  or die "Cannot open $bestfile_path: $gzerrno\n" ;

    
    die $! unless open SF, "> $snpfile";
    die $! unless open BB, "> $bestbasic";
    
    
    $begz->gzreadline($bhead);
    chomp($bhead);
    
    my @cells = @{&split_line_ref(\$bhead)};
    if (@cells > $detcol_beg) {
	foreach my $cc ($detcol_beg-1 .. $#cells) {
	    $det_str{"SNP"} .= "\t".$cells[$cc];
	}
    }
    
    print SF $cells[$snpcol-1]."\t".$cells[$pcol-1]."\n";
    print BB "@cells\n";
    


    
    $bb_head .= "SNP";
    $bb_head .= " CHR";
    $bb_head .= " BP";
    $bb_head .= " P";

    $bb_head .= " ".$cells[8];

    $bb_head .= " SE";
    $bb_head .= " A1A2";
    $bb_head .= " ".$cells[$fre1col-1];
    $bb_head .= " ".$cells[$fre2col-1];
    $bb_head .= " INFO";
    $bb_head .= " Dir";
    $bb_head .= " ngt";
    $bb_head .= " LD-friends($r2th).p$pth2";
    $bb_head .= " range.left";
    $bb_head .= " range.right";
    $bb_head .= " span(kb)";
    $bb_head .= " LD-friends(0.6).p$pth2";
    $bb_head .= " range.left.6";
    $bb_head .= " range.right.6";
    $bb_head .= " span.6(kb)";

#    $bb_head .= " range.left.4";
#    $bb_head .= " range.right.4";
#    $bb_head .= " span.4(kb)";
	
    $bb_head .= " gwas_catalog_span.6";
    $bb_head .= " genes.6.50kb(dist2index)";
  
    
    my $ns_loc = 0;
    my $mhc = 0;
    
    while ($begz->gzreadline(my $line)){
	my @cells = @{&split_line_ref(\$line)};
	
	
#    print "$ns_loc rows read\n" if ($ns_loc++ % 1000 == 0);
#    print "atcu rows r\n";
	
#	next if ($cells[$pcol-1] > .05 || $cells[$pcol-1] eq "NA" || $cells[$pcol-1] == 0);
	next if ($cells[$pcol-1] > $pth2 || $cells[$pcol-1] eq "NA" || $cells[$pcol-1] == 0);
	
	if ($onechr) {
	    next if ($cells[$chrcol-1] != $onechr);
	}
	
#########!!!!!!!!!!!!!!!###########
#########!!!!!!!!!!!!!!!###########
#########!!!!!!!!!!!!!!!###########
#	unless ($Xchr) {
#	    next if ($cells[$chrcol-1] > 22);
#	}
#	else {
#	    next if ($cells[$chrcol-1] != 23);
#	}
#########!!!!!!!!!!!!!!!###########
#########!!!!!!!!!!!!!!!###########
#########!!!!!!!!!!!!!!!###########
	
	$chr_hash{$cells[$chrcol-1]} = 1;
	
#	print $cells[$pcol-1]."\n";
#	print $cells[$snpcol-1]."\n";
	$ns++;
	print SF $cells[$snpcol-1]."\t".$cells[$pcol-1]."\n";
	print BB "@cells\n";


	$bb_hash{$cells[$snpcol-1]} = $cells[$snpcol-1];
	$bb_hash{$cells[$snpcol-1]} .= " ".$cells[$chrcol-1];
	$bb_hash{$cells[$snpcol-1]} .= " ".$cells[$poscol-1];
	$bb_hash{$cells[$snpcol-1]} .= " ".sprintf "%.3e",$cells[$pcol-1];
	$bb_hash{$cells[$snpcol-1]} .= " ".$cells[$escol-1];
	$bb_hash{$cells[$snpcol-1]} .= " ".$cells[$secol-1];
	$bb_hash{$cells[$snpcol-1]} .= " ".$cells[$a1col-1]."/".$cells[$a2col-1];
	$bb_hash{$cells[$snpcol-1]} .= " ".$cells[$fre1col-1];
	$bb_hash{$cells[$snpcol-1]} .= " ".$cells[$fre2col-1];
	$bb_hash{$cells[$snpcol-1]} .= " ".$cells[$qualicol-1];
	if ($cells[$faicol-1] ne "") {
	    $bb_hash{$cells[$snpcol-1]} .= " ".$cells[$faicol-1];
	}
	else {
	    $bb_hash{$cells[$snpcol-1]} .= " ".$cells[$qualicol-1];
	}
	$bb_hash{$cells[$snpcol-1]} .= " ".$cells[$ngtcol-1];
#	print 	$bb_hash{$cells[$snpcol-1]} ;
#	print 	"\n" ;
	




    }
    
    
    $begz -> gzclose();
    close SF;
    close BB;
    if ($ns == 0) {
	$pth = $pth * 10;
	$pth2 = $pth2 * 10;
	$pth2 = 0.01 if ($pth2 > .01);
	print "new p-th: $pth\n";
	die if ($pth > 1);
    }
}
#print "fuck it\n";
#exit;;



if ($ns ==0) {
    die $! unless open FILE, "> $rootdir/$out_name.2.areator.txt";
    print FILE "no SNPs meet threshold\n";
    close FILE;
    die "no SNPs left\n" ;
}
#print "$ns SNPs meet threshold\n";
#exit;


#########
## check which chromosomes
#########




my @chro ;
foreach my $chr_loc (sort keys %chr_hash){
    print "this chromosome is in: $chr_loc\n";
    push @chro, $chr_loc;
}
#@chro = `cut -f$chrcol -d " " $bestbasic | sort | uniq | grep -v CHR`;#
#print "@chro";
#exit;


#######################################
#### which SNPs are found in reference

my %bestsnps = ();

#print "@chro\t$bestbasic\n";
#exit;


#######################################
#### genes in clumps

my %gene_hash = ();
my %range_hash = ();

print  "reference gene_list: $gene_list_0610_plink\n";
unless (-e "refGene.loc") {
    print "create local gene list\n";
    die $! unless open GI, "< $gene_list_0610_plink";
    die $! unless open GO, "> refGene.loc.tmp";
    while (my $line = <GI>){
	my @cells = @{&split_line_ref(\$line)};
	$cells[0] =~ s/chr//;

	print GO "$cells[0]";
	print GO " $cells[2]";
	print GO " $cells[3]";
	print GO " $cells[7]\n";

    }
    close GI;
    close GO;
    &mysystem("mv refGene.loc.tmp refGene.loc");
}



my @gene_name_arr = ();
my @gene_star_arr = ();
my @gene_stop_arr = ();
&mysystem("sort -k1,1n -k2,2n refGene.loc > refGene.loc.sorted.tmp");
&mysystem("mv refGene.loc.sorted.tmp  refGene.loc.sorted");


print "create gene arrays\n";
die $! unless open GI, "< refGene.loc.sorted";
while (my $line = <GI>){
    my @cells = @{&split_line_ref(\$line)};

    my $chr_loc = $cells[0] * 1;
    push @{ $gene_name_arr[$chr_loc] }, $cells[3];
    push @{ $gene_star_arr[$chr_loc] }, $cells[1];
    push @{ $gene_stop_arr[$chr_loc] }, $cells[2];
}
close GI;
 

#my @name_arr_loc = @{ $gene_name_arr[1] };
#print "@name_arr_loc\n";
#exit;


#######################################
#### create local gene-list

#####################################
### plink-lump
#####################################

if ($pliclu) {

    print "plink-clump\n";

    die $! unless open LDA, "> $out_name.clump.areator";
    print LDA "$bb_head\n";


    foreach my $chr (@chro){
	$chr = $chr *1;
	
	
#########!!!!!!!!!!!!!!!###########
#########!!!!!!!!!!!!!!!###########
#########!!!!!!!!!!!!!!!###########
#	unless ($Xchr) {
#	    next if ($chr > 22);
#	}
#	else {
#	    next if ($chr != 23);
#	}

#########!!!!!!!!!!!!!!!###########
#########!!!!!!!!!!!!!!!###########
#########!!!!!!!!!!!!!!!###########
	
	unless (-e "chr$chr.hv.ped.LD") {
	    die unless ($refdir);
	    my $bfile_sc = `ls $refdir/*chr$chr.*bed`;


	    if ($bfile_sc eq "") {
		my $chr_part = "$chr"."_";
		$bfile_sc = `ls $refdir/my*chr$chr_part*bed`;
	    }



	    if ($bfile_sc eq "") {
		my $chr_part = "$chr"."_";
		$bfile_sc = `ls $refdir/*chr$chr_part*bed`;
	    }
	    if ($bfile_sc eq "") {
		my $chr_part = "window.$chr.";
		$bfile_sc = `ls $refdir/*$chr_part*bed`;
	    }
#	    print "$bfile_sc\n";
#	    sleep(10);
	    if ($bfile_sc eq "") {
		die "no plink binary for this chromosome";
	    }
	    chomp($bfile_sc);
	    $bfile_sc =~ s/.bed$//;

	    my @bfile_sc_arr = split "\n", $bfile_sc;

	    if (@bfile_sc_arr != 1) {
		print "\n\nError: number of reference-beds:".@bfile_sc_arr."\n";
		exit;
	    }


	    &mysystem_nodie ("$ploc/plink --memory 2000 --bfile $bfile_sc --make-bed --extract $snpfile --out $out_name.window.$chr") unless (-e "$out_name.window.$chr.bim");
	    next unless (-e "$out_name.window.$chr.bim");

	    unless (-e "refGene.loc.chr$chr") {
		die $! unless open RI, "< refGene.loc";
		die $! unless open RO, "> refGene.loc.chr$chr";
		while (my $line = <RI>){
		    my @cells = @{&split_line_ref(\$line)};

		    next unless ($cells[0] == $chr);
		    print RO "@cells\n";
		}
		close RI;
		close RO;
	    }


#	    my $sys = "$ploc/plink --bfile $out_name.window.$chr --clump $snpfile --clump-verbose --clump-p1 $pth --clump-p2 $pth2 --clump-r2 $r2th --clump-kb $window_size --clump-range refGene.loc --clump-range-border 100 --out $out_name.clump.chr$chr" ;
	    my $sys = "$ploc/plink --memory 2000 --bfile $out_name.window.$chr --clump $snpfile --clump-verbose --clump-p1 $pth --clump-p2 $pth2 --clump-r2 $r2th --clump-kb $window_size  --out $out_name.clump.chr$chr  --clump-range refGene.loc.chr$chr --clump-range-border 100" ;

	    ## plink2 does not like --clump-range (if the file is  too long)


	    print "$sys\n";
#	    exit;

	    if (-e "$out_name.clump.chr$chr.clumped") {


#		my @log_str = `tail -2 $out_name.clump.chr$chr.log`;
#		my @log_str = `tail -3 $out_name.clump.chr$chr.log`; ## plink2
		my @log_str = `tail -4 $out_name.clump.chr$chr.log`; ## plink2
#		unless ($log_str[0] =~ /Analysis finished/) {
#		print "$log_str[0]\n";
		unless ($log_str[0] =~ /Results written/) {  ## plink2
		    unless ($log_str[1] =~ /Results written/) {  ## for longer lines
			print "Error: clumping not successfulL: see $out_name.clump.chr$chr.log\n";
			exit;		
		    }
		}

	    }
	    else {
		&mysystem_nodie ($sys) 
	    }
#	    exit;

	    unless (-e "$out_name.clump.chr$chr.clumped") {

		if (-e "$out_name.clump.chr$chr.log") {
		    my @log_str = `tail -3 $out_name.clump.chr$chr.log`; ## plink2
		    if ($log_str[0] =~ /Warning: No significant/) {  ## plink2
			print "chromosome $chr without significant index SNPs\n";
		    }
		    else {
			print "Error: something went wrong with clumping chromosome $chr: $sys\n";
			print "test: $log_str[0]\n";
			exit;
		    }
		}

	    }
	    else {
		&mysystem ("cp $out_name.clump.chr$chr.clumped $rootdir");
		
#	    &mysystem_nodie ("$ploc/plink --bfile $bfile_sc --clump $snpfile --clump-verbose --clump-p1 $pth --clump-p2 $pth --clump-r2 $r2th --clump-kb $window_size --out $out_name");
		
		print "first chr $chr\n";

#	    exit;

		die $! unless open LDI, "< $out_name.clump.chr$chr.clumped";
		die $! unless open LDO, "> $out_name.clump.chr$chr.r2";
		
		
		my $index = "";
		my $index_pos = 0;
		my @ld_friends = ();
		my %ld_hash = ();
		my %dist_hash = ();
		
		my %ld_hash_6 = ();
		my %dist_hash_6 = ();
		
		
		my @prekno_hash = ();
		
		
#	    my $prekno_out = "";
		my $genes = "-";
		my $range = "";
		my $span = "";
		my $span_left = 0;
		my $span_right = 0;
		
		my $genes_6 = "-";
		my $range_6 = "";
		my $span_6 = "";
		my $span_left_6 = 0;
		my $span_right_6 = 0;
		
#	    my $span_4 = "";
		my $span_left_4 = 0;
		my $span_right_4 = 0;
		
		while (my $line = <LDI>){
		    my @cells = @{&split_line_ref(\$line)};
		    if (@cells == 11 && $cells[0] ne "CHR"){   ### this is not checked yet !!!!!!!!!!!!
			$index = $cells[2];
			$index_pos = $cells[3];
			@ld_friends= ();
			%ld_hash= ();
			%dist_hash= ();
			%ld_hash_6= ();
			%dist_hash_6= ();
			$range = "";
			$span = "";
			$span_left = 0;
			$span_right = 0;
			$range_6 = "";
			$span_6 = "";
			$span_left_6 = 0;
			$span_right_6 = 0;

			$span_left_4 = 0;
			$span_right_4 = 0;
#		    $prekno_out = "";
		    }

		    if ($cells[0] eq "RANGE:") {
#		    $range_hash{$index} = $cells[1];
			$range = $cells[1];
			$range =~ s/chr[0-9]*://;
			$range =~ s/\.\./ /;
#		    print "$range\n";
		    }
		    if ($cells[0] eq "SPAN:") {
			$span = $cells[1];
			$span =~ s/kb//;
		    }

		    if ($cells[0] eq "GENES:") {
#		    $gene_hash{$index} = $cells[1];
			$genes = $cells[1];
			if ($genes eq "") {
			    $genes = "-";
			}

			my $stop = 0;
			while ($stop == 0) {
			    my $line_loc = <LDI>;
			    if ($line_loc =~ /-----/) {
				$stop = 1;
			    }
			    else {
				my @cells = @{&split_line_ref(\$line_loc)};
				$genes .= ",$cells[0]" if (@cells > 0);
			    }
			}

#		    print $bb_hash{$index}."\n";
#		    sleep(1);
			#######################
			## here print it out, 
			print LDA $bb_hash{$index};

			my $ld_str = "";
			my $prekno_out_2 = "";
			foreach my $lsnp (reverse sort {$ld_hash{$a} cmp $ld_hash{$b} }
					  keys %ld_hash)
			{
			    $ld_str .="$lsnp($ld_hash{$lsnp}/$dist_hash{$lsnp}),";
			    if (exists $prekno_txt{$lsnp}) {

				if (0) {
				    print "$prekno_txt{$lsnp}\n";
				    print "dist: $dist_hash{$lsnp}\n";
				    print "left: $span_left_6\n";
				    print "right: $span_right_6\n";
				    sleep(1);
				}
				if ($dist_hash{$lsnp} >= $span_left_6) {
				    if ($dist_hash{$lsnp} <= $span_right_6) {
					$prekno_out_2 .= "/($ld_hash{$lsnp})".$prekno_txt{$lsnp};
				    }
				}
			    }
			}
			$ld_str =~ s/,$//;
			if ($ld_str eq ""){
			    $ld_str = "-";
			}

			$prekno_out_2 =~ s/,$//;
			if ($prekno_out_2 eq ""){
			    $prekno_out_2 = "-";
			}



			if ($range eq ""){
			    $range = "- -";
			}
			if ($span eq ""){
			    $span = "-";
			}




			my $ld_str_6 = "";
			foreach my $lsnp_6 (reverse sort {$ld_hash_6{$a} cmp $ld_hash_6{$b} }
					    keys %ld_hash_6)
			{
			    $ld_str_6 .="$lsnp_6($ld_hash_6{$lsnp_6}/$dist_hash_6{$lsnp_6}),";
			}
			$ld_str_6 =~ s/,$//;
			if ($ld_str_6 eq ""){
			    $ld_str_6 = "-";
			}

			my $span_calc = $span_right - $span_left;
			my $span_calc_6 = $span_right_6 - $span_left_6;
			my $span_calc_4 = $span_right_4 - $span_left_4;

			print LDA " ".$ld_str;
#		    print LDA " ".$range;
			print LDA " ".sprintf "%.1f",$index_pos + $span_left * 1000;
			print LDA " ".sprintf "%.1f",$index_pos + $span_right * 1000;
#		    print LDA " ".$span_right;
#		    print LDA " ".$span;
			print LDA " ".$span_calc;
#		    print LDA " ".$genes;



			print LDA " ".$ld_str_6;
			print LDA " ".sprintf "%.1f",$index_pos + $span_left_6 * 1000;
			print LDA " ".sprintf "%.1f",$index_pos + $span_right_6 * 1000;
#		    print LDA " ".$span_left_6;
#		    print LDA " ".$span_right_6;
			print LDA " ".$span_calc_6;


#		    print LDA " ".sprintf "%.1f",$index_pos + $span_left_4 * 1000;
#		    print LDA " ".sprintf "%.1f",$index_pos + $span_right_4 * 1000;
#		    print LDA " ".$span_calc_4;
			




			my @name_arr_loc = @{ $gene_name_arr[$chr] };
			my @star_arr_loc = @{ $gene_star_arr[$chr] };
			my @stop_arr_loc = @{ $gene_stop_arr[$chr] };
			
			my $left_m = $index_pos + $span_left_6*1000 ;
			my $right_m = $index_pos + $span_right_6*1000 ;
			my %gene_hash= ();
			my $gene_str = "";
			foreach my $ac (0..$#name_arr_loc) {
#			    print $name_arr_loc[$ac];
#			    print " ".$star_arr_loc[$ac];
#			    print " ".$stop_arr_loc[$ac];
#			    print " ".$left_m;
#			    print " ".$right_m."\n";
			    my $gene_left = $star_arr_loc[$ac] - 50000;
			    my $gene_right = $stop_arr_loc[$ac] + 50000;
			    my $name_loc = $name_arr_loc[$ac];
			    
			    if ($gene_left < $right_m ) {
				if ($gene_right > $left_m ) {
				    unless (exists $gene_hash{$name_loc}) {
					
					my $dist = 0;
					if ($gene_right < $index_pos) {
					    $dist = $gene_right - $index_pos;
					}
					if ($gene_left > $index_pos) {
					    $dist = $gene_left - $index_pos;
					}
					$dist = sprintf "%.1f",$dist/1000;
					
					
					
#				    print $name_loc."(".$dist.")"."\n";
					$gene_str .= ",".$name_loc."(".$dist.")";
					$gene_hash{$name_loc} = 1;
				    }
				    
				}
			    }
			}  ## end gene array

			$gene_str =~ s/^,//;
			$gene_str = "-" if ($gene_str eq "");
#		    $prekno_out = "-" if ($prekno_out eq "");

			print LDA " ".$prekno_out_2;
			print LDA " ".$gene_str;




			print LDA "\n";

			if (0) {
			    print "chr: $chr\n";
			    print $bb_hash{$index};
			    print " ".$range;
			    print " ".$span_left;
			    print " ".$span_right;
			    print " ".$span_calc;
			    print " ".$span_left_6;
			    print " ".$span_right_6;
			    print " ".$span_calc_6;
			    print " ".$span;
			    print " ".$gene_str;
			    print " ".$prekno_out_2;

			    print " ".$span_left_4;
			    print " ".$span_right_4;
			    print " ".$span_calc_4;
			    print "\n";


#			print "@name_arr_loc\n";

			    sleep(1);
			}
		    }


		    next unless (@cells == 6);
		    next unless ($cells[2] > 0);

#		if (exists $prekno_txt{$cells[0]}) {
#		    $prekno_out .= "/".$prekno_txt{$cells[0]};
#		    push @prekno_friends,$prekno_txt{$cells[0]}."$cells[0]($cells[2])"; 
#		    $prkno_hash{$cells[0]} = $prekno_txt{$cells[0]; 
#		}



#		push @ld_friends,"$cells[0]($cells[2])"; 
		    $ld_hash{$cells[0]} = $cells[2]; 
		    $dist_hash{$cells[0]} = $cells[1]; 

		    $span_left = $cells[1] if ($cells[1] < $span_left);
		    $span_right = $cells[1] if ($cells[1] > $span_right);

		    if ($cells[2] >= 0.6) {
			$span_left_6 = $cells[1] if ($cells[1] < $span_left_6);
			$span_right_6 = $cells[1] if ($cells[1] > $span_right_6);
			$ld_hash_6{$cells[0]} = $cells[2]; 
			$dist_hash_6{$cells[0]} = $cells[1]; 

		    }


		    if ($cells[2] >= 0.4) {
			$span_left_4 = $cells[1] if ($cells[1] < $span_left_4);
			$span_right_4 = $cells[1] if ($cells[1] > $span_right_4);

		    }

		    if (0) {
			print LDO "$index";
			print LDO "\t$cells[0]";
			print LDO "\t$cells[2]";
			print LDO "\n";
			$bestsnps {$cells[0]} = 1;
		    }
		}
		close LDI;
		close LDO;


#	    &mysystem ("cp $out_name.clump.chr$chr.r2 $rootdir");

#	    chdir ($rootdir);

#	    print "copied, now exit\n";

	    }
	}
#	exit;
    }
    
    
#/fg/debakkerscratch/ripke/plink/1.08/src/plink --bfile /home/radon01/sripke/bakker_ripke/hapmap_ref/subchr/hapmap3_r2_b36_fwd.consensus.qc.poly.chr20.CEUTSI.phased.bgl --clump daner_IBD_CD7.all --clump-verbose --clump-p1 0.0001 --clump-p2 0.0001 --clump-r2 0.20 --clump-kb 3000 --out plink2
    
    
    

#    exit ;


    close LDA;
    &mysystem ("cp $out_name.clump.areator $rootdir");
    chdir ($rootdir);
    &mysystem ("sort -k4,4g $out_name.clump.areator > $out_name.clump.areator.sorted");



    my $lcc =0;

    my $mhc = 0;
    die $! unless open IN, "< $out_name.clump.areator.sorted";
    die $! unless open OUT, "> $out_name.clump.areator.sorted.1mhc";
    while (my $line = <IN>){
	my @cells = @{&split_line_ref(\$line)};

	if ($cells[1] == 6 && $cells[2] > 25000000 && $cells[2] < 35000000) {
	    if ($mhc == 0) {
		$mhc = 1;
	    }
	    else {
		next;
	    }
	}
	$lcc++;
	print OUT "@cells\n";
    }

    if ($lcc < 2) {
	print "\n\n********* Warning: no index SNPs left\n";
	&mysystem ("txt2xls --pcol 3 --cogr 4,12,13,16 --txt $out_name.clump.areator.sorted.1mhc --xls $out_name.clump.areator.sorted.1mhc.xls");	
	exit;
    }

#    &mysystem ("txt2xls --pcol 3 --cogr 4,12,13,17 --txt $out_name.clump.areator.sorted --xls $out_name.clump.areator.sorted.xls");
    unless ($noxls) {
	&mysystem ("txt2xls --pcol 3 --cogr 4,12,13,16 --txt $out_name.clump.areator.sorted.1mhc --xls $out_name.clump.areator.sorted.1mhc.xls");
    }
    print "enough is enough\n";
    exit;
}



