#!/usr/bin/env perl
use strict;

srand(0);



### on LISA: sjamem up to 2000 means with 16 cores the 32 GB machines
### if sjamem more than 2000 -> 64 GB nodes and reduced number of cores

#######################################################################################
###
###  GENE ANNOTATION:
###
###   /home/radon01/sripke/bakker_ripke/hapmap_ref/impute2_ref/gene_ref$ prepare_refGene refGene_0413.txt
###
###########################################################################################

#print "********************************\nremember refGene annotaion\n";
#print "********************************\n";
#sleep(1);



## fasta file (used from HRC)
## ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz


# trying now without multithreading minimac
#my $ncpus_multi = 8;

### what is done where:
#
#
#
#
#  my.wget (vcf_download):
#        - download vcf and genetic maps
#       
#
#  my.prepvcf_filtnorm (bcftools_filtnorm):
#        - create filtnorm-vcf with mac > 4 and normalize multiallelic SNPs
#
#
#  prepare_ref_mm (prep_minimac):
#        - create minimac reference out of filtnorm vcf
#        - taken out of the pipeline for now
#
#
#  my.prepvcf_imp2 (bcftools_impute2):
#        - make impute reference out of filtnorm vcf (some SNP identifiers are introduced here)
#
#
#  vcf2plink (vcf2plink):
#        - create plink binary out of filtnorm vcf
#
#  refformat_legend_bim (refformat):
#        - reformat legend and bimfiles (this will be different to minimac):
#           - take only first SNP if SNPname has many SNPs divided by semicolon
#           - INDELs into I and D
#           - multiple occurrences renamed to chr:pos_a1_a2
#           - record changes into changes.gz files
#        - translate the new sample names into samples-file and fam-file per chromosome
#
#  legend_chunk (chunk)
#        - define chunks with --nsnps out of filtnorm positions
#           
#  pobed (plink_anc)
#        - separate plink binaries for each ancestry including frequencies
#
#
#
#
#
#    in imputation module: checkpos check for SNP overlap and then also for alleles if position gets translated, indels with full length alleles.
#                           also from readref maybe (or the position will tell)
#   - put the multi warning in refformat into a separate file (not only in changes)
#   - compare phasing with eagle


#############################
# load utility functions
#############################

use FindBin;
use lib "$FindBin::Bin";
use Ricopili::Utils qw(trans);
use Ricopili::Version;



# Site configuration: every key is looked up once through trans(), in a
# fixed order, and then bound to the script-wide variables below.
my %conf_value;
foreach my $conf_key (qw(p2loc queue hmloc email loloc bcrloc batch_jobcommand)) {
    $conf_value{$conf_key} = &trans($conf_key);
}

my $ploc   = $conf_value{p2loc};
my $qloc   = $conf_value{queue};
my $hmloc  = $conf_value{hmloc};
#my $vloc = &trans("vloc");

my $email  = $conf_value{email};
my $loloc  = $conf_value{loloc};
my $bcrloc = $conf_value{bcrloc};
my $bcmd   = $conf_value{batch_jobcommand};

###################################
# variables
####################################

# A batch command of "SERIAL" in the configuration switches serial mode on.
my $serial = ($bcmd eq "SERIAL") ? 1 : 0;
if ($serial) {
    print "-----------------------------------------------------\n";
    print "switched on SERIAL mode because of configuration file\n";
}




#my $version = "1.0.2";
# Script name without its directory, and the command line exactly as it was
# given (captured before option parsing consumes @ARGV).
my $progname = $0;
$progname =~ s!^.*/!!;
my $command_line = "$progname @ARGV";

# Filled in once the working directory is known.
my $rootdir = "";
my $info_txt = "";

# Postfix of phased impute files; overridable with --imputepostfix.
my $imp_phased_postfix = ".impute.phased";

# File name templates; "chrXXX" is the chromosome placeholder.
my $out_templ ="";
my $vcf_templ ="";
my $mach_templ ="";
my $imp2_templ ="";
my $hm3_templ ="";
my $gm_template = "genetic_map_chrXXX_combined_b37.txt";
my $gmchr_template = "genetic_map_chrXXX_combined_b37.chr.txt";
my $gmchr_template_23 = "genetic_map_chrXXX_combined_b37.chr.txt.23";

# Download locations for vcf files and genetic maps.
my $vcf_site = "";
my $gm_site = "http://mathgen.stats.ox.ac.uk/impute/ALL_1000G_phase1integrated_feb2012_impute/";

my $walltime = 4;

my $scsize = 5;

my $mac_th = 4;

my $sample_root = "";
my $sepa = 1;


#   661 AFR
#    347 AMR
#    504 EAS
#    503 EUR
#      1 GROUP
#    489 SAS


# Starts out empty on purpose.  A former default list (afr amr eas eur sas)
# was declared here and immediately masked by a second "my @pop_fams", so it
# never had any effect; only the effective declaration is kept.
my @pop_fams;


my $mhc_head =5;
my $q16_head =1;
my $walltimeplus = 0;

my $job_bn_th = 1000;
my $chunksnps = 30000; ## min number of SNPs in each chunk

# Command line parsing.  Options bound with "\my $name" create a new
# script-wide variable that stays undefined unless the option is given.
use Getopt::Long;
GetOptions(
    "out_templ=s"=> \$out_templ,
    "chunksnps=i"=> \$chunksnps,
    "sample_root=s"=> \$sample_root,
    "outname=s"=> \my $outname,
    "bn_job=i"=> \$job_bn_th,
    "force1"=> \my $force1,
    "vcf_templ=s"=> \$vcf_templ,
    "vcf_site=s"=> \$vcf_site,
    "mach_templ=s"=> \$mach_templ,
    "imp2_templ=s"=> \$imp2_templ,
    "scsize=i"=> \$scsize,
    "imputepostfix=s"=> \$imp_phased_postfix,
    "help"=> \my $help,
    "debug"=> \my $debug,
    "noprio"=> \my $noprio,
    "chr=i"=> \my $sichr,
    "begchr=i"=> \my $begchr,
    "endchr=i"=> \my $endchr,
    "subname=s"=> \my $subname,
    "mhc=s"=> \my $mhc,
    "mhead=i"=> \$mhc_head,
    "q16=s"=> \my $q16,
    # The usage text documents --qhead; the historical misspelling "qheadd"
    # is kept as an alias so existing command lines continue to work.
    "qhead|qheadd=i"=> \$q16_head,
    "mac_th=i"=> \$mac_th,
    "walltimeplus=i"=> \$walltimeplus,
    "serial"=> \my $serial_sw,
    "sepa=i"=> \$sepa,

    );



# --serial on the command line has the same effect as the SERIAL batch
# command in the configuration file.
if ($serial_sw) {
    $serial = 1;
}


############################################################
## testing binaries
##############################################################
# Helper programs of the pipeline.  Those that must be present on PATH are
# collected in @test_scripts further down.

### looked up on PATH before the pipeline starts
my $prepare_hm_ref_2_script   = "prepare_hm_ref_2";             ### my.pipeline_tar
my $filtnorm_translate_script = "filtnorm_translate_changes";   ### my.pipeline_tar
my $filtnorm_filter_script    = "filtnorm_filter";              ### my.pipeline_tar
my $refformat_script          = "refformat_legend_bim";         ### my.pipeline_tar
my $chunk_script              = "legend_chunk";                 ### my.pipeline_tar
my $vcf2plink_script          = "vcf2plink";                    ### my.pipeline_tar
my $plinkanc_script           = "plink_anc";                    ### my.pipeline_tar
my $ref2subchr2_script        = "ref2subchr2";                  ### my.pipeline_tar
my $floc2sumfrq_script        = "my.floc2sumfrq";               ### my.pipeline_tar
my $floc2sumfrq_script2       = "my.floc2sumfrq2";              ### my.pipeline_tar
my $refinfo_script            = "refinfo";                      ### my.pipeline_tar

### not part of the PATH check
my $mach2impute_script        = "mach2impute";                  ### not needed for now
my $preprefgene_script        = "prepare_refGene";              ### my.pipeline_tar
my $wget_script               = "my.wget";                      ### my.pipeline_tar
my $prepvcf_script_filtnorm   = "my.prepvcf_filtnorm";          ### my.pipeline_tar
my $prepvcf_script_imp2       = "my.prepvcf_imp2";              ### my.pipeline_tar
my $prepmm_script_bgz         = "prepare_ref_mm_bgz";           ### my.pipeline_tar
my $prepmm_script_tabix       = "prepare_ref_mm_tabix";         ### my.pipeline_tar
my $prepmm_script_m3vcf       = "prepare_ref_mm_m3vcf";         ### my.pipeline_tar
my $vcf2bcf_script            = "vcf2bcf";                      ### my.pipeline_tar
#my $vcf_script = "$vloc/vcftools";                    ###
my $annot_hg19pos_script      = "annot_hg19pos";                ###
my $impute2beagle_script      = "impute2beagle2";               ### not needed for now
my $beagle2plink_script       = "beagle2plink";                 ### not needed for now
my $plink_script              = "$ploc/plink";                  ###
my $beagle2impute_script      = "beagle2impute";                ### my.pipeline_tar
my $mutt_script               = "mail";                         ### my.pipeline_tar
my $blue_script               = "blueprint";                    ### my.pipeline_tar

# Order matters only for the order of the messages printed by the check.
my @test_scripts = (
    $prepare_hm_ref_2_script,
    $filtnorm_translate_script,
    $filtnorm_filter_script,
    $refformat_script,
    $chunk_script,
    $vcf2plink_script,
    $plinkanc_script,
    $ref2subchr2_script,
    $floc2sumfrq_script,
    $floc2sumfrq_script2,
    $refinfo_script,
);


# Banner of this module.
$rp_header =~ s/MODULE/refdir_navi  /;

print "$rp_header\n" ;



print ".......testing necessary binaries....\n" if ($debug);

# Every helper script must be an executable regular file in one of the
# PATH directories; the first directory that has it wins.
my $err_scr = 0;
die $! unless open FILE1, "> get_scripts_on_broad.txt";
SCRIPT:
foreach my $scr_name (@test_scripts) {

    foreach my $dir ( split /:/, $ENV{PATH} ) {
        next unless -f "$dir/$scr_name" && -x _;
        print "$scr_name\tfound in $dir\n" if ($debug);
        next SCRIPT;
    }

    # reached only when no PATH directory contains the script
    $err_scr = 1;
#    print FILE1 "cp /home/unix/sripke/bin/$scr_name ./\n";
    print "!!Error!! : No $scr_name command available\n" ;
}
close FILE1;

if ($err_scr == 1) {
    print "-> have a look at get_scripts_on_broad.txt\n";
    die ;
}
&mysystem ("rm get_scripts_on_broad.txt");





print ".......testing email program....\n" if ($debug);

# Look for the configured mail program first and fall back to mutt.  If
# neither is on PATH, notifications are switched off ($noti = 0) instead
# of aborting.
my $err_scr = 0;
my $noti = 1;
{
    # returns the full path of an executable found on PATH, '' otherwise
    my $find_on_path = sub {
        my ($prog) = @_;
        foreach my $dir ( split /:/, $ENV{PATH} ) {
            if ( -f "$dir/$prog" && -x _ ) {
                print "$prog\tfound in $dir\n" if ($debug);
                return "$dir/$prog";
            }
        }
        return '';
    };

    my $mailer_path = $find_on_path->($mutt_script);

    if ( !$mailer_path ) {

        print "!!Warning!! : No $mutt_script command available, trying mutt\n" if ($debug);

        $mutt_script = "mutt";
        $mailer_path = $find_on_path->($mutt_script);

        if ( !$mailer_path ) {
#            $err_scr = 1;
            print "!!Warning!! : No $mutt_script command available, no email notifications\n" ;
            $noti=0;
#            sleep (3);
        }
    }

}
die if $err_scr == 1;


print "....all necessary binaries found....\n" if ($debug);
print "------------------------------------\n" if ($debug);
#push @scripts,"id_tager_3";



# Print the usage text and stop when help was requested or one of the
# options tested here (--out_templ, --sample_root) is missing.
if ($help || $out_templ eq "" || $sample_root eq "") {
    print <<"END_OF_USAGE";
usage: $progname bim1 bim2

version: $rp_version

      options:

        --help               this help-text
        --debug              some more detailed output
        --outname STRING     just as identifier
        --sample_root STRING file with identifiers and sex and ancestry, e.g. integrated_call_samples_v3.20130502.ALL.panel
        --out_templ STRING   outname as temlate, e.g. ALL.august.chrXXX.recal.vrcut.EUR.beagle.vcf.imp
        --vcf_templ STRING   vcf template, 
        --vcf_site STRING    ftp site containing vfc files, 

        --mach_templ STRING mach template, 20101123.chrXXX, pointing to 20101123.chrXXX.hap.gz, 20101123.chrXXX.map and chrXXX.annotation.txt  

        --imp2_templ STRING  impute2 template, 20101123.chrXX, pointing to 20101123.chrXXX.hap.gz, 20101123.chrXXX.legend.gz  

           e.g. /humgen/1kg/processing/allPopulations_wholeGenome_august_release/calls/chrXXX/ALL.august.chrXXX.recal.vrcut.EUR.beagle.vcf

        --imputepostfix STRING  postfix after HM3-format, default: $imp_phased_postfix
                                NOPOSTFIX for empty

        --scsize INT      size of window in Mb
        --chr INT         only this chromosome, if not named, take all 22

        --begchr INT      beginning of chromosome list
        --endchr INT      end of chromosome list

        --noprio          no priority queue (for testing sets)

        --mac_th INT     include only variants with more than this minor-allele-count (default: $mac_th)

        --subname STRING  subchr_postfix

        --mhc STRING      take phased MHC reference: 
                             T1DGC_REF.bgl.phased T1DGC_REF.markers   -> --mhc T1DGC_REF

        --mhead INT       discard INT header rows, defailt: $mhc_head

        --q16 STRING      take phased reference for 16q22:
                             16q22_OMNI_HM3_snpname.bgl.phased 16q22_OMNI_HM3_snpname.markers --q16 16q22_OMNI_HM3_snpname.bgl

        --qhead INT       discard INT header rows, defailt: $q16_head

        --bn_job INT        submit INT jobs at a time
        --force1            do not exit if same fail, but do this only once
     
        --chunksnps INT    minimum number of SNPs in each chunk (default: $chunksnps) 

        --serial          no sending jobs to queue all in one run
                              -> usually only used for testing 
        --sepa INT        use INT number of parallel jobs during serial


        --walltimeplus INT  increase walltime by INT hours (important for bigger reference files), 
                                 prep_minimac_m3vcf gets ten times this amount since minimac is now single threaded 



 --out_templ is mandatory
 --outname is mandatory
 --sample_root is mandatory



for classic 1KG phase 3 use this (files will be downloaded after confirmation);

--vcf_site ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/
--vcf_templ ALL.chr1.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz
--sample_root integrated_call_samples_v3.20130502.ALL.panel







## outdated:

annotation of 1KG positions: 
       /humgen/1kg/analysis/main_project/Aug2010_whole_genome_release/bc.bi.ncbi.um.2of4.nogenotypes.refgene_annotated.vcf

/fg/debakkerscratch/ripke/hapmap_ref/1KG_got2d_0311/bc.bi.ncbi.um.2of4.nogenotypes.refgene_annotated.vcf.chr1





 created by Stephan Ripke 2010 at MGH, Boston, MA
 in the frame of the PGC

END_OF_USAGE
    exit 2;
}

# NOPOSTFIX is the command line spelling of "no postfix at all".
$imp_phased_postfix = "" if ($imp_phased_postfix eq "NOPOSTFIX");

# The chromosome placeholder is required unless --mhc or --q16 is given.
if (!$mhc && !$q16 && $out_templ !~ /chrXXX/) {
    print "please out_templ with chrXXX\n";
    die;
}


if (!$outname) {
    print "please provide --outname\n";
    exit;
}




###################################################
###  system call with test if successful
###################################################

# Runs a shell command and dies unless it finished with exit status 0.
#
#   @_  pieces of the command; they are joined with single blanks and passed
#       to system() as one string
#
# The complete wait status ($?) is examined, not just the exit code, so a
# command that could not be started or that was terminated by a signal is
# reported as a failure as well.  The empty prototype of the former
# definition was dropped: the sub takes arguments and all visible callers
# use the &-form, which ignores prototypes anyway.
sub mysystem {
    my ($systemstr)="@_";
    system($systemstr);
    my $wait_status = $?;
    return if ($wait_status == 0);

    die "$systemstr\n->system call failed: could not be started ($!)" if ($wait_status == -1);

    my $signal = $wait_status & 127;
    die "$systemstr\n->system call failed: terminated by signal $signal" if ($signal != 0);

    my $status = ($wait_status >> 8);
    die "$systemstr\n->system call failed: $status";
}








#####################################
# print array to file
####################################

# Writes @lines to $file exactly as given (no separators or newlines are
# added), replacing any previous content.
#
#   $file   path of the file to (over)write
#   @lines  strings to write
#
# Dies with the system error and the file name if the file cannot be opened
# or the data cannot be flushed on close.  The three-argument open takes the
# name literally, so leading/trailing blanks or mode characters in $file are
# not reinterpreted.
# NOTE: a second definition of a2file exists further down in this file; Perl
# keeps the later one, so both have to stay in step.
sub a2file {
    my ($file, @lines)=@_;
    open(my $out_fh, '>', $file) or die "$!: $file";
    foreach my $text (@lines){
        print {$out_fh} $text;
    }
    close($out_fh) or die "$!: $file";
}



##########################################
# subroutine to split a plink-output-line with references
##########################################

# Splits one text line into its whitespace separated fields.
#
#   $_[0]  reference to the line; the referenced string is left untouched
#
# Returns a reference to a new array of fields.  Leading whitespace never
# produces an empty first field.
sub split_line_ref {
    my $text = ${ $_[0] };
    chomp $text;
    # split ' ' skips leading whitespace and splits on runs of whitespace
    return [ split ' ', $text ];
}


#####################################
# append array to file with newline
####################################

# Appends every element of @lines to $file, each followed by a newline.
# The file is created if it does not exist yet.
#
#   $file   path of the file to append to
#   @lines  strings to append, without trailing newline
#
# Dies with the system error and the file name if the file cannot be opened
# or the data cannot be flushed on close.  The three-argument open takes the
# name literally.
sub a2filenew_app {
    my ($file, @lines)=@_;
    open(my $out_fh, '>>', $file) or die "$!: $file";
    foreach my $text (@lines){
        print {$out_fh} "$text\n";
    }
    close($out_fh) or die "$!: $file";
}






#####################################
# send jobs to cluster and also send navi again
#####################################

# State handed to send_jobarray.  The values below are the "not set"
# markers that send_jobarray rejects.
my ($sjadir, $sjaname, $sjarow) = ("", "", "");
my @sjaarray;
my ($sjaweek, $sjamem) = (0, 0);
my $sjatime = -1;
my $sjamaxjobs = 30000;

# Progress log; it has to exist before the pipeline is started.
my $sjainfofile = "$loloc/reference_dir_info";
if (!-e $sjainfofile) {
    print "log-file ($sjainfofile) is not existing\n";
    print "please check loloc in ~/ricopili.conf\n";
    exit;
}

my $sjainfotxt = "";
my $sjamulti = 0;



#####################################
# print array to file
####################################

# Writes @lines to $file exactly as given (no separators or newlines are
# added), replacing any previous content.
#
#   $file   path of the file to (over)write
#   @lines  strings to write
#
# Dies with the system error and the file name if the file cannot be opened
# or the data cannot be flushed on close.  The three-argument open takes the
# name literally, so leading/trailing blanks or mode characters in $file are
# not reinterpreted.
# NOTE: a2file is also defined earlier in this file; this later definition is
# the one Perl keeps.
sub a2file {
    my ($file, @lines)=@_;
    open(my $out_fh, '>', $file) or die "$!: $file";
    foreach my $text (@lines){
        print {$out_fh} $text;
    }
    close($out_fh) or die "$!: $file";
}
#####################################
# print array to file with newline
####################################

# Writes every element of @lines to $file, each followed by a newline,
# replacing any previous content.
#
#   $file   path of the file to (over)write
#   @lines  strings to write, without trailing newline
#
# Dies with the system error and the file name if the file cannot be opened
# or the data cannot be flushed on close.  The three-argument open takes the
# name literally.
sub a2filenew {
    my ($file, @lines)=@_;
    open(my $out_fh, '>', $file) or die "$!: $file";
    foreach my $text (@lines){
        print {$out_fh} "$text\n";
    }
    close($out_fh) or die "$!: $file";
}




#####################################
# send_jobarray
#
# Runs the commands collected in @sjaarray as one pipeline step, either
# through $blue_script or, in serial mode, directly, and then starts this
# script again with the original command line so the next step is picked
# up.  It never returns: every path ends in exit or die.
#
# All input comes from script-wide variables:
#   $sjadir      directory the job list is written to and run from
#   $sjaname     step name; the name "finished" only reports success
#   @sjaarray    one shell command per job
#   $sjamem      passed on as --mem
#   $sjatime     passed on as --wa
#   $sjaweek     if > 0, "--week 1" is added
#   $sjamulti    if > 0, "--multi <njobs>,$sjamulti" is added
#   $sjainfotxt  first part of the line appended to $sjainfofile
#
# If the last line of $sjainfofile already describes the same step with the
# same number of jobs, the step is considered stuck and the pipeline stops,
# unless --force1 was given.
#####################################
sub send_jobarray {

    die "send_jobarray with undefined variables, dir" if ($sjadir eq "");
    die "send_jobarray with undefined variables, name" if ($sjaname eq "");
    die "send_jobarray with undefined variables, array" if (@sjaarray == 0);
    die "send_jobarray with undefined variables, mem" if ($sjamem == 0);
    die "send_jobarray with undefined variables, time" if ($sjatime < 0);
    die "send_jobarray with undefined variables, info" if ($sjainfotxt eq "");

    print "Running job: $sjaname\n";

    # time stamp without blanks, used in the log line
    my $now = localtime time;
    $now =~ s/ /_/g;


    if ($sjaname eq "finished") {

        my $fini_message ;
        $fini_message .= "\n\n##################################################################\n";
        $fini_message .= "##### CONGRATULATIONS: \n";
        $fini_message .= "##### reference_pipeline finished successfully:\n";
        $fini_message .= "##### $sjainfotxt\n";

        $fini_message .= "##### have a look at the wiki page\n"; 
        $fini_message .= "##### https://sites.google.com/a/broadinstitute.org/ricopili/\n";
        $fini_message .= "##################################################################\n";
        print "$fini_message\n";


        die $! unless open SUC, "> success_file";
        print SUC $fini_message."\n";
        close SUC;

        if ($noti == 1) {
            &mysystem ('cat success_file | '.$mutt_script.' -s RP_reference_finished '.$email) ;
        }

        my $sjarow      = $sjainfotxt."\t$sjaname\t$now";
        &a2filenew_app("$sjainfofile",$sjarow);


        exit;

    }


    # Without this check a failing chdir would put the job list into the
    # current directory while $wt_file below still points into $sjadir.
    chdir ($sjadir) or die "could not change into $sjadir: $!";

    # never overwrite an older job list of the same step
    my $jobfile = "$sjaname.job_list";
    while (-e $jobfile) {
        $jobfile .= ".s";
    }
    &a2filenew ($jobfile, @sjaarray);


#    print "sleep, see $sjaname.job_list\n";
#    sleep (5);

    $walltime = $sjatime;
    my $nsja = @sjaarray;

    # number of jobs handed over at once is capped
    my $nsja_loc = $nsja;
    if ($nsja_loc > 30000) {
        $nsja_loc = 30000;
    }

    my $multi_txt = "";
    if ($sjamulti > 0) {
        $multi_txt = "--multi $nsja_loc,$sjamulti";
    }

    ### with array


    my $sja_week_str = "";
    if ($sjaweek > 0) {
        $sja_week_str = "--week 1";
    }


    # last line of the log as written by the previous run
    my $old_cmd = `tail -1 $sjainfofile | head -1`;

    my $sjacontent = "$sjaname.".@sjaarray;

    my $sjarow_part = $sjainfotxt."\t$sjacontent\t";
    my $sjarow      = $sjainfotxt."\t$sjacontent\t$now";
#    $message = $info_txt."\t$message\t$now";

    &a2filenew_app("$sjainfofile",$sjarow);

    # $sjarow_part contains a directory and the command line; it is quoted
    # with \Q..\E so that characters such as "+", "(" or "[" are compared
    # literally instead of being read as regex syntax.
    if ($old_cmd =~ /\Q$sjarow_part\E/){
        unless ($force1 ){
            my $err_message ;

            $err_message .= "##################################################################\n";
            $err_message .= "##### Error: \n";
            $err_message .= "##### step $sjaname has been done repeatedly without any progress\n";
            $err_message .= "##### reference pipeline stopped\n";
            $err_message .= "##### $sjacontent\n";
            $err_message .= "##### if reason does not appear obvious\n";
            $err_message .= "##### have a look at the wiki page\n"; 
            $err_message .= "##### https://sites.google.com/a/broadinstitute.org/ricopili/\n";
            $err_message .= "##### or contact the developers\n";
            $err_message .= "##### version: $rp_version\n";
            $err_message .= "##### internal: $sjarow_part\n";
#            $err_message .= "##### old_cmd: $old_cmd\n";
            $err_message .= "##################################################################\n";
            print "$err_message\n";

            die $! unless open ERR, "> error_file";
            print ERR $err_message."\n";
            close ERR;

            if ($noti == 1) {
                &mysystem ('cat error_file | '.$mutt_script.' -s RP_refdir_error '.$email) ;
            }

            exit;
        }

    }



    #################################
    ## starting the job array
    ##################################
    if ($serial) {

        print "starting step $sjaname with ".@sjaarray." jobs\n" if ($debug);
        print "running up to $sepa parallel jobs.\n" if ($debug);


        # Jobs are written in groups of $sepa into small shell scripts; each
        # job is put into the background and a final "wait" collects them.
        my $jc = 1;
#        my $job_str = "";
        my @job_sepa_arr;

        foreach (@sjaarray) {
            print "running job $jc...\n" ;
            push @job_sepa_arr, "$_ &";
#            $job_str .= "$_ & \n";

            if ($jc % $sepa == 0) {
                push @job_sepa_arr, "wait";
                #                $job_str .= "wait\n";
                my $sepa_file = "$sjaname.sepa.$jc";
                &a2filenew ($sepa_file,@job_sepa_arr);
                print "sepa_file: ".$sepa_file."\n" if ($debug);
                &mysystem("chmod u+x $sepa_file");
                &mysystem("./$sepa_file");
                @job_sepa_arr = ();
            }
            $jc++;

        }

        # remaining jobs of an incomplete last group
        if (@job_sepa_arr > 0) {
            $jc--;
            push @job_sepa_arr, "wait";

            my $sepa_file = "$sjaname.sepa.$jc";
            &a2filenew ($sepa_file,@job_sepa_arr);
            print "sepa_file: ".$sepa_file."\n" if ($debug);
            &mysystem("chmod u+x $sepa_file");
            &mysystem("./$sepa_file");
        }


    }
    else { 
        my $sys_loc = "$blue_script $sja_week_str --noerr --njob $nsja_loc --array $jobfile --wa $sjatime --mem $sjamem --j --na $jobfile $multi_txt";
        &mysystem ($sys_loc);
    }



    # --force1 is meant for one round only, so it is not passed on
    $command_line =~ s/--force1//;


    my $wt_file = "$sjadir/j.$jobfile.id";
    chdir "$rootdir" or die "something strange";    




    if ($serial) {
        # run the next round right away
        my $sys_re = "$command_line";
        &mysystem ($sys_re);
        exit;
    }
    else {
        my $sys_re = "$blue_script --njob $job_bn_th -b \"$command_line\" --wa 4 --di -j --fwt $wt_file --na _ref_$outname";

        &mysystem ($sys_re);

    }

    print "------------------------------------------------------------\n";
    print "$nsja jobs successfully submitted\n";
    print "please see tail of $sjainfofile for regular updates\n";
    print "also check bjobs -w for running jobs\n";
    print "you will be informed via email if errors or successes occur\n";
    print "------------------------------------------------------------\n";

    exit;


}








#####################################
# BEGIN
#####################################



use File::Copy;
use File::Path;
use Cwd;


# Working directory and the two descriptions of this run that are derived
# from it.
$rootdir    = &Cwd::cwd();
$info_txt   = "command:\t\"$command_line\"\tdir:\t$rootdir";
$sjainfotxt = "$rootdir\t$command_line";
#print "$info_txt\n";


#######################################################
#######################################################
## overwork vcf
######################################################
######################################################

# One (initially empty) job list per pipeline step.
my (
    @job_vtools,       @job_btools1,      @job_prepmm_bgz,
    @job_prepmm_tabix, @job_prepmm_m3vcf, @job_vcf2bcf,
    @job_filtnormtra,  @job_filtnormfil,  @job_btools2,
    @job_count,        @job_soll,         @job_cpgz,
    @job_vcf_dl,       @job_gm_dl,        @job_phased,
    @job_bgl,          @job_plink,        @job_refformat,
    @job_chunk,        @job_frq,          @job_pobed,
    @job_sumfrq,       @job_annot,
);

# Directory for the per-chromosome chunks, optionally tagged with --subname.
my $subchr_dir = $subname ? "$rootdir/subchr.$subname" : "$rootdir/subchr";



#my @created = mkpath(
#    $subchr_dir,
#    {verbose => 0, mode => 0750},
#    );



# Date stamp built from the second and fifth word of localtime's string.
my $now = localtime time;

my @cells = split /\s+/, $now;
my $date = join "_", @cells[1, 4];



##############################################################################
## check if refGene files are present, will be needed later during postimp
##################################################################################


#my @refgene_files = `ls $subchr_dir/refGene*`;
# Exactly one file matching refGene* has to be present in the working
# directory.  If there is none, the steps for creating one are shown and,
# after confirmation on the terminal, carried out.
my @refgene_files = `ls refGene*`;
if (@refgene_files == 0) {

    print "********************************\n";
    print "no refGene file found\n";
    print "********************************\n";
#    print "here is an old one: /psych/genetics_data/ripke/1000Genomes_reference/1KG_Oct14/1000GP_Phase3_sr/subchr/refGene_XXX.txt.out\n";
    print "********************************\n";
    print "its better to create an updated one yourself:\n";
    print "********************************\n";
#    print "cd $subchr_dir\n";

    # The manual recipe ends with the same rename/backup/clean steps as the
    # automated branch below.  Without them refGene.txt and refGene.txt.out
    # would both match refGene* and the "exactly one file" check after this
    # block would reject the result.
    print "wget -nv http://hgdownload.soe.ucsc.edu/goldenPath/hg19/database/refGene.txt.gz\n";
    print "zcat refGene.txt.gz > refGene.txt\n";
    print "$preprefgene_script refGene.txt\n";
    print "mv refGene.txt.out refGene.txt.$date.out\n";
    print "mv refGene.txt.gz bak.refGene.txt.gz\n";
    print "rm refGene.txt\n";
#    print "rm refGene_1114.txt\n";
#    print "cd $rootdir\n";
    print "********************************\n";


    print "You can do that yourself or let this pipeline do it.\n";
    print "Do you want to have that done right now (y/n)?\n";



    # Read the answer from the terminal.  The bare <> operator would read
    # from files named by leftover command line arguments instead.
    my $answer = <STDIN>;
    $answer = "" unless defined $answer;
    $answer = lc $answer;
    chomp $answer;
    if ($answer eq "y"){

        print "downloading most recent file...\n";
#        chdir $subchr_dir;
        my $sys = "wget -nv http://hgdownload.soe.ucsc.edu/goldenPath/hg19/database/refGene.txt.gz ";
        &mysystem ($sys);

        print "unzip...\n" if ($debug);
        $sys = "gunzip -c refGene.txt.gz > refGene.txt";
        &mysystem ($sys);

        print "reformat...\n" if ($debug);
        $sys = "$preprefgene_script refGene.txt";
        &mysystem ($sys);

        print "rename...\n" if ($debug);
        $sys = "mv refGene.txt.out refGene.txt.$date.out";
        &mysystem ($sys);

        # the backup name must not start with "refGene"
        print "backup...\n" if ($debug);
        $sys = "mv refGene.txt.gz bak.refGene.txt.gz";
        &mysystem ($sys);

        print "clean...\n" if ($debug);
        $sys = "rm refGene.txt";
        &mysystem ($sys);



        print "success with refGene file...\n";
        print "press any key and <return> to continue\n";



        $answer = <STDIN>;
        print "********************************\n";
        print "********************************\n";

    }
    else {
        print "go ahead and then restart the pipeline\n";
        exit;
    }


}


##############################################
### double check
##################################################


@refgene_files = `ls refGene*`;
if (@refgene_files != 1) {
    print "Error: there is not exactly one refGene - file\n";
    print "@refgene_files\n";
    exit;
}
my $refgene_file = $refgene_files[0];
chomp($refgene_file);


##############################################################################
## check if gwas catalog is present
##################################################################################


#my @refgene_files = `ls $subchr_dir/refGene*`;
# A GWAS catalog extract named after the current month and year is
# required.  If it is missing, the steps for creating it are shown and,
# after confirmation on the terminal, carried out.
unless (-e "gwascatalog.$date.rp.txt") {
#my @cat_files = `ls gwascatalog.$date.rp.txt`;
#if (@cat_files == 0) {

    print "********************************\n";
    print "no gwascatalog.txt file found\n";
    print "********************************\n";
    print "please create one yourself\n";
    print "********************************\n";
#    print "cd $subchr_dir\n";

    print "wget -nv http://www.ebi.ac.uk/gwas/api/search/downloads/full\n";
    print 'tr " " "_" < full > gwascatalog.txt.ow'."\n";
    print 'cat gwascatalog.txt.ow | cut -f2,8,12,13,21,22,28 > gwascatalog.txt.ow.short'."\n";
    print 'awk \'$7<5.0e-08{if (length($2)> 25 ) {$2 = substr($2,1,15)"..."}; print $6,$3,$4,$4,$7,$2"("$1")"}\' gwascatalog.txt.ow.short | awk \'NF > 5\' > gwascatalog.txt.ow.short.ow'."\n";
    print 'sort -k2,2n -k3,3n -k5,5g gwascatalog.txt.ow.short.ow > '."gwascatalog.$date.rp.txt\n";
## IMPROVE HERE
##    sed 's/(/ /' gwascatalog.Aug_2017.rp.txt | sort -k1,1 -k6,6  -k5,5g | awk '{key=$1" "$6; if (key in u) {} else {$6=$6"("$7;$7="";print; u[key]=1}}' > gwascatalog.Oct_2017.rp.txt
##
    print 'gzip full'."\n";
    print "********************************\n";


    print "You can do that yourself or let this pipeline do it.\n";
    print "Or do you want to have that done right now by the pipeline (y/n)?\n";



    # Read the answer from the terminal.  The bare <> operator would read
    # from files named by leftover command line arguments instead.
    my $answer = <STDIN>;
    $answer = "" unless defined $answer;
    $answer = lc $answer;
    chomp $answer;
    if ($answer eq "y"){

        my $sys = "rm -f full";
        &mysystem ($sys);

        print "downloading most recent file...\n";
#        chdir $subchr_dir;
        $sys = "wget -nv http://www.ebi.ac.uk/gwas/api/search/downloads/full ";
        &mysystem ($sys);



        print "reformat...\n" if ($debug);
        open(my $full_fh, '<', "full") or die $!."(full)";
        open(my $short_fh, '>', "gwascatalog.txt.ow.short") or die $!."(gwascatalog.txt.ow.short)";
        while (my $line = <$full_fh>) {
            $line =~ s/ /_/g;
            my @cells = split /\t/, $line;

            # NOTE(review): a non-numeric value in this column (for example
            # a header row) counts as 0 here and passes -- confirm whether
            # downstream code relies on that.
            next unless ($cells[27] < 5.0e-08);

            # SNP, chromosome, position and p-value must all be present.
            # The position column ($cells[12]) was missing from this test
            # because the SNP column was checked twice.
            next if (grep { !defined $_ || $_ eq "" } @cells[21, 11, 12, 27]);

            # shorten the entry of column 8 to 15 characters plus "..."
            if (length ($cells[7]) > 25) {
                $cells[7] = substr($cells[7],0,15);
                $cells[7] .= "...";
            }

#            if ($cells[11] eq "X") {
#                $cells[11] = 23;
#            }
#            if ($cells[11] eq "Y") {
#                $cells[11] = 24;
#            }
            print {$short_fh} join("\t",
                                   $cells[21],
                                   $cells[11],   ## chromosome
                                   $cells[12],
                                   $cells[12],
                                   $cells[27],
                                   $cells[7]."(".$cells[1].")"), "\n";
        }
        close($full_fh);
        # make sure everything is on disk before sort reads the file
        close($short_fh) or die $!."(gwascatalog.txt.ow.short)";

        print "sort...\n" if ($debug);
        $sys = 'sort -k2,2n -k3,3n -k5,5g gwascatalog.txt.ow.short > gwascatalog.'.$date.'.rp.txt';
        &mysystem ($sys);
        $sys = 'gzip -f full';
        &mysystem ($sys);

#        print "debug gwascatalog.rp.txt\n";
#        exit;



        print "success with gwascatalog file...\n";
        print "press any key and <return> to continue\n";

        print "********************************\n";
        print "********************************\n";



        $answer = <STDIN>;


    }
    else {
        print "go ahead and then restart the pipeline\n";
        exit;
    }


}

#print "debug\n";
#exit;


##################################################################
###################################################################
###################################################################
###################################################################
###################################################################
###################################################################
###
### check here if you want to run with only one chromosome
###
###################################################################
###################################################################
###################################################################
###################################################################
###################################################################


#my %snp_names;
#my %snp_names;
my @legend_arr;

# Chromosomes to process: 1..22 by default, a single one with --chr, or an
# explicit range when both --begchr and --endchr are given (the range wins).
my ($chr_start, $chr_end) = (1, 22);
($chr_start, $chr_end) = ($sichr, $sichr)   if ($sichr);
($chr_start, $chr_end) = ($begchr, $endchr) if ($begchr && $endchr);

my $hapout_name ;

# A mach template replaces the output template and fixes the postfix.
unless ($mach_templ eq "") {
    $out_templ = $mach_templ;
    $imp_phased_postfix = ".hap.phased";
}

# With --mhc or --q16 the rootdir_done marker is created right away.
&mysystem ("touch $rootdir/rootdir_done") if ($mhc || $q16);


### look for sample root
### look for sample root
my $sr_found = 0;


# The --sample_root file has to exist in the working directory.  If it does
# not, offer to download it from --vcf_site.
unless (-e "$sample_root") {
    print "********************************\n";
    print "--sample_root ($sample_root) not found, should I try to download at $vcf_site (y/n)?\n";

    # "-nv" as in the other downloads of this script; a bare "-n" followed
    # by the URL is not a valid wget invocation.
    my $sys = "wget -nv $vcf_site/$sample_root ";

    # Read the answer from the terminal.  The bare <> operator would read
    # from files named by leftover command line arguments instead.
    my $answer = <STDIN>;
    $answer = "" unless defined $answer;
    $answer = lc $answer;
    chomp $answer;
    if ($answer eq "y"){

        print "trying this:\n$sys\n";
        &mysystem ($sys);
        print "seems to have worked...\n" if ($debug);

    }
    else {
        print "Manually I would suggest this:\n";
        print "$sys\n";
        print "exit now\n";
        exit;
    }

}

print "********************************\n";
print "********************************\n";
print "********************************\n";

# final check, also covers a download that did not produce the file
unless (-e "$sample_root") {
    print "********************************\n";
    print "Error: --sample_root ($sample_root) not found\n";
    exit;
}



################################################
## out_templ
################################################



# Record site and templates of this reference once; an existing
# reference_templ file is left untouched.
if (!-e "reference_templ") {
    print "create file reference_templ\n";
    die $! unless open RT, "> reference_templ";
    print RT "vcf_site      $vcf_site\n"
           . "vcf_template  $vcf_templ\n"
           . "out_template  $out_templ\n";

    close RT;
}


### create sample_fam
# Files derived from the sample panel share its name plus a postfix.
my ($sample_fam, $sample_ca, $sample_fini) =
    map { $sample_root . $_ } (".fam", ".content", ".fini");
my %ca_hash;




#HG00097 GBR EUR female
#id3 id3 0 0 1 -9	HG00099 GBR EUR 2

## read out the collection of continental ancestries
unless (-e $sample_ca) {
    print "read continental info from $sample_root\n" if ($debug);
    
    die $!."($sample_root)" unless open IN, "< $sample_root";

    my $line = <IN>; #1
    my @cells = @{&split_line_ref(\$line)};
    my $super_pop_col = -1;
    foreach my $cc (0..$#cells){
	if ($cells[$cc] eq "super_pop") {
	    $super_pop_col = $cc;
	}
    }
    if ($super_pop_col == -1){
	print "Error: no super_pop header in $sample_root\n" ;
	exit;
    }
    while (my $line = <IN>){
	my @cells = @{&split_line_ref(\$line)};
	my $ca = $cells[$super_pop_col];
	$ca_hash{$ca} = 1;

    }
    close IN;

    print "write continental info into $sample_ca\n"  if ($debug);
    die $!."($sample_ca)" unless open OUT, "> $sample_ca";
    foreach my $ca (keys %ca_hash){
	print OUT "$ca\n";
	print "$ca\n"  if ($debug);
    }
    close OUT;

}
#print "debug\n";
#exit;


print "reading existing populations\n"  if ($debug);
## read ancestry collection
# The first field of every line of $sample_ca becomes one entry of @pop_fams.
open (IN, "< $sample_ca") || die $!."($sample_ca)";
while (my $row = <IN>){
    my $fields = &split_line_ref(\$row);
    push @pop_fams, $fields->[0];
}
close IN;


## create famfiles, etc.
# Builds $sample_fam (all individuals) and one "$sample_fam.<ancestry>" file
# per entry of @pop_fams from the panel file $sample_root.  Each output line
# is: FID (super_pop "_" pop), IID, 0, 0, sex, -9 (tab separated).
# The whole step is skipped once the marker file $sample_fini exists.
unless (-e $sample_fini) {

    print  "read sex of panel file\n"  if ($debug);
    open(my $panel_fh, '<', $sample_root) or die $!."($sample_root)";

    ## locate all needed columns in the header line
    my $header = <$panel_fh>; #1
    my @header_cells = @{&split_line_ref(\$header)};

    my $sex_col = -1;
    my $iid_col = -1;
    my $super_pop_col = -1;
    my $pop_col = -1;
    foreach my $cc (0..$#header_cells){
        my $name = $header_cells[$cc];
        $sex_col = $cc       if ($name eq "gender");
        $iid_col = $cc       if ($name eq "sample" || $name eq "IID");
        $super_pop_col = $cc if ($name eq "super_pop");
        $pop_col = $cc       if ($name eq "population" || $name eq "pop");
    }

    ## all header problems are reported before any output file is touched
    my $panel_sex = 1;
    if ($sex_col == -1){
        print "Warning: no gender header in $sample_root\n";
        $panel_sex = 0;
    }
    if ($iid_col == -1){
        print "Error: no sample or iid header in $sample_root\n";
        exit;
    }
    if ($super_pop_col == -1){
        print "Error: no super_pop header in $sample_root\n";
        exit;
    }
    if ($pop_col == -1){
        print "Error: no pop or population header in $sample_root\n";
        exit;
    }

    ## single pass over the panel: remember sex and population per individual
    my %sex_hash;
    my @panel_rows;   # [ super_pop, pop, iid ] in file order
    while (my $line = <$panel_fh>){
        my @cells = @{&split_line_ref(\$line)};

        my $iid = $cells[$iid_col];
        next unless (defined $iid && $iid ne "");   # blank or truncated line

        my $spop_cell = $cells[$super_pop_col];
        my $pop_cell = $cells[$pop_col];
        if ($spop_cell =~ /_/) {
            print "----------------------------------------------------------------\n";
            print "Error: no underscores are allowed for super population column\n";
            print "when using a new file, please also deleted this file: $sample_ca\n";
            exit;
        }
        if ($pop_cell =~ /_/) {
            print "----------------------------------------------------------------\n";
            print "Error: no underscores are allowed for population column\n";
            print "when using a new file, please also deleted this file: $sample_ca\n";
            exit;
        }

        # 0 = unknown; the gender cell is only looked at when the column
        # exists (index -1 would silently read the last column instead)
        my $sex = 0;
        if ($panel_sex) {
            my $sex_cell = $cells[$sex_col];
            $sex_cell = "" unless (defined $sex_cell);
            my $sex_code = ($sex_cell =~ /^\d+$/) ? $sex_cell + 0 : -1;
            if ($sex_cell eq "female" || $sex_code == 2) {
                $sex = 2;
            }
            elsif ($sex_cell eq "male" || $sex_code == 1) {
                $sex = 1;
            }
        }

        $sex_hash{$iid} = $sex;
        push @panel_rows, [$spop_cell, $pop_cell, $iid];
    }
    close $panel_fh;


    ## without a gender column the sex is taken from a samples file
    ## (first field = individual id, fourth field = sex)
    unless ($panel_sex) {
        print  "read sex of chr22-samples\n";
        my @samples_files = glob("*chr22*samples");
        if (@samples_files > 1) {
            print "Error: many *chr22*samples files\n";
            exit;
        }
        if (@samples_files < 1) {
            print "Error: no *chr22*samples files\n";
            print "checkout chrX samples\n";
            @samples_files = glob("*chrX*samples");
            if (@samples_files != 1) {
                print "Error: no *chrX*samples files either\n";
                exit;
            }
        }

        open(my $samples_fh, '<', $samples_files[0]) or die $!."($samples_files[0])";
        while (my $line = <$samples_fh>){
            my @cells = @{&split_line_ref(\$line)};
            my $iid = $cells[0];
            my $sex = $cells[3];
            $sex_hash{$iid} = $sex;
        }
        close $samples_fh;
    }


    ## write the fam file containing all individuals
    print "creating root population famfiles\n"  if ($debug);
    my @famlines;   # [ super_pop, fam line ] for the per-ancestry files below
    open(my $fam_fh, '>', $sample_fam) or die $!."($sample_fam)";
    foreach my $row (@panel_rows) {
        my ($spop_cell, $pop_cell, $iid) = @{$row};
        my $fid = $spop_cell."_".$pop_cell;
        my $out_line = join("\t", $fid, $iid, "0", "0", $sex_hash{$iid}, "-9");
        print $fam_fh "$out_line\n";
        push @famlines, [$spop_cell, $out_line];
    }
    close($fam_fh) or die $!."($sample_fam)";


    ### write the other ancestry famfiles
    foreach my $ca (@pop_fams) {

        print "creating population famfile for $ca\n"  if ($debug);
        open(my $ca_fh, '>', "$sample_fam.$ca") or die $!."($sample_fam.$ca)";

        # an individual belongs to $ca when its super population (the part
        # of the FID in front of the underscore) equals $ca
        foreach my $entry (@famlines) {
            my ($spop_cell, $out_line) = @{$entry};
            if ($spop_cell eq $ca) {
                print $ca_fh $out_line."\n";
            }
        }

        close($ca_fh) or die $!."($sample_fam.$ca)";
    }

    # marker file: this step is complete
    &mysystem ("touch $sample_fini");
}

#print "debug\n";
#exit;






# Answers to the interactive questions asked in the loops below
# ("" means: not asked yet).
my ($answer, $answer_gm) = ("", "");

# Set to 1 when a needed download was declined; checked after the main loop.
my $exit = 0;

# Per-chromosome file names collected in the main loop and concatenated into
# the infosum_pos.* files afterwards.
my (@plink_collection,
    @chunks_collection_T3,
    @chunks_collection_1,
    @chunks_collection_2,
    @chunks_collection_5,
    @chunks_collection_10,
    @chunks_collection_20);

# "mv" commands that would undo the renaming done in the next loop.
my @backup_arr;
my $gm_name;

#unless (-e "$rootdir/rootdir_done"){


my $answer_rename = "";

# For every chromosome whose vcf file (per $vcf_templ) is missing, look for a
# single differently named "*chr<chr>.samples" file and - after asking once -
# rename the samples / haplotypes.vcf.gz / legend.gz trio to the names derived
# from $vcf_templ.  Every rename is recorded in @backup_arr as the "mv"
# command that would undo it.
foreach my $chrint ($chr_start..$chr_end){

    # chromosome 23 is called "X" in all file names
    my $chr = $chrint;
    if ($chrint == 23) {
        $chr = "X";
    }

    my $vcf_name = $vcf_templ;
    $vcf_name =~ s/XXX/$chr/g;

    # nothing to rename if the vcf (or its "loc." variant) is already present
    next if (-e $vcf_name || -e "loc.".$vcf_name);

    # candidate samples files of this chromosome (impute files excluded)
    my @samples_found = grep { !/impute/ } glob("*chr$chr.samples");
    if (@samples_found == 0) {
        print "Warning: cannot find any matching samples filenames\n"  if ($debug);
        next;
    }
    if (@samples_found > 1) {
        print "Error: too many matching samples filenames\n";
        exit;
    }
    my $vcf_found = $samples_found[0];

    # ask only as long as no answer has been given; the answer is then
    # applied to all remaining chromosomes
    if ($answer_rename eq "") {
        print "********************************\n";
        print "vcf-file ($vcf_name) not found, but I found another possibility ($vcf_found).\n";
        print "do you want to rename all files with the matching template $vcf_templ (old names are lost) for all chromosomes? (y/n)\n";
        print "comment: thats expected for HRC imputation reference, so please say y\n";
        print "comment: also it's recommend to start with a copy (or a link of the files)\n";

        $answer_rename = lc <>;
        chomp $answer_rename;
    }
    next unless ($answer_rename eq "y");

    # new names are derived from the vcf name; if the suffix is not the
    # expected one all three targets would be identical and the files would
    # overwrite each other
    my $samples_name_new = $vcf_name;
    unless ($samples_name_new =~ s/\.haplotypes\.vcf\.gz$/.samples/) {
        print "Error: vcf name ($vcf_name) does not end in .haplotypes.vcf.gz, cannot derive names for renaming\n";
        exit;
    }
    my $legend_name_new = $vcf_name;
    $legend_name_new =~ s/\.haplotypes\.vcf\.gz$/.legend.gz/;

    # old names are derived from the samples file that was found
    my $vcf_name_old = $vcf_found;
    $vcf_name_old =~ s/\.samples$/.haplotypes.vcf.gz/;
    my $legend_name_old = $vcf_found;
    $legend_name_old =~ s/\.samples$/.legend.gz/;

    foreach my $pair ([$vcf_found,       $samples_name_new],
                      [$vcf_name_old,    $vcf_name],
                      [$legend_name_old, $legend_name_new]) {
        my ($old_name, $new_name) = @{$pair};
        &mysystem ("mv $old_name $new_name");
        push @backup_arr, "mv $new_name $old_name\n";
    }
}

# If files were renamed above, store the undo commands in
# "$vcf_templ.backup" and stop, so the result can be inspected before the
# script is started again.
if (scalar(@backup_arr) > 0) {
    open (BA, "> $vcf_templ.backup") || die $!."($vcf_templ.backup)";
    foreach my $undo_cmd (@backup_arr) {
        print BA $undo_cmd;
    }
    close BA;

    print "-----------------------------------------------\n",
          "renamed many files\n",
          "if you want to undo, please have a look at $vcf_templ.backup\n",
          "please check if you are happy with the results\n",
          "if yes, then restart\n";
    exit;
}


# The fasta file is expected inside $bcrloc; stop if it is not there.
my $fasta_file = $bcrloc."/human_g1k_v37.fasta";

if (! -e $fasta_file) {
    print "Error: $fasta_file not existing\n";
    exit;
}



# Main per-chromosome loop.  For every chromosome it derives all file names
# from the templates and, for every step whose ".fini" marker (or output
# file) is missing, pushes the corresponding command onto one of the @job_*
# arrays.  Nothing is submitted here; the arrays are handed to the job-array
# code after the loop.  Missing vcf / genetic map files lead to a download
# question; declining it sets $exit.
foreach my $chrint ($chr_start..$chr_end){

    # chromosome 23 is called "X" in all file names
    my $chr = $chrint;
    if ($chrint == 23) {
        $chr = "X";
    }

    $hapout_name = $out_templ.".impute";
    $hapout_name =~ s/XXX/$chr/g;

    if ($vcf_templ eq "") {
        print "right now only starting from vcffiles\n";
        exit;
    }

    my $vcf_name = $vcf_templ;
    $vcf_name =~ s/XXX/$chr/g;

    # use the file as named by the template if present, otherwise its
    # "loc." variant
    my $vcf_name_loc = (-e $vcf_name) ? $vcf_name : "loc.".$vcf_name;

    my $vcf_name_loc_filtnorm = $vcf_name_loc.".filtnorm.gz";
    my $vcf_name_loc_filtnorm_fini = $vcf_name_loc_filtnorm.".fini";

    my $vcf_name_loc_filtnorm_reform = $vcf_name_loc_filtnorm.".reform.gz";
    my $vcf_name_loc_filtnorm_reform_fini = $vcf_name_loc_filtnorm_reform.".fini";


    ###################################
    # download vcf files
    ####################################

    unless (-e $vcf_name_loc) {

        unless ($answer eq "y" || $answer eq "Y"){
            print "********************************\n";
            print "vcf-file ($vcf_name_loc) not found, should I try to download this and all other chromosomes(y/n)?\n";
        }

        my $sys = "$wget_script $vcf_site $vcf_name ";

        # the question is asked once; the answer holds for all chromosomes
        if ($answer eq "") {
            $answer = lc <>;
            chomp $answer;
        }

        if ($answer eq "y"){
            print "will put this into parallel jobs:\t$sys\n";
            push @job_vcf_dl, $sys;
        }
        else {
            $exit = 1;
            print "Answer: <$answer>\n";
            print "Manually I would suggest this:\n";
            print "$sys\n";
        }
    }


    ###################################
    # download genetic map files.
    ####################################

    $gm_name = $gm_template;

    # string comparison: $chr is either a chromosome number or "X"
    if ($chr eq "X"){
        $gm_name =~ s/XXX/X_nonPAR/g;
    }
    else {
        $gm_name =~ s/XXX/$chr/g;
    }

    my $gm_name_loc = "loc.".$gm_name;
    my $gm_name_fini = $gm_name.".fini";

    unless (-e $gm_name_fini) {

        if (-e $gm_name_loc) {
            # a finished download only needs to be moved into place
            &mysystem ("mv $gm_name_loc $gm_name");
            &mysystem ("touch $gm_name_fini");
        }
        else {

            print "---------------------------------\n";
            print "$gm_name_loc not existing\n";

            unless ($answer_gm eq "y" || $answer_gm eq "Y"){
                print "gm-file ($gm_name_loc) not found, should I try to download this and all other chromosomes(y/n)?\n";
            }
            my $sys = "$wget_script $gm_site $gm_name ";
            if ($debug){
                print "debug: $sys\n";
                sleep(1);
            }

            if ($answer_gm eq "") {
                $answer_gm = lc <>;
                chomp $answer_gm;
            }

            if ($answer_gm eq "y" || $answer_gm eq "Y"){
                print "will put this into parallel jobs:\t$sys\n";
                push @job_vcf_dl, $sys;
            }
            else {
                $exit = 1;
                # report the answer given to the genetic-map question
                print "Answer: <$answer_gm>\n";
                print "Manually I would suggest this:\n";
                print "$sys\n";
            }
        }
    }


    ###################################
    # filter and convert vcf with bcftools
    ####################################

    unless (-e $vcf_name_loc_filtnorm_fini) {

        ######### on the X only include non-PAR
        my $chrX_filter = "";
        if ($chr eq "X") {
            $chrX_filter = "--chrX --fam $sample_root.fam";
        }
        push @job_btools1, "$prepvcf_script_filtnorm $chrX_filter --vcf $vcf_name_loc --mac_th $mac_th";
    }


    ###################################
    # filter indels
    #     if indels on same position only keep the ones with highest AC
    #     otherwise minimac3 gets in trouble since pipeline translates indels
    #        into I D as alleles and minimac doesn't care about snpnames
    ####################################

    unless (-e $vcf_name_loc_filtnorm_reform_fini) {
        push @job_filtnormfil, "$filtnorm_filter_script --filtnorm $vcf_name_loc_filtnorm";
    }


    ###################################
    # create impute-format with bcftools
    ####################################

    unless (-e $hapout_name.".fini") {
        push @job_btools2, "$prepvcf_script_imp2 --vcf $vcf_name_loc_filtnorm_reform --out $hapout_name";
    }


    ###################################
    # disabled: start from mach / impute2 haplotypes instead of vcf
    ####################################

    if (0) {
        if ($mach_templ ne "") {

            my $mach_name = $mach_templ.".hap";
            $mach_name =~ s/XXX/$chr/g;

            unless (-e $mach_name) {
                die unless (-e "$mach_name.gz");
                push @job_cpgz, "zcat $mach_name.gz > $mach_name.tmp; mv $mach_name.tmp $mach_name";
            }

            # create impute-format with mach2impute
            $hapout_name = "$mach_name.hap";
            my $map_name = $mach_templ.".map";
            $map_name =~ s/XXX/$chr/g;
            my $anno_name = "chrXXX.annotation.txt";
            $anno_name =~ s/XXX/$chr/g;

            unless (-e $hapout_name) {
                push @job_vtools, "$mach2impute_script --hap $mach_name --anno $anno_name --map $map_name"
            }
        }

        if ($imp2_templ ne "") {

            my $imp2_name = $imp2_templ.".hap";
            $imp2_name =~ s/XXX/$chr/g;

            $hapout_name = "$imp2_name";

            unless (-e $imp2_name) {
                die unless (-e "$imp2_name.gz");
                push @job_cpgz, "zcat $imp2_name.gz > $imp2_name.tmp; mv $imp2_name.tmp $imp2_name";
            }

            my $imp2_lname = $imp2_templ.".legend";
            $imp2_lname =~ s/XXX/$chr/g;

            unless (-e $imp2_lname) {
                die unless (-e "$imp2_lname.gz");
                push @job_cpgz, "zcat $imp2_lname.gz > $imp2_lname.tmp; mv $imp2_lname.tmp $imp2_lname";
            }
        }
    }


    ###################################
    # disabled: create beagle - format (not done any more)
    ####################################

    if (0) {

        my $phased_name = $out_templ.$imp_phased_postfix;
        $phased_name =~ s/XXX/$chr/g;
        my $phased_name_count = $phased_name.".count";

        if (! -e $phased_name) {
            push @job_phased, "$impute2beagle_script --out $phased_name --chr $chr $hapout_name";
        }
        elsif (! -e $phased_name_count) {
            my $sys = "wc -l $phased_name > $phased_name_count";
            print "$sys\n";
            &mysystem ($sys);
        }

        my $infopos_name = $out_templ.$imp_phased_postfix.".bgl.info_pos";
        $infopos_name =~ s/XXX/$chr/g;
        my $bgl_name_count = $out_templ.$imp_phased_postfix.".bgl.count";
        $bgl_name_count =~ s/XXX/$chr/g;

        if (! -e $infopos_name) {
            push @job_bgl, "$prepare_hm_ref_2_script --not --noaffy $phased_name";
        }
        elsif (! -e $bgl_name_count) {
            # guarded by the file this command itself writes
            my $sys = "wc -l $phased_name.annot.markers $phased_name.mono $phased_name.multi > $bgl_name_count";
            &mysystem ($sys);
        }
    }


    ###################################
    # plink binary and reformatted reference
    ####################################

    my $plink_name = $hapout_name.".plink";
    push @plink_collection, $plink_name;

    unless (-e "$plink_name.fini") {
        push @job_plink, "$vcf2plink_script --vcf $vcf_name_loc_filtnorm_reform --out $plink_name";
    }

    unless (-e "$hapout_name.rf.fini") {
        push @job_refformat, "$refformat_script --root $hapout_name --sample $sample_root.fam --gm $gm_name";
    }


    ###################################
    # translate filtnorm vcf with the recorded changes
    ####################################

    my $changes_file = "$hapout_name.legend.gz.rf.gz.changes.gz";

    # "<hapout>.vcf.gz" and its marker, both derived from the changes file name
    my $filtnorm_trans_file = $changes_file;
    $filtnorm_trans_file =~ s/legend\.gz\.rf\.gz\.changes\.gz/vcf.gz/;
    my $filtnorm_trans_fini = $changes_file;
    $filtnorm_trans_fini =~ s/legend\.gz\.rf\.gz\.changes\.gz/vcf.gz.fini/;

    unless (-e $filtnorm_trans_fini) {
        push @job_filtnormtra, "$filtnorm_translate_script --filtnorm $vcf_name_loc_filtnorm_reform --changes $changes_file";
    }


    ###################################
    # minimac preparation (bgz, tabix, m3vcf) on the translated vcf
    ####################################

    foreach my $prep ([ "bgz",   $prepmm_script_bgz,   \@job_prepmm_bgz   ],
                      [ "tabix", $prepmm_script_tabix, \@job_prepmm_tabix ],
                      [ "m3vcf", $prepmm_script_m3vcf, \@job_prepmm_m3vcf ]) {
        my ($tag, $script, $jobs) = @{$prep};
        unless (-e "$filtnorm_trans_file.$tag.fini") {
            push @{$jobs}, "$script $filtnorm_trans_file";
        }
    }


    ###################################
    # vcf.bgz -> bcf.bgz
    ####################################

    my $vcf_bgz_name = $filtnorm_trans_file;
    $vcf_bgz_name =~ s/\.vcf\.gz/.vcf.bgz/;
    my $bcf_name_loc = $filtnorm_trans_file;
    $bcf_name_loc =~ s/\.vcf\.gz/.bcf.bgz/;

    unless (-e "$bcf_name_loc.fini") {
        push @job_vcf2bcf, "$vcf2bcf_script $vcf_bgz_name";
    }


    ###################################
    # chunk definitions in several sizes:
    #   T3 = a third of $chunksnps, the others = $chunksnps times 1/2/5/10/20
    ####################################

    my @chunk_sets = (
        [ "T3", sprintf("%d", $chunksnps / 3), \@chunks_collection_T3 ],
        [ "1",  $chunksnps * 1,                \@chunks_collection_1  ],
        [ "2",  $chunksnps * 2,                \@chunks_collection_2  ],
        [ "5",  $chunksnps * 5,                \@chunks_collection_5  ],
        [ "10", $chunksnps * 10,               \@chunks_collection_10 ],
        [ "20", $chunksnps * 20,               \@chunks_collection_20 ],
    );

    foreach my $set (@chunk_sets) {
        my ($tag, $nsnps, $collection) = @{$set};
        my $chunk_out = "$hapout_name.legend.gz.chunks_$tag";

        unless (-e "$chunk_out.fini") {
            push @job_chunk, "$chunk_script --nsnps $nsnps --legend $hapout_name.legend.gz --out $chunk_out";
        }
        # collected regardless of the marker: needed for infosum_pos.chunks_*
        push @{$collection}, $chunk_out;
    }


    ###################################
    # one plink binary per ancestry
    ####################################

    foreach my $pf_loc (@pop_fams) {

        my $keep_file = "$sample_fam.$pf_loc";
        my $bfile_pf = "$plink_name.$pf_loc";

        unless (-e "$bfile_pf.fini"){
            push @job_pobed, "$plinkanc_script --bfile $plink_name --out $bfile_pf --keep $keep_file";
        }
    }
}



# A declined download in the main loop ends the run here.
if ($exit == 1) {
    print "exit due to error\n";
    exit;
}


# Hand every non-empty job list to send_jobarray, in pipeline order.
# Columns: job name, factor applied to $walltimeplus (time is 2 plus that),
# memory value ($sjamem), job list.
foreach my $stage (
    [ "vcf_download",        1,   2000, \@job_vcf_dl       ],
    [ "copyunzip",           1,   2000, \@job_cpgz         ],
    [ "vcftools",            1,   4000, \@job_vtools       ],
    [ "bcftools_filtnorm",   1,   2000, \@job_btools1      ],
    [ "filter_filtnorm",     1,   2000, \@job_filtnormfil  ],
    [ "bcftools_impute2",    1,   2000, \@job_btools2      ],
    [ "vcf2plink",           1,   2000, \@job_plink        ],
    [ "refformat",           1,   2000, \@job_refformat    ],
    [ "trans_filtnorm",      1,   2000, \@job_filtnormtra  ],
    [ "prep_minimac_bgz",    1,   2000, \@job_prepmm_bgz   ],  ## mem is per core
    [ "prep_minimac_tabix",  0,   2000, \@job_prepmm_tabix ],  ## mem is per core
    [ "prep_minimac_m3vcf",  10, 24000, \@job_prepmm_m3vcf ],  ## mem is per core
    [ "vcf2bcf",             1,   2000, \@job_vcf2bcf      ],  ## mem is per core
    [ "chunk",               0,   2000, \@job_chunk        ],
    [ "pobed",               1,   4000, \@job_pobed        ],
) {
    my ($stage_name, $wt_factor, $stage_mem, $stage_jobs) = @{$stage};

    # the list is inspected only now, i.e. after all earlier stages were handled
    next unless (@{$stage_jobs} > 0);

    $sjadir = $rootdir;
    $sjaname = $stage_name;
    if ($wt_factor == 0) {
        $sjatime = 2;
    }
    elsif ($wt_factor == 1) {
        $sjatime = 2 + $walltimeplus;
    }
    else {
        $sjatime = 2 + $walltimeplus * $wt_factor;
    }
    $sjamem = $stage_mem;
    @sjaarray = @{$stage_jobs};

    &send_jobarray;
}


# disabled: phasing / beagle stages
if (0) {
    foreach my $stage (
        [ "phased", 2000, \@job_phased ],
        [ "beagle", 4000, \@job_bgl    ],
    ) {
        my ($stage_name, $stage_mem, $stage_jobs) = @{$stage};
        next unless (@{$stage_jobs} > 0);

        $sjadir = $rootdir;
        $sjaname = $stage_name;
        $sjatime = 2 + $walltimeplus;
        $sjamem = $stage_mem;
        @sjaarray = @{$stage_jobs};

        &send_jobarray;
    }
}


#    print "debug\n";
#    exit;



#    if (@job_frq > 0 ) {


#	$sjadir = $rootdir;
#	$sjaname = "freq";
#	$sjatime = 2;
#	$sjamem = 4000;
#	@sjaarray = @job_frq;

#	&send_jobarray;



    #   }


# infosum_pos.nsnps: "wc -l" of every per-chromosome bim file, assembled in
# a .tmp file that is moved into place at the end.
if (! -e "infosum_pos.nsnps") {
    print "write infosum_pos.nsnps\n" if ($debug);

    # start from an empty tmp file
    &mysystem ("rm infosum_pos.nsnps.tmp") if (-e "infosum_pos.nsnps.tmp");
    &mysystem ("touch infosum_pos.nsnps.tmp");

    foreach my $plink_root (@plink_collection) {
        &mysystem ("wc -l  $plink_root.bim >> infosum_pos.nsnps.tmp");
    }
    &mysystem ("mv infosum_pos.nsnps.tmp infosum_pos.nsnps");
}


# infosum_pos.chunks_<size>: concatenation of the per-chromosome chunk files
# of that size, built the same way (tmp file first, then mv).
foreach my $chunk_set (
    [ "T3", \@chunks_collection_T3 ],
    [ "1",  \@chunks_collection_1  ],
    [ "2",  \@chunks_collection_2  ],
    [ "5",  \@chunks_collection_5  ],
    [ "10", \@chunks_collection_10 ],
    [ "20", \@chunks_collection_20 ],
) {
    my ($size_tag, $chunk_files) = @{$chunk_set};
    my $summary = "infosum_pos.chunks_$size_tag";

    next if (-e $summary);

    print "write $summary\n" if ($debug);
    &mysystem ("rm $summary.tmp") if (-e "$summary.tmp");
    &mysystem ("touch $summary.tmp");

    foreach my $chunk_file (@{$chunk_files}) {
        &mysystem ("cat  $chunk_file >> $summary.tmp");
    }
    &mysystem ("mv $summary.tmp $summary");
}







    

#    print "debug\n";
#    exit;


    # disabled: per-ancestry frequency summaries (two scripts per ancestry,
    # each skipped when its output / marker file already exists)
    if (0){

        foreach my $pf_loc (@pop_fams) {

            # both scripts are called with the same argument list
            my $sumfrq_args = "--chrstart $chr_start --chrend $chr_end --out sumfrq.$pf_loc --bim ${out_templ}${imp_phased_postfix}.bgl.bim --frq $out_templ.$pf_loc.frq";

            if (! -e "sumfrq.$pf_loc") {
                push @job_sumfrq, "$floc2sumfrq_script $sumfrq_args";
            }

            if (! -e "sumfrq.$pf_loc.done") {
                push @job_sumfrq, "$floc2sumfrq_script2 $sumfrq_args";
            }
        }

        if (scalar(@job_sumfrq) > 0) {
            ($sjadir, $sjaname) = ($rootdir, "sumfreq");
            $sjatime = 2 + $walltimeplus;
            $sjamem = 2000;
            @sjaarray = @job_sumfrq;

            &send_jobarray;
        }
    }






    
#print "$rootdir\n";
#exit;
    
#    &mysystem ("touch $rootdir/rootdir_done");
	

#}





#unless (-e "popdirs_done") {


# Populate one working directory per ancestry group (pop_<ancestry>):
#   - write a small "reference_templ" description file,
#   - copy the chunk definition files, the GWAS catalog and the refGene file,
#   - per chromosome: move the ancestry-specific plink binaries into the
#     directory and copy the genetic map next to them.
# Every step is guarded by an existence test on its target, so steps whose
# target already exists are skipped on a later run.
foreach my $pf_loc (@pop_fams) {

    my $pop_dir = "pop_$pf_loc";

    unless (-e $pop_dir) {
        &mysystem ("mkdir $pop_dir");
    }

    print "----------------------------------------------\n" if ($debug);
    print "----------------------------------------------\n" if ($debug);
    print "work on ancestry subdir $pop_dir\n" if ($debug);
    print "----------------------------------------------\n" if ($debug);

    print "create reference_templ to ancestry subdir\n" if ($debug);
    unless (-e "$pop_dir/reference_templ") {
        print "create file reference_templ\n" if ($debug);
        my $rt_path = "$pop_dir/reference_templ";
        # three-argument open with a lexical handle: the path is never
        # parsed for a mode prefix
        open (my $rt_fh, '>', $rt_path) or die $!."($rt_path)";
        print {$rt_fh} "vcf_site      $vcf_site\n";
        print {$rt_fh} "vcf_template  $vcf_templ\n";
        print {$rt_fh} "out_template  $out_templ\n";
        print {$rt_fh} "bfile_template  $out_templ.impute.plink.$pf_loc\n";
        # buffered write errors only surface at close; an incomplete file
        # would otherwise pass the existence test above on the next run
        close ($rt_fh) or die $!."($rt_path)";
    }

    print "copy chunkinfo to ancestry subdir\n" if ($debug);

    # same copy step for every chunk definition file
    foreach my $chunk_tag (qw(T3 1 2 5 10 20)) {
        print "copy chunkinfo_$chunk_tag to ancestry subdir\n" if ($debug);
        unless (-e "$pop_dir/infosum_pos.chunks_$chunk_tag") {
            &mysystem ("cp infosum_pos.chunks_$chunk_tag $pop_dir/");
        }
    }

    print "copy gwascatalog to ancestry subdir\n" if ($debug);
    unless (-e "$pop_dir/gwascatalog.$date.rp.txt") {
        &mysystem ("cp gwascatalog.$date.rp.txt $pop_dir/");
    }

    print "copy refGene to ancestry subdir\n" if ($debug);
    unless (-e "$pop_dir/$refgene_file") {
        &mysystem ("cp $refgene_file $pop_dir/");
    }

    foreach my $chrint ($chr_start..$chr_end) {

        # chromosome 23 is spelled "X" in file names
        my $chr = ($chrint == 23) ? "X" : $chrint;

        $hapout_name = $out_templ.".impute";
        $hapout_name =~ s/XXX/$chr/g;
        my $plink_name = $hapout_name.".plink";
        my $bfile_pf = "$plink_name.$pf_loc";

        # The .fam file is tested and is moved last.  Plain system() is kept
        # here, so a failing mv is not checked by this block.
        # NOTE(review): looks like deliberate best-effort - confirm.
        unless (-e "$pop_dir/$bfile_pf.fam") {
            print "moving $bfile_pf.bed/bim/fam\n" if ($debug);
            system ("mv $bfile_pf.bed $pop_dir/");
            system ("mv $bfile_pf.bim $pop_dir/");
            system ("mv $bfile_pf.fam $pop_dir/");
        }

        ################################################
        ### copy genetic map files for chrX
        ################################################

        # $chr is a string here, so it has to be compared with eq; the
        # numeric == only matched because "X" evaluates to 0 as a number.
        if ($chr eq "X") {
            # for each map template, copy the X_nonPAR file to the plain X name
            foreach my $map_templ ($gm_template, $gmchr_template, $gmchr_template_23) {
                (my $map_source = $map_templ) =~ s/XXX/X_nonPAR/g;
                (my $map_target = $map_templ) =~ s/XXX/X/g;

                unless (-e $map_target) {
                    &mysystem ("cp $map_source $map_target");
                }
            }
        }

        $gm_name = $gm_template;
        $gm_name =~ s/XXX/$chr/g;

        # unchecked copy, as in the mv calls above
        unless (-e "$pop_dir/$gm_name") {
            system ("cp $rootdir/$gm_name $pop_dir/$gm_name");
        }
    }
}
#}
#&mysystem ("touch $rootdir/popdirs_done");



    

#############################################################
## SUCCESSSS
#############################################################

# Submit the closing "finished" job array from the root directory.  The
# placeholder entry "tmp" is appended to @sjaarray (the array is not reset
# here) before send_jobarray is called.
($sjadir, $sjaname) = ($rootdir, "finished");
($sjatime, $sjamem) = (2, 1000);
push @sjaarray, "tmp";

&send_jobarray;






##################################################################
#### Q16
###################################################################
# Special case for a reference given through $q16 (output prefix
# "$q16.chr16_071_074.bgl.phased"): strip the first $q16_head lines from
# the phased file, convert it, write the summary files, link the genetic
# map and finish via reinvo_b.  The script exits at the end of this block.
if ($q16) {
    print "out: $out_templ\n";
    print "postfix: $imp_phased_postfix\n";
    print "q16: $q16\n";
    my $q16_phased = "$q16.chr16_071_074.bgl.phased";

    # the copied markers file doubles as the "reformatting done" flag
    unless (-e "$q16_phased.markers") {
        print "reformat phased file\n";
        my $phain = "$q16.bgl.phased";
        my $phaout = $q16_phased;

        open (my $in_fh, '<', $phain) or die $!."($phain)";
        open (my $out_fh, '>', $phaout) or die $!."($phaout)";

        # drop the leading header lines
        foreach my $head_line (1..$q16_head) {
            my $line = <$in_fh>;
            print "discard line $head_line\n";
        }

        # copy everything after the header unchanged
        while (my $line = <$in_fh>) {
            print {$out_fh} $line;
        }
        close ($in_fh);
        # a failed write must not be followed by the "done" marker below
        close ($out_fh) or die $!."($phaout)";

        &mysystem ("cp $q16.markers $q16_phased.markers.tmp");
        &mysystem ("mv $q16_phased.markers.tmp $q16_phased.markers");
    }

    my $q16_imp2 = "$q16_phased.hap.gz";
    unless (-e $q16_imp2) {
        &mysystem ("beagle2impute $q16_phased");
    }

    my $q16_bim = "$q16_phased.bim";
    unless (-e $q16_bim) {
        # The conversion itself is switched off: the script stops here.
        # The call that used to follow the exit could never run:
        #   &mysystem ("$beagle2plink_script $q16_phased");
        print "Error: $beagle2plink_script needs famfile\n";
        exit;
    }

    my $q16_info = "$q16_phased.info_pos";
    unless (-e $q16_info) {
        print "refinfo q16\n";
        &mysystem ("refinfo --bgl $q16_phased --annot $q16_bim --bim");
    }

    my $suminfo = "infosum_pos";
    my $suminfo_n = "$suminfo.nsnps";
    my $suminfo_r = "$suminfo.reffiles";
#    my $suminfo_s = "$suminfo.sorted";

    print "summary files q16\n";
#    unless (-e "$suminfo_s"){
#	&mysystem ("cat *.info_pos  | grep -v SNP > $suminfo");
#	&mysystem ("sort -k1,1 -u $suminfo > $suminfo_s.tmp");
#	&mysystem ("mv $suminfo_s.tmp $suminfo_s");
#    }

    # line counts of all info_pos files, written via a .tmp file
    unless (-e "$suminfo_n") {
        &mysystem ("wc -l *.info_pos > $suminfo_n.tmp");
        &mysystem ("mv $suminfo_n.tmp $suminfo_n");
    }

    # list of reference files: only the single q16 file
    unless (-e "$suminfo_r") {
        &mysystem ("ls $q16_phased > $suminfo_r.tmp");
        &mysystem ("mv $suminfo_r.tmp $suminfo_r");
    }

    # NOTE(review): this links the chr6 genetic map although the region is
    # named chr16_071_074 - possibly copied from the MHC block; confirm
    # whether the chr16 map is intended.
    my $gema_file = "genetic_map_chr6_combined_b37.txt";

    unless (-e $gema_file) {
        &mysystem ("ln -s /fg/debakker_scratch/ripke/hapmap_ref/impute2_ref/1KG_Aug12/ALL_1000G_phase1integrated_v3_impute_macGT1/$gema_file .");
    }

#    unless (-e "sumfrq.eur.sorted"){
#	&mysystem ("cp $suminfo_s sumfrq.eur.sorted.tmp");
#	&mysystem ("mv sumfrq.eur.sorted.tmp sumfrq.eur.sorted");
#    }

    &reinvo_b ("everything-is-fine","everythingdone");
    exit;
}

###################################################################
#### MHC
###################################################################
# Special case for a reference given through $mhc (output prefix
# "$mhc.chr6_025_035.bgl.phased"): strip the first $mhc_head lines from
# the phased file, convert it, write the summary files, link the genetic
# map and finish via reinvo_b.  The script exits at the end of this block.
if ($mhc) {
    print "out: $out_templ\n";
    print "postfix: $imp_phased_postfix\n";
    print "mhc: $mhc\n";
    my $mhc_phased = "$mhc.chr6_025_035.bgl.phased";

    # the copied markers file doubles as the "reformatting done" flag
    unless (-e "$mhc_phased.markers") {
        print "reformat phased file\n";
        my $phain = "$mhc.bgl.phased";
        my $phaout = $mhc_phased;

        open (my $in_fh, '<', $phain) or die $!."($phain)";
        open (my $out_fh, '>', $phaout) or die $!."($phaout)";

        # drop the leading header lines
        foreach my $head_line (1..$mhc_head) {
            my $line = <$in_fh>;
            print "discard line $head_line\n";
        }

        # copy everything after the header unchanged
        while (my $line = <$in_fh>) {
            print {$out_fh} $line;
        }
        close ($in_fh);
        # a failed write must not be followed by the "done" marker below
        close ($out_fh) or die $!."($phaout)";

        &mysystem ("cp $mhc.markers $mhc_phased.markers.tmp");
        &mysystem ("mv $mhc_phased.markers.tmp $mhc_phased.markers");
    }

    my $mhc_imp2 = "$mhc_phased.hap.gz";
    unless (-e $mhc_imp2) {
        &mysystem ("beagle2impute $mhc_phased");
    }

    my $mhc_bim = "$mhc_phased.bim";
    unless (-e $mhc_bim) {
        # The conversion itself is switched off: the script stops here.
        # The call that used to follow the exit could never run:
        #   &mysystem ("$beagle2plink_script $mhc_phased");
        print "Error: $beagle2plink_script needs famfile\n";
        exit;
    }

    my $mhc_info = "$mhc_phased.info_pos";
    unless (-e $mhc_info) {
        print "refinfo mhc\n";
        &mysystem ("refinfo --bgl $mhc_phased --annot $mhc_bim --bim");
    }

    my $suminfo = "infosum_pos";
    my $suminfo_n = "$suminfo.nsnps";
    my $suminfo_r = "$suminfo.reffiles";
#    my $suminfo_s = "$suminfo.sorted";

    print "summary files mhc\n";
#    unless (-e "$suminfo_s"){
#	&mysystem ("cat *.info_pos  | grep -v SNP > $suminfo");
#	&mysystem ("sort -k1,1 -u $suminfo > $suminfo_s.tmp");
#	&mysystem ("mv $suminfo_s.tmp $suminfo_s");
#    }

    # line counts of all info_pos files, written via a .tmp file
    unless (-e "$suminfo_n") {
        &mysystem ("wc -l *.info_pos > $suminfo_n.tmp");
        &mysystem ("mv $suminfo_n.tmp $suminfo_n");
    }

    # list of reference files: only the single mhc file
    unless (-e "$suminfo_r") {
        &mysystem ("ls $mhc_phased > $suminfo_r.tmp");
        &mysystem ("mv $suminfo_r.tmp $suminfo_r");
    }

    my $gema_file = "genetic_map_chr6_combined_b37.txt";

    unless (-e $gema_file) {
        &mysystem ("ln -s /fg/debakker_scratch/ripke/hapmap_ref/impute2_ref/1KG_Aug12/ALL_1000G_phase1integrated_v3_impute_macGT1/$gema_file .");
    }

#    unless (-e "sumfrq.eur.sorted"){
#	&mysystem ("cp $suminfo_s sumfrq.eur.sorted.tmp");
#	&mysystem ("mv sumfrq.eur.sorted.tmp sumfrq.eur.sorted");
#    }

    &reinvo_b ("everything-is-fine","everythingdone");
    exit;
}





###################################################################
#### continue with subdir
###################################################################



# Switch into the sub-chromosome working directory; everything below uses
# paths relative to it.
chdir ($subchr_dir) or die $!;

## Symlink the sumfrq files from the root directory: for each ancestry the
## overall file first, then one gzipped file per chromosome.  Links that
## already exist are left alone.
foreach my $pf_loc (@pop_fams) {

    my @frq_links = ("sumfrq.$pf_loc");
    foreach my $chr_loc ($chr_start..$chr_end) {
        push @frq_links, "sumfrq.$pf_loc.$chr_loc.gz";
    }

    foreach my $frq_link (@frq_links) {
        next if (-e $frq_link);
        &mysystem ("ln -s $rootdir/$frq_link .");
    }
}


    
#### first link
# Per chromosome: symlink the phased reference, its plink fileset
# (bim/bed/fam) and the genetic map from the root directory into the
# current directory.
foreach my $chr ($chr_start..$chr_end) {

    # file names come from the output template with XXX replaced
    (my $phased_name = $out_templ.$imp_phased_postfix) =~ s/XXX/$chr/g;
    (my $bgl_name = $out_templ.$imp_phased_postfix.".bgl") =~ s/XXX/$chr/g;

    print "phased: $phased_name\n";

    if (! -e $phased_name) {
        &mysystem ("ln -s $rootdir/$phased_name .");
    }

    # the .bim link stands in for the whole fileset
    if (! -e $bgl_name.".bim") {
        foreach my $plink_ext ("bim", "bed", "fam") {
            &mysystem ("ln -s $rootdir/$bgl_name.$plink_ext .");
        }
    }

    # use the b37 genetic map, falling back to b36; stop if neither exists
    my $gema_file = "genetic_map_chr${chr}_combined_b37.txt";
    if (! -e "$rootdir/$gema_file") {
        $gema_file = "genetic_map_chr${chr}_combined_b36.txt";
        if (! -e "$rootdir/$gema_file") {
            print "Error: $gema_file not existing\n";
            exit;
        }
    }

    if (! -e $gema_file) {
        &mysystem ("ln -s $rootdir/$gema_file .");
    }
}


# Command lists filled by the loop below, one list per job type.
my (@job_r2s, @job_b2i, @job_b2p, @job_preps);

# Per chromosome: queue the split of the phased reference when its ".done"
# marker is missing, then queue the follow-up commands for every sc*.phased
# file found in the current directory whose output is missing.
#
# example names seen in this directory:
#   my.1000GP_Phase3_chr22.impute.phased.bgl.fam
#   sc_my.1000GP_Phase3_chr1_000_003_1000.impute.phased.bgl
foreach my $chr ($chr_start..$chr_end) {

    (my $phased_name = $out_templ.$imp_phased_postfix) =~ s/XXX/$chr/g;
    (my $fam_name = $out_templ.$imp_phased_postfix.".bgl.fam") =~ s/XXX/$chr/g;

    # the trailing underscore keeps "chr1_" from matching "chr10_"
    my $chr_str = "chr$chr"."_";
    my @sc_files = `ls sc*$chr_str*.phased`;
    chomp (@sc_files);

    #### ref2subchr
    if (! -e "$phased_name.done") {
        push @job_r2s, "$ref2subchr2_script --scsize $scsize $phased_name";
    }

    foreach my $sc_file (@sc_files) {

        # info_pos preparation
        if (! -e "$sc_file.bgl.info_pos") {
            push @job_preps, "$prepare_hm_ref_2_script --not --noaffy $sc_file";
        }

        # beagle -> impute2 haplotypes
        if (! -e "$sc_file.bgl.hap.gz") {
            push @job_b2i, "$beagle2impute_script $sc_file.bgl";
        }

        # beagle -> plink, using the chromosome-wide fam file
        if (! -e "$sc_file.bgl.bim") {
            push @job_b2p, "$beagle2plink_script --fam $fam_name  $sc_file.bgl";
        }
    }
}



# Submit each non-empty command list as its own job array, in the order
# ref2sub, preps, b2i, b2p.  The settings are passed to send_jobarray
# through the $sja* variables and @sjaarray.

if (@job_r2s) {
    $sjaweek = 1;    # set for this job type only
    ($sjadir, $sjaname) = ($subchr_dir, "ref2sub");
    ($sjatime, $sjamem) = (2, 4000);
    @sjaarray = @job_r2s;

    &send_jobarray;
}

if (@job_preps) {
    ($sjadir, $sjaname) = ($subchr_dir, "preps");
    ($sjatime, $sjamem) = (1, 2000);
    @sjaarray = @job_preps;

    &send_jobarray;
}

if (@job_b2i) {
    ($sjadir, $sjaname) = ($subchr_dir, "b2i");
    ($sjatime, $sjamem) = (1, 2000);
    @sjaarray = @job_b2i;

    &send_jobarray;
}

if (@job_b2p) {
    ($sjadir, $sjaname) = ($subchr_dir, "b2p");
    ($sjatime, $sjamem) = (1, 2000);
    @sjaarray = @job_b2p;

    &send_jobarray;
}



    
# Summary files for the sub-chromosome directory; each one is written to a
# ".tmp" name and renamed, and is skipped when it already exists.
my $suminfo = "infosum_pos";
my ($suminfo_n, $suminfo_r) = ("$suminfo.nsnps", "$suminfo.reffiles");

# line counts of all info_pos files
if (! -e "$suminfo_n") {
    &mysystem ("wc -l *.info_pos > $suminfo_n.tmp");
    &mysystem ("mv $suminfo_n.tmp $suminfo_n");
}

# list of the sc_*.bgl reference files
if (! -e "$suminfo_r") {
    &mysystem ("ls sc_*.bgl > $suminfo_r.tmp");
    &mysystem ("mv $suminfo_r.tmp $suminfo_r");
}

#############################################################
## SUCCESSSS
#############################################################

# Submit the closing "finished" job array from the root directory, then end
# the script.  The placeholder entry "tmp" is appended to @sjaarray (the
# array is not reset here) before send_jobarray is called.
($sjadir, $sjaname) = ($rootdir, "finished");
($sjatime, $sjamem) = (1, 1000);
push @sjaarray, "tmp";

&send_jobarray;

exit;

