#!/usr/bin/perl
use strict;

my $version = "1.0.0";
my $progname = $0;
$progname =~ s!^.*/!!;
my $command_line = "$progname @ARGV";

my $walltime = 1;

my $info_txt = "";
my $rootdir = "";

my $disname = "";
my $popname = "eur";
my $job_bn_th = 1000;



#### version
## 11   with rep_qc2_14 (sex-check on SNPs after pre-geno)

#############################
# read config file
#############################

my $conf_file = $ENV{HOME}."/ricopili.conf";
my %conf = ();

die $!."($conf_file)" unless open FILE, "< $conf_file";
while (my $line = <FILE>){
    my @cells = split /\s+/, $line;
    $conf{$cells[0]} = $cells[1];
}
close FILE;

sub trans {
    my ($expr)=@_;
    unless (exists $conf{$expr}) {
	die "config file without entry: $expr\n";
    }
    $conf{$expr};
}

my $ploc = &trans("ploc");
my $homedir = &trans("home");
my $qloc = &trans("queue");
my $init = &trans("init");
my $email = &trans("email");
my $loloc = &trans("loloc");


my $mind=0.02; # per ID	
my $geno=0.02; # per SNP
my $maf_th=0.00; # per SNP
my $midi_th=0.02; # per SNP
my $pre_geno=0.05; # per SNP
my $mend_th=4; # per SNP
my $imend_th=10000; # per ID
my $Fhet_th=.2; # per ID
my $hwe_th=0.000001; 
my $hwe_th_ca=0.0000000001; 
my $withpna=0;
my $noassoc = 0;

##### help message
my $usage = "
Usage : $progname [options]

version: $version


 --help            print this text and exits
 --disease STRING  disease prefix
 --trio            for trio data.

 --popname         popname (eur, aam, or others)

 --prekno FILE    prekno-file
# --namefile STRING file containing files for datasets, will be created if not named


  --force1            do not exit if same fail, but do this only once

  --outname STRING  identifier for pipiline run (obligatory!!)

  --serial          no sending jobs to queue all in one run
                           -> usually only used for testing 

  --sleep INT       sleep for INT seconds, try this if you think there is a race condition
                           (hints: stops a different steps, --serial works)
                           try using 5 seconds first.

default thresholds:


   mind       $mind
   geno       $geno
   maf        $maf_th
   midi       $midi_th
   pre-geno   $pre_geno
   lmend      $mend_th
   imend      $imend_th
   Fhet_th    $Fhet_th
   hwe_th_co  $hwe_th
   hwe_th_ca  $hwe_th_ca
   withpna    $withpna


options to change these values:

  --mind NUM        include only IDs with missing-rate < NUM
  --geno NUM        include only SNPs with missing-rate < NUM
  --maf  NUM        include only SNPs with missing-rate < NUM
  --midi NUM        include only SNPs with missing-rate-difference < NUM
  --pre_geno NUM    include only SNPs with missing-rate < NUM (before ID filter)
  --imend INT       number of mendelian errors per ID
  --lmend INT       max number of mendelian errors per SNP
  --Fhet_th NUM     -NUM < FHET < NUM
  --hwe_th_co NUM   HWE_controls < NUM
  --hwe_th_ca NUM   HWE_cases < NUM
  --withpna 1       inlcude SNPs with p=NA (monomorph)


 created by Stephan Ripke 2011 at MGH, Boston, MA
 
";






my $prekno_file = "";

use Getopt::Long;
GetOptions( 

    "trio"=> \my $trio,
    "serial"=> \my $serial,
    "sleep=i"=> \my $sleep_sw,
    "help"=> \my $help,
    "disease=s"=> \$disname,
    "popname=s"=> \$popname,

    "prekno=s"=> \$prekno_file,
    "force1"=> \my $force1,
    "outname=s"=> \my $outname,


    "mind=s"=> \$mind,  # missing per ID threshhold
    "geno=s"=> \$geno,  # missing per SNP threshhold
    "maf=s"=> \$maf_th,  # MAF per SNP threshhold
    "midi=s"=> \$midi_th,  # missing per SNP threshhold
    "pre_geno=f"=> \$pre_geno,  # pre_geno threshold
    "imend=i"=> \$imend_th,  # mendel per ID threshhold
    "lmend=i"=> \$mend_th,  # mendel per SNP threshhold
    "Fhet_th=f"=> \$Fhet_th,  # Fhet threshold
    "hwe_th_co=f"=> \$hwe_th,
    "hwe_th_ca=f"=> \$hwe_th_ca,
    "withpna=i"=> \$withpna, # mono threshold





    );

if ($sleep_sw) {
  print "sleeping for $sleep_sw seconds (only use if suspect of race condition)\n";
  sleep ($sleep_sw);
}


if ($hwe_th == 0 || $hwe_th_ca == 0) {
    print "please do not use 0 for hwe thresholds, use 1.0e-300\n";
}

############################################################
## testing binaries
##############################################################
my @test_scripts;
my $plague_script = "plague_2";        ### my.pipeline_tar
my $qc_script = "rep_qc2_14";          ### my.pipeline_tar
my $tager_script = "id_tager_2";       ### my.pipeline_tar
my $link_script = "my.linksub";        ### my.pipeline_tar
my $excel_script = "txt2xls";          ### my.pipeline_tar
my $blue_script = "blueprint";         ### my.pipeline_tar
my $gwa_script = "gwa_plot_3";         ### my.pipeline_tar
my $shrink_script = "shrinkpdf";       ### my.pipeline_tar
my $plothist_script = "plothist2";      ### my.pipeline_tar
my $qqplot_script = "qqplot_5";        ### my.pipeline_tar
my $lahunt_script = "lahunt_9";        ### my.pipeline_tar
my $pdflatex_script = "pdflatex";      ### my.pipeline_tar
my $mystart_script = "my.start_job";   ### my.pipeline_tar

push @test_scripts,  $plague_script;
push @test_scripts,  $qc_script;
push @test_scripts,  $tager_script;
push @test_scripts,  $link_script;
push @test_scripts,  $excel_script;
push @test_scripts,  $blue_script;
push @test_scripts,  $gwa_script;
push @test_scripts,  $shrink_script;
push @test_scripts,  $plothist_script;
push @test_scripts,  $qqplot_script;
push @test_scripts,  $lahunt_script;
push @test_scripts,  $pdflatex_script;
push @test_scripts,  $mystart_script;

my $mutt_script = "mutt";

print ".......testing necessary binaries....\n";
my @miss_scripts;


#my $err_scr = 0;
foreach my $scr_name (@test_scripts) {
    my $scr_path = '';
    
    for my $path ( split /:/, $ENV{PATH} ) {
	if ( -f "$path/$scr_name" && -x _ ) {
	    print "$scr_name\tfound in $path\n";
	    $scr_path = "$path/$scr_name";
	    last;
	}
    }
    if ( $scr_path eq  '') {
	push @miss_scripts, "cp /home/unix/sripke/bin/$scr_name ./\n";
	print "!!Error!! : No $scr_name command available\n" ;
    }
 
}



if (@miss_scripts > 0) {
  if (-e "get_scripts_on_broad.txt") {
    print "please remove this file and restart: get_scripts_on_broad.txt\n";
  }
  die $! unless open FILE1, "> get_scripts_on_broad.txt";
  foreach (@miss_scripts) {
    print FILE1 "$_";
  }
  close FILE1;


  print "exiting now -> have a look at get_scripts_on_broad.txt\n";
  exit;

}








print ".......testing email program....\n";

my $err_scr = 0;
{
    my $scr_path = '';
    
    for my $path ( split /:/, $ENV{PATH} ) {
	if ( -f "$path/$mutt_script" && -x _ ) {
	    print "$mutt_script\tfound in $path\n";
	    $scr_path = "$path/$mutt_script";
	    last;
	}
    }
    unless ( $scr_path ) {

	print "!!Warning!! : No $mutt_script command available, trying mail\n" ;

	$mutt_script = "mail";
	for my $path ( split /:/, $ENV{PATH} ) {
	    if ( -f "$path/$mutt_script" && -x _ ) {
		print "$mutt_script\tfound in $path\n";
		$scr_path = "$path/$mutt_script";
		last;
	    }
	}
	unless ( $scr_path ) {
	    $err_scr = 1;
	    print "!!Error!! : No $mutt_script command available\n" ;
	}
    }
 
}
die if $err_scr == 1;


print "....all necessary binaries found....\n";
print "------------------------------------\n";


#####################################
# "testing environment variable rp_perlpackages
####################################

print "testing environment variable rp_perlpackages....\n";
unless (exists $ENV{rp_perlpackages}) {
    print "Error: no environment variable for perl-packages, please re-install ricopili and make sure to follow all instructions\n";
    print "------------------------------------\n";
    exit;
}
print "....all set....\n";
print "------------------------------------\n";








die $usage if $help;
die $usage if $disname eq "";

unless ($outname) {
    print "#################################\nplease give name for thie pipeline run: --outname SOMETHING\n";
    exit;
}

if (length($disname)!=3) {
    print "Error: please use only three_letter disease\n";
    exit;
}

if (length($init)!=2) {
    print "Error: please use only two_letter initials\n";
    exit;
}

if ($init eq "NA") {
    print "Error: please name your own initials in config file: $conf_file\n";
    exit;
}


my $prekno_txt = "";
unless ($prekno_file eq "") {
    $prekno_txt = "--prekno $prekno_file";
}

my $trio_txt = "";
$trio_txt = "--trio" if ($trio);

my $tdt_txt = "";
$tdt_txt = "--tdt" if ($trio);



#push @scripts,"id_tager_3";
##################################################################
#exit;

#####################################
# print array to file
####################################

sub a2file {
    my ($file, @lines)=@_;
    die $! unless open FILE, "> $file";
    foreach (@lines){
	print FILE $_;
    }
    close FILE;
}


###################################################
###  system call with test if successfull
###################################################
sub mysystem(){
    my ($systemstr)="@_";
    system($systemstr);
    my $status = ($? >> 8);
    die "$systemstr\n->system call failed: $status" if ($status != 0);
}


##########################################
# subroutine to split a plink-output-line
##########################################

sub split_line {
    my ($line)=@_;
    chomp($line);
    $line =~ s/^[\s]+//g;
    my @cols=  split /\s+/, $line;
}



#####################################
# print array to file with newline
####################################

sub a2filenew {
    my ($file, @lines)=@_;
    die $! unless open FILE, "> $file";
    foreach (@lines){
	print FILE "$_\n";
    }
    close FILE;
}


#####################################
# append array to file with newline
####################################

sub a2filenew_app {
    my ($file, @lines)=@_;
    die "$!: $file" unless open FILE, ">> $file";
    foreach (@lines){
	print FILE "$_\n";
    }
    close FILE;
}

#####################################
# subroutine to count lines of a file
#####################################

sub count_lines {
    my ($file)=@_;
    my $lc=0;
    die "$file: ".$! unless open FILE, "< $file";
    while (<FILE>){
	$lc++;
    }
    close FILE;
    $lc;
}



#####################################
# subroutine to re-invoke this script
#####################################

sub reinvo_b {
    my ($message, $wt_file)=@_;
    my $now = localtime time;
    my $old_cmd = `tail -2 $loloc/preimp_dir_info | head -1`;

    my $end = 0;
    $end =1 if ($message eq "success");

    my $message_part = $info_txt."\t$message";
    $message = $info_txt."\t$message\t$now";

    &a2filenew_app("$loloc/preimp_dir_info",$message);
#    print "$message\n";
    if ($end==1){
#	print "yeag\n";
	exit;
    }

    die "2 times already" if ($old_cmd =~ /$message_part/);
    chdir "$rootdir" or die "something strange";
    if ($qloc eq "bsub") {
	$wt_file =~ s/.*blueprint_joblist_file-//;
    }

    &mysystem ("$blue_script -b \"$command_line\" --wa 2 --di -j --fwt $wt_file --na _pri_$disname");
    exit;

}




#####################################
# send jobs to cluster and also send navi again
#####################################

my $sjadir = "";
my $sjaweek = 0;
my $sjaname = "";
my $sjarow = "";
my @sjaarray;
my $sjamem = 0;
my $sjatime = -1;
my $sjamaxjobs = 30000;
my $sjainfofile = "$loloc/preimp_dir_info";
unless (-e $sjainfofile) {
    print "log-file ($sjainfofile) is not existing\n";
    print "please check loloc in ~/ricopili.conf\n";
    exit;
}
my $sjainfotxt = "";
my $sjamulti = 0;


sub send_jobarray {

    die "send_jobarray with undefined variables, dir" if ($sjadir eq "");
    die "send_jobarray with undefined variables, name" if ($sjaname eq "");
    die "send_jobarray with undefined variables, array" if (@sjaarray == 0);
    die "send_jobarray with undefined variables, mem" if ($sjamem == 0);
    die "send_jobarray with undefined variables, time" if ($sjatime < 0);
    die "send_jobarray with undefined variables, info" if ($sjainfotxt eq "");


    my $now = localtime time;
    $now =~ s/ /_/g;


    if ($sjaname eq "finished") {

	my $fini_message ;
	$fini_message .= "\n\n##################################################################\n";
	$fini_message .= "##### CONGRATULATIONS: \n";
	$fini_message .= "##### preimp_pipeline finished successfully:\n";
	$fini_message .= "##### $sjainfotxt\n";
	$fini_message .= "##### now start with imputation pipeline (see README in subdir qc/)\n";
	$fini_message .= "##### have a look at the wiki page\n"; 
	$fini_message .= "##### https://sites.google.com/a/broadinstitute.org/ricopili/\n";
	$fini_message .= "##################################################################\n";
	print "$fini_message\n";

	
	die $! unless open SUC, "> success_file";
	print SUC $fini_message."\n";
	close SUC;

	&mysystem ('cat success_file | '.$mutt_script.' -s RP_preimp_finished '.$email) ;

	my $sjarow      = $sjainfotxt."\t$sjaname\t$now";
	&a2filenew_app("$sjainfofile",$sjarow);


	exit;

    }





    chdir ($sjadir);
    my $jobfile = "$sjaname.job_list";
    while (-e $jobfile) {
	$jobfile .= ".s";
    }
    &a2filenew ($jobfile, @sjaarray);

#    $walltime = $sjatime;
    my $nsja = @sjaarray;

    my $nsja_loc = $nsja;
    if ($nsja_loc > 30000) {
	$nsja_loc = 30000;
    }

    my $multi_txt = "";
    if ($sjamulti > 0) {
	$multi_txt = "--multi $nsja_loc,$sjamulti";
    }

    ### with array

    my $sja_week_str = "";
    if ($sjaweek > 0) {
	$sja_week_str = "--week 1";
    }



    if ($serial) {
	print "starting step $sjaname with ".@sjaarray." jobs\n";
	print "please be patient.\n";
	my $jc = 1;
	foreach (@sjaarray) {
	    print "running job $jc...\n";
	    &mysystem($_);
	    $jc++;
	    
	}
    }
    else {    
	my $sys_loc = "$blue_script $sja_week_str --noerr --njob $nsja_loc --array $jobfile --wa $sjatime --mem $sjamem --j --na $sjaname.$outname $multi_txt";
	print "$sys_loc\n";
	#    sleep(3);
	&mysystem ($sys_loc);
    }



    my $old_cmd = `tail -1 $sjainfofile | head -1`;

    my $sjacontent = "$sjaname.".@sjaarray;

    my $sjarow_part = $sjainfotxt."\t$sjacontent";
    my $sjarow      = $sjainfotxt."\t$sjacontent\t$now";
#    $message = $info_txt."\t$message\t$now";

    &a2filenew_app("$sjainfofile",$sjarow);

    if ($old_cmd =~ /$sjarow_part/){
	unless ($force1 ){
	    my $err_message ;
	    $err_message .= "##################################################################\n";
	    $err_message .= "##### Error: \n";
	    $err_message .= "##### step $sjaname has been done repeatedly without any progress\n";
	    $err_message .= "##### preimp pipeline stopped\n";

	    $err_message .= "##### $sjainfotxt\n";
	    $err_message .= "##### if reason does not appear obvious\n";
	    $err_message .= "##### have a look at the wiki page\n"; 
	    $err_message .= "##### https://sites.google.com/a/broadinstitute.org/ricopili/\n";
	    $err_message .= "##### or contact the developers\n";
	    $err_message .= "##################################################################\n";
	    print "$err_message\n";

	    die $! unless open ERR, "> error_file";
	    print ERR $err_message."\n";
	    close ERR;


	    &mysystem ('cat error_file | '.$mutt_script.' -s RP_preimp_error '.$email) ;

	    unless ($serial) {
		exit;
	    }
	}

    }


    $command_line =~ s/--force1//;


    my $wt_file = "$sjadir/blueprint_joblist_file-$sjaname.$outname";
    chdir "$rootdir" or die "something strange";
    if ($qloc eq "bsub" || $qloc eq "msub") {
	$wt_file =~ s/.*blueprint_joblist_file-//;
    }
    if ($qloc eq "slurm") {
	$wt_file = "$sjadir/$jobfile.script.id";
    }
    if ($qloc eq "qsub") {
	$wt_file = "$sjadir/j.$sjaname.$outname.id";
    }
    if ($qloc eq "qsub_c") {
	$wt_file = "$sjadir/j.$sjaname.$outname.id";
    }
    if ($qloc eq "qsub_b") {
	$wt_file = "$sjadir/j.$sjaname.$outname.id";
    }
    


    if ($serial) {
	my $sys_re = "$command_line";
	&mysystem ($sys_re);
	exit;
    }
    else {
	my $sys_re = "$blue_script --njob $job_bn_th -b \"$command_line\" --wa 4 --di -j --fwt $wt_file --na _pr_$outname";
	print "$sys_re\n";
	&mysystem ($sys_re);
    }
    



    print "------------------------------------------------------------\n";
    print "$nsja jobs successfully submitted\n";
    print "please see tail of $sjainfofile for regular updates\n";
    print "also check bjobs -w for running jobs\n";
    print "possibly differnt command on different computer cluster: e.g. qstat -u USER\n";
    print "you will be informed via email if errors or successes occur\n";
    print "------------------------------------------------------------\n";

    exit;



}






##############################################
##############################################
#############  BEGIN
##############################################
##############################################



use Cwd;
use File::Path;
$rootdir = &Cwd::cwd();
$sjainfotxt = "$rootdir\t$command_line";


$info_txt = "command:\t\"$command_line\"\tdir:\t$rootdir";



my @files = ();
opendir(DIR, ".") || die "can't opendir .: $!";
@files = readdir(DIR);
closedir DIR;

my $namefile = "$disname.names";


### read bim-files
my @bim_files = grep {/bim$/} @files;
my @bim_files = grep {!/-qc.bim$/} @bim_files;


#print "bim: @files\n";
#print "bim: @bim_files\n";
#exit;



my %name_hash;
my @name_arr;
#my @bim_tmp;

##############################################
#### create namefile-template
##############################
unless (-e $namefile) {
    die $! unless open FILE, "> $namefile";
    my @qc_arr;
    my $cc = 1;
    foreach (@bim_files) {
	my $bprefix = $_;
	$bprefix =~ s/.bim$//;
	print FILE "$disname$cc $bprefix\n";
	$cc++;
    }
    close FILE;
    print "please edit $namefile\n";
    exit;
}
else {
    die "$!:$namefile" unless open NF, "< $namefile";   
    while (<NF>) {
	my @cells = &split_line($_);
	$name_hash{$cells[1]} = $cells[0];
	push @name_arr, $cells[0];
    }
    close NF;
}
#exit;
#print "bim: @bim_files\n";
#exit;

##############################################
#### clean bimfile - list
##############################

my @bim_files_clean;

foreach my $bf (@bim_files) {
    my $bimprefix = $bf;
    my $bprefix = $bf;
    $bprefix =~ s/.bim$//;
    unless (exists $name_hash{$bprefix}){
	print "$bprefix does not have a name and will not go on further\n"; 
    }
    else {
	push @bim_files_clean, $bimprefix ;
    }
}

@bim_files = @bim_files_clean;

my $qc_dir= "qc";

my @created = mkpath(   ## $created ?
			$qc_dir,
			{verbose => 0, mode => 0750},
    );
	


#############################################
#### check famfiles
#############################################
print "\n";
foreach my $bf (@bim_files) {
    my $bprefix = $bf;
    $bprefix =~ s/.bim$//;
    my $famfile = "$bprefix.fam";
    if (-e "$famfile.nocaco")  {
	&mysystem ("rm $famfile.nocaco");
    }
#    print "\ntesting $famfile\n";

    my %count_pt;
    my %id_hash;
    my $dup_id=0;
    my @dedup_arr;

    die "$!:$famfile" unless open FF, "< $famfile";   
    while (<FF>) {
	my @cells = &split_line($_);
	$count_pt{$cells[5]}++;

	### check for existing IDs
	my $id = "$cells[0]\t$cells[1]";
	while (exists $id_hash{$id}) {
	    $id .= ".d";
	    print "$id\n";
	    $dup_id = 1;
	}
	$id_hash{$id} = 1;
	push @dedup_arr, "$id\t$cells[2]\t$cells[3]\t$cells[4]\t$cells[5]";

    }
    close FF;

    if ($dup_id == 1) {

	die "$!:$famfile.noov" unless open FO, "> $famfile.noov";   
	foreach (@dedup_arr) {
	    print FO $_."\n";
	}

	close FO;
	print "***************************************\n";
	print "Error: found duplicated IDs in $famfile\n";
	print "either fix it manually or copy over prepared file with these two commands\n";
	print "\ncp $famfile $famfile.bak\n";
	print "cp $famfile.noov $famfile\n";
	print "\nthen rerun pipeline\n";
	print "***************************************\n";
	exit;
    }

#    foreach (keys %count_pt) {
#	print "phenotype $_, N = $count_pt{$_}\n";
#    }

    if ($count_pt{1} > 10 && $count_pt{2} > 10){
#	print "more than 10 cases and more than 10 controls, so we are good\n";
    }
    else {
#	print "less than 10 cases and/or less than 10 controls, so keep as monomorphic phenotype\n\n";
	print "\nWarning:\n";
	print "--------\n";
	print "$famfile: less than 10 cases and/or less than 10 controls, so will do only restricted QC (without association)\n\n";
	&mysystem ("touch $famfile.nocaco");

#	$noassoc = 1;
    }


}

#exit;


#############################################
#### plague
#############################################

unless (-e "$disname.tag.done") {
    my @pla_arr;
    my %platform_hash;
    foreach my $bf (@bim_files) {
	my $bimprefix = $bf;
	my $bprefix = $bf;
	$bprefix =~ s/.bim$//;
	unless (-e "$bimprefix.plague") {
	    push @pla_arr, "$plague_script $bimprefix > $bimprefix.plague.tmp; mv $bimprefix.plague.tmp $bimprefix.plague" ;
	}
	else {
	    my $platform = "";
	    my $sumper_max = 0;
	    die "$!:$bimprefix.plague" unless open PL, "< $bimprefix.plague";   
	    while (<PL>) {
		my @cells = &split_line($_);
		my $sumper = $cells[7] + $cells[14];
#	    print "$sumper\t$cells[7]\t$cells[14]\n";
		if ($sumper > $sumper_max) {
		    $platform = $cells[2];
		    $sumper_max = $sumper;
		}
	    }
	    close PL;
	    $platform =~ s/.*_//;
	    $platform_hash{$bprefix} = $platform;
	    if ($platform eq "") {
		print "Error, something went wrong with plague\n";
		print "please remove $bimprefix.plague and restart pipeline\n";
	    }
	    else {
		print "$bimprefix platform: <$platform>\n";
	    }
#	    print "s:$sumper_max\tp:$platform\tb:$bimprefix\n";
#	    exit;
	}
    }
    




    
    if (@pla_arr > 0) {


	$sjadir = $rootdir;
	$sjaname = "plague";
	$sjatime = 1;
	$sjamem = 2000;
	@sjaarray = @pla_arr;
	
	&send_jobarray;


#	&a2filenew ("plague_job_list", @pla_arr);
#	my $nda = @pla_arr;
#	&mysystem ("cat plague_job_list | $blue_script -b \"prefix\" --wa 1 -j --na plague_$disname");
#	&reinvo_b ("plague_$nda","blueprint_joblist_file-plague_$disname");
    }
    
    
    
    
#############################################
#### id_tager
#############################################
    
#id_tager --create --nn ihac_us2_eur_A60 --cn ihac_us2_eur IHAC.Affy.Amer.C2.v3.MERGED.KEEPERS.QC.fam
    
#print "bim: @bim_files\n";
#exit;
    
    my @it_arr;
    foreach (@bim_files) {
#    print "bim: $_\n";
	my $bprefix = $_;
	$bprefix =~ s/.bim$//;
	die "$bprefix does not have a name" unless (exists $name_hash{$bprefix});
	die "$bprefix does not have a platform" unless (exists $platform_hash{$bprefix});
	my $nn = "$disname";
	$nn .= "_".$name_hash{$bprefix};
	$nn .= "_".$popname;
	$nn .= "_".$init;
	my $cname = $nn;
	$nn .= "_".$platform_hash{$bprefix};
	push @it_arr, "$tager_script $trio_txt --create --nn $nn --cn $cname $bprefix.fam" unless (-e "$qc_dir/$cname.bim");

#	print "test $bprefix: $bprefix.fam.nocaco\n";
#	sleep(2);
	if (-e "$bprefix.fam.nocaco") {
	    &mysystem ("touch $cname.nocaco");
	}
    }
    
    
    if (@it_arr > 0) {
	&a2filenew ("it_job_list", @it_arr);
#    exit;
	foreach my $it_sys (@it_arr) {
	    &mysystem ($it_sys);
	}
#	my $nda = @it_arr;
#	&mysystem ("cat it_job_list | $blue_script -b \"prefix\" --start -j ");
	&mysystem ("touch $disname.tag.done");
    }
}

chdir ($qc_dir);


my @bfiles_pi = ();
foreach my $name (@name_arr) {

    my $nn = "$disname";
    $nn .= "_".$name;
    $nn .= "_".$popname;
    $nn .= "_".$init;
    my $cname = $nn;
#    print $cname."\n";
    die "$cname.bim is not existing, maybe remove $disname.tag.done" unless (-e $cname.".bim");
    push @bfiles_pi, $cname;
#    $nn .= "_".$platform_hash{$bprefix};
#    push @it_arr, "id_tager --create --nn $nn --cn $cname $bprefix.fam" unless (-e "$cname.bim");
}
#exit;



##############################################
#### QC
##############################
my @qc_arr;
my $qc_fini = 0;
my @meta_files;
print "\n------------------------\n";
print "starting QC scripts\n";
foreach my $bprefix (@bfiles_pi) {
#    print "$bprefix-qc.bim\n";

#    if ($trio) {
#	unless (-e "$bprefix-qc.bim") {
#	    push @qc_arr, "$qc_script $tdt_txt $prekno_txt $bprefix" ;
#	}
#	else {
#	    push @meta_files, "$bprefix-qc.meta" ;
#	}
 #   }
  #  else {
    my $option_txt;

#    print "$bprefix: $rootdir/$bprefix.nocaco\n";
#    sleep(5);
    if (-e "$rootdir/$bprefix.nocaco") {
	print "$bprefix with limited QC (no association)\n";
	$option_txt .= " --noassoc";
    }
    $option_txt .= " --mind $mind";
    $option_txt .= " --geno $geno";
    $option_txt .= " --maf $maf_th";
    $option_txt .= " --midi $midi_th";
    $option_txt .= " --pre_geno $pre_geno";
    $option_txt .= " --imend $imend_th";
    $option_txt .= " --lmend $mend_th";
    $option_txt .= " --Fhet_th $Fhet_th";
    $option_txt .= " --hwe_th_co $hwe_th";
    $option_txt .= " --hwe_th_ca $hwe_th_ca";
    $option_txt .= " --withpna $withpna";


    unless (-e "$bprefix-qc.fini") {
	push @qc_arr, "$qc_script $option_txt $tdt_txt $prekno_txt $bprefix" ;
    }
    else {
	push @meta_files, "$bprefix-qc.meta" ;
	$qc_fini++;
    }
   # }
}
#exit;


if (@qc_arr > 0) {


    $sjadir = "$rootdir/$qc_dir";
    $sjaname = "qc";
    $sjatime = 2;
    $sjatime = 4 if ($qc_fini > 0);
    $sjamem = 2000;
    @sjaarray = @qc_arr;
    
    &send_jobarray;



#    &a2filenew ("qc_job_list", @qc_arr);
#    my $nda = @qc_arr;
#    &mysystem ("cat qc_job_list | $blue_script -b \"prefix\" --mem 3000 --i 8,1 --wa 4 -j --na qc_$disname");
#    &reinvo_b ("qc_$nda","$qc_dir/blueprint_joblist_file-qc_$disname");
}


##############################################
#### prepare imputation
##############################
my $imp_dir= "imputation";	


my @created = mkpath(   ## $created ?
			$imp_dir,
			{verbose => 0, mode => 0750},
    );

chdir ($imp_dir);


foreach my $bprefix (@bfiles_pi) {
    &mysystem ("$link_script ../$bprefix-qc.bed") unless (-e "$bprefix-qc.bed");
    &mysystem ("$link_script ../$bprefix-qc.bim") unless (-e "$bprefix-qc.bim");
    &mysystem ("$link_script ../$bprefix-qc.fam") unless (-e "$bprefix-qc.fam");
}

unless (-e "README") {
    die $!." <README>" unless open RD, "> README";
    print RD "standard imputation start:\n";
    print RD "impute_dirsub_[VERSION] --phase 91 --out OUTNAME\n";
    close RD;
}


#&reinvo_b ("success","blueprint_joblist_file-plague_$disname");

#############################################################
## SUCCESSSS
#############################################################

$sjadir = $rootdir;
$sjaname = "finished";
push @sjaarray, "tmp";
$sjatime = 1;
$sjamem = 1000;

    
&send_jobarray;


exit;

##############################################
#### meta
##############################


my @ol;
my @olt;
my $mf = $meta_files[0];
die $!." <$mf>" unless open IN, "< $mf";
while (my $line = <IN>){
    my @cells = &split_line($line);
    push @ol, "$cells[0]";
    $olt[0] .= "\t$cells[0]";
}
close IN;

#print "@meta_files\n";
my $olt_c = 1;
foreach my $mf (@meta_files) {
    my $cc = 0;
    die $!." <$mf>" unless open IN, "< $mf";
    while (my $line = <IN>){
	my @cells = &split_line($line);
	$ol[$cc] .= "\t$cells[1]";

	$cc++;
	$olt[$olt_c] .= "\t$cells[1]";
    }
    close IN;
    $olt_c++;
}


die $! unless open OUT, "> $disname.metaqc";
foreach (@ol) {
    print OUT "$_\n";
}
close OUT;

die $! unless open OUT, "> $disname.metaqc.t";
foreach (@olt) {
    print OUT "$_\n";
}
close OUT;

&mysystem("$excel_script --cogr 1,2,4,8,12,14,17,23,25,28,31,34 --pcol 31,32,33 --txt $disname.metaqc.t  --xls $disname.metaqc.t.xls");
&mysystem("echo $excel_script --cogr 1,2,4,8,12,14,17,23,25,28,31,34 --pcol 31,32,33 --txt $disname.metaqc.t  --xls $disname.metaqc.t.xls > start_xls");



my $qc_txt = '

i. Cohorts and samples
Four wave collections with genome-wide SNP genotype data were used in this analysis (Supplementary Table 1, tab 1), containing a total of 5342 cases and 6509 controls. Three different chips were used: two produced by Affymetrix (the GeneChip Human Mapping 500K Array and the Genome-Wide Human SNP Array 6.0) and one produced by Illumina (Omni-Express). 

ii. QC and imputation
Technical quality control was performed on genotypes gen¬erated by various GWAS platforms, with quality control conducted on each dataset separately using a common approach. The following quality control parameters were applied: (i) missing rate per SNP < 0.05 (before sample removal below), (ii) missing rate per individual < 0.02, (iii) heterozygosity per individual (Fhet) +/- 0.2, (iv) missing rate per SNP < 0.02 (after sample removal above), (iv) missing rate per SNP difference in cases and controls < 0.02, (vi) Hardy-Weinberg equilibrium (controls) P < 10−6, (vii) Hardy-Weinberg equilibrium (cases) P < 10−10. Study sample sizes after quality control varied between 435 and 4,418 individuals (Supplementary Table 1, tab 1). The number of SNPs per study after quality control varied between 390,000 and 750,000. On average, the quality control processes excluded 67 individuals per study (with a range of 29–163 individuals) and 116,000 SNPs per study (with a range of 91,000–159,000 SNPs). These numbers are slightly higher than usually observed in meta-analysis, since we decided to start with completely uncleaned genotype-datasets.

After quality control, the GWAS datasets together comprised 11,584 individuals and, for the next steps of the ‘genetic quality control’ analysis, a set of 77,986 SNPs common to all platforms and successfully genotyped in each GWAS sample was extracted. These SNPs were then further pruned to remove LD (leaving no pairs with r2 > 0.05) and lower frequency SNPs (minor allele frequency < 0.05), leaving 39,239 SNPs suitable for robust relatedness testing and population structure analysis (see below).

Imputation of untyped SNPs was performed within each study with prephasing/imputation stepwise approach. We used Impute2/Shapeit (http://mathgen.stats.ox.ac.uk/impute/impute_v2.html, http://www.shapeit.fr/). Imputation was performed with the complete world-wid HapMap phase 3 data (http://mathgen.stats.ox.ac.uk/impute/data_download_hapmap3_r2.html), containing 2,022 haplotypes, using a chunk size of 5Mb using default parameters. λ was carefully monitored before and after imputation. 

Genetic quality control included relatedness test¬ing and principal components analyses based on 39,239 LD independent SNPs, present on all platforms in this study. Relatedness testing was done with PLINK [PMID: 17701901], reporting pairs with genome identity (pi-hat) > 0.9 as ‘identical samples’ and with pi-hat > 0.2 as being closely related. After random shuffling, one individual from each related pair was excluded from downstream analysis. From groups with multiple related pairs (for example, a family), only one individual was kept. 

Principal component estimation was done with the same collection of SNPs on the non-related subset of individuals. We estimated the first 20 principal components and tested each of them for phenotype association (using logistic regression with study indicator variables included as covariates) and evalu¬ated their impact on the genome-wide test statistics using λ (the genomic control inflation factor based on the median χ2) after genome-wide association of the specified principal component. Based on this we decided which principal components to include PCA 1,2,3,4,5,6,8,10,15,19 and 20 for downstream analysis as associated covariates. 

iv. Association analysis


Association testing was carried out in PLINK, using the dosage data from the imputation and using 11 principal components as covariates, chosen as described above from the first 20 principal components. The whole genome scan had a genomic inflation (λGC) value of 1.13 (Supplementary Figure 2).



';


exit;





