#!/bin/bash

# Earlier getopts-based attempt, kept for reference:
#while getopts -g:-w:-c:-p:-k: option

#######################################
# Parse the command-line flags into the globals used by the rest of the script.
# Globals written:
#   GENO     (-g)  WORKING (-w)  COHORT (-c)  RETAIN (-k)  PLINKBIN (-p)
# Arguments:
#   The script's positional parameters ("$@").
# Notes:
#   Each recognised flag consumes the following word as its value (empty when
#   the flag is the last word). Unrecognised words are skipped on their own.
#######################################
parse_args() {
  local opt
  while [[ $# -gt 0 ]]; do
    opt="$1"
    shift
    case "$opt" in
      "-g") GENO="${1-}" ;;
      "-w") WORKING="${1-}" ;;
      "-c") COHORT="${1-}" ;;
      "-k") RETAIN="${1-}" ;;
      "-p") PLINKBIN="${1-}" ;;
      *) continue ;;
    esac
    # Drop the value that was just stored, if there was one.
    if [[ $# -gt 0 ]]; then
      shift
    fi
  done
}

parse_args "$@"

#############################################################################################
# This script creates checksums to allow checking of relatedness without the exchange       #
# of raw data. It was created to facilitate collaboration between the ENIGMA and PGC        #
# Consortia.                                                                                #
#                                                                                           #
# Created: Sarah Medland                                         #
# Last edit: Sarah Medland (28 Jan 2014)                                                    #
#                                                                                           #
#############################################################################################

# Print the description / usage banner followed by the value received for
# each flag. The here-document expands variables just like the double-quoted
# string it replaces; \$ keeps a literal dollar sign.
cat <<EOF
#############################################################################################

This script creates checksums to allow checking of relatedness without the exchange of raw data. 
It was created to facilitate colaboration between the ENIGMA and PGC Consortia. 
                                                                                           
 Created: Sarah Medland (ENIGMA) & Ben Neale (PGC)                                         
 Last edit: Sarah Medland (26 Jan 2014)                                                    
                                                                                           
 The first 3 arguments are required - the last 2 are optional
   -g is the prefix and location of the plink genotype files  
        - this will be passed to a bfile command. The suffixes (.bed/.bim/.fam) are not required
        - if these are stored in the pwd just type the file name                           
   -w is the location of the SNP and Allele lists - this will be your working folder
       - if these are stored in the pwd type \${PWD}                           
       - the checksums will be created in a subfolder (checkSum_\$COHORT) within this dirrectory	   
   -c is the cohort name prefix eg QTIM     

   -k  is the name and location of the list of people (FID and IID) to include in the analyses 
       - if this file is in the pwd just type the file name
       - eg ${PWD}/QTIM.txt
   -p allows you to specify the path to the plink binary 
       - include the name of the binary here
       - eg /home/sarah/bin/plink64_1.07
       

#############################################################################################

 The arguments you have entered are as follows:
   -g is $GENO
   -w is $WORKING
   -c is $COHORT

   -k is $RETAIN
   -p is $PLINKBIN

#############################################################################################


EOF


echo "Checking arguments and files"

## Checking $GENO: the -g prefix must be supplied and its .bed/.bim/.fam files
## must exist. Every failure exits with status 1 so that a calling script or
## job scheduler can tell the run did not succeed.
if [[ -z "$GENO" ]]; then
  echo "
  ERROR exiting now: Please check you have specified the location of the raw data using the -g flag "
  exit 1
fi

# Checking files exist. The path is quoted so that a prefix containing spaces
# is tested as a single file name instead of breaking the test.
for suffix in bed bim fam; do
  if [[ ! -f "$GENO.$suffix" ]]; then
    echo "
  ERROR exiting now: The $suffix file $GENO.$suffix for the raw genotypes could not be found in $GENO 
  Please check the location of this file"
    exit 1
  fi
  echo "
  . The $suffix file $GENO.$suffix will be used in these analyses"
done

## Checking $WORKING: the -w folder must be supplied and must contain the 130
## numbered SNP / Allele lists plus SNP.list. Failures exit with status 1.
if [[ -z "$WORKING" ]]; then
  # \$WORKING is printed literally here: the variable is empty in this branch.
  echo "
  ERROR exiting now: Please check you have specified the location of the SNP and Allele lists using the -w flag
  These files were provided in a zipfile and \$WORKING is meant to be the path of the folder where they were unzipped - if these are stored in the pwd type \${PWD} 
  "
  exit 1
fi

# Checking the SNP and Allele files are there
for ((i = 1; i <= 130; i++)); do
  if [[ ! -f "$WORKING/${i}_SNPs" ]]; then
    echo "
  ERROR exiting now: The file ${i}_SNPs  could not be found in $WORKING please check the location of these files
  (these files were provided in a zipfile and $WORKING is meant to be the path of the folder where they were unzipped)"
    exit 1
  fi
  if [[ ! -f "$WORKING/${i}_Alleles" ]]; then
    # The message names the file that was actually tested (<i>_Alleles).
    echo "
  ERROR exiting now: The file ${i}_Alleles could not be found in $WORKING please check the location of these files
  (these files were provided in a zipfile and $WORKING is meant to be the path of the folder where they were unzipped)"
    exit 1
  fi
done

if [[ ! -f "$WORKING/SNP.list" ]]; then
  echo "
  ERROR exiting now: The file SNP.list could not be found in $WORKING please check the location of these files
  (these files were provided in a zipfile and $WORKING is meant to be the path of the folder where they were unzipped)"
  exit 1
fi

# Checking that $WORKING is writable by creating a scratch file in it.
# The status of the write itself is tested, so a write.temp left behind by an
# earlier run cannot make a read-only folder look writable.
if ! echo "checking $WORKING is writable" > "$WORKING/write.temp" \
  || [[ ! -f "$WORKING/write.temp" ]]; then
  echo "
  ERROR exiting now: Please check that you have read write permissions to $WORKING "
  exit 1
fi
# The scratch file has served its purpose; do not leave it in the folder.
rm -f -- "$WORKING/write.temp"

## Checking $COHORT: a cohort name is required (it names the output folder
## and files). Exits with status 1 when it is missing.
if [[ -z "$COHORT" ]]; then
  echo "
  ERROR exiting now: Please check you have supplied a cohort name using the -c flag"
  exit 1
fi
## Report on the optional -k flag: tell the user whether a --keep list will
## be passed to plink.
if [[ -n "$RETAIN" ]]; then
  printf '%s\n' "

NOTE: You have chosen to supply an id list using the -k flag - a keep command will be added to the plink analyses "
else
  printf '%s\n' "

NOTE: You have chosen not to supply an id list using the -k flag - a keep command will not be added to the plink analyses"
fi


## Report on the optional -p flag: tell the user whether an explicit plink
## binary was given or plink is expected to be found on the PATH.
if [[ -n "$PLINKBIN" ]]; then
  printf '%s\n' "

NOTE: You have chosen to specify the path to the plink binary using the -p flag. 

"
else
  printf '%s\n' "

NOTE: You have chosen not to specify the path to the plink binary using the -p flag - it is assumed to be installed in a dirrectory listed within your path

"
fi




echo "#############################################################################################"

echo "Making the dirrectory $WORKING/checkSum_$COHORT to hold the checksums
"

# Everything below runs from inside $WORKING. Paths typed relative to the
# folder the script was launched from (the usage text invites "just type the
# file name") would stop resolving after the cd, so anchor them first.
launch_dir="$PWD"
if [[ -n "$GENO" && "$GENO" != /* ]]; then
  GENO="$launch_dir/$GENO"
fi
if [[ -n "$RETAIN" && "$RETAIN" != /* ]]; then
  RETAIN="$launch_dir/$RETAIN"
fi
# A bare command name (no slash) is left alone so it is still found via PATH.
if [[ "$PLINKBIN" == */* && "$PLINKBIN" != /* ]]; then
  PLINKBIN="$launch_dir/$PLINKBIN"
fi
if [[ -n "$WORKING" && "$WORKING" != /* ]]; then
  WORKING="$launch_dir/$WORKING"
fi

if ! cd "$WORKING"; then
  echo "
  ERROR exiting now: Could not change into the working folder $WORKING"
  exit 1
fi

# -p: re-running the script with an existing output folder is not an error.
if ! mkdir -p "checkSum_$COHORT"; then
  echo "
  ERROR exiting now: Could not create the folder $WORKING/checkSum_$COHORT"
  exit 1
fi

echo "Extracting the SNPs that will be used to make checksums...
"

# Build the plink call once instead of repeating it for every combination of
# the optional -p and -k flags.
#   plink_cmd  - the binary given with -p, or "plink" looked up on the PATH
#   keep_args  - "--keep <file>" when an id list was given with -k, else empty
plink_cmd="${PLINKBIN:-plink}"
keep_args=()
if [[ -n "$RETAIN" ]]; then
  keep_args=(--keep "$RETAIN")
fi
extract_prefix="checkSum_$COHORT/snps_for_checksum.$COHORT"

"$plink_cmd" --noweb --bfile "$GENO" --extract SNP.list --make-bed \
  --out "$extract_prefix" "${keep_args[@]}"

# Stop here if plink produced nothing, rather than failing later inside the
# checksum loop with a less direct message.
if [[ ! -f "$extract_prefix.bed" ]]; then
  echo "
  ERROR exiting now: The file $extract_prefix.bed was not created - Please check the $extract_prefix.log file for details"
  exit 1
fi


#{                    
#if [ -z "$RETAIN" ]
#  then
#     plink --noweb --bfile $GENO --extract SNP.list --make-bed --out checkSum_$COHORT/snps_for_checksum.$COHORT   ; 
#  else 
#     plink --noweb --bfile $GENO --extract SNP.list --make-bed --out checkSum_$COHORT/snps_for_checksum.$COHORT  --keep $RETAIN 
#fi               
#}

echo "#############################################################################################"

echo "
Making checksums..."

# The binary given with -p, or "plink" looked up on the PATH.
plink_cmd="${PLINKBIN:-plink}"
out_dir="checkSum_$COHORT"

# snp extraction and check sum calculation
for ((i = 1; i <= 130; i++)); do
#for ((i=1; i<=3; i++))

  echo "
Working on list $i of 130"

  # extracts snps
  # The script has already changed into $WORKING (the output paths here are
  # relative to it), so the list files are named relative to the current
  # folder; prefixing a relative $WORKING again would point at the wrong place.
  "$plink_cmd" --noweb --bfile "$out_dir/snps_for_checksum.$COHORT" --recodeA \
    --extract "${i}_SNPs" --recode-allele "${i}_Alleles" \
    --out "$out_dir/CheckSum.$i.$COHORT" >> "$out_dir/checksum_extraction.logs"

  if [[ ! -f "$out_dir/CheckSum.$i.$COHORT.raw" ]]; then
    echo "
  ERROR exiting now: The file CheckSum.$i.$COHORT.raw  was not created - Please check the $WORKING/checkSum_$COHORT/CheckSum.$i.$COHORT.log file for details"
    # Non-zero status so that callers can detect the failure.
    exit 1
  fi

  #creates check sums
  # 1. sed 's/ //3g'  - keep the first two spaces of each row and delete the
  #                     rest, giving "<FID> <IID> <all remaining columns joined>".
  # 2. grep           - drop the header row only. It is matched at the start
  #                     of the line so that samples whose ids merely contain
  #                     the text "FID" are no longer discarded.
  # 3. sed -r ... /e  - run md5sum on the joined columns of every row and
  #                     emit "<FID> :::<IID> <md5> ".
  # 4. sed            - turn " :::" into ":" giving "<FID>:<IID> <md5> ".
  # NOTE(review): step 3 is left byte-identical so the checksums stay
  # comparable between sites. GNU sed's "e" flag executes the substituted text
  # as a shell command, so ids containing quotes, \$ or backticks would be
  # interpreted by that shell, and the <<< here-string presumably needs the
  # shell sed invokes to be bash-compatible - confirm before changing.
  sed 's/ //3g' "$out_dir/CheckSum.$i.$COHORT.raw" \
    | grep -v -E '^FID[[:space:]]+IID([[:space:]]|$)' \
    | sed -r 's/(\S+\s)(\S+)(.*)/echo "\1:::\2 $(md5sum <<<"\3") \"/e;s/ - //' \
    | sed 's/ :::/:/' > "$out_dir/checkSum_$i.$COHORT.txt"
  #rm checkSum_$COHORT/CheckSum.$i.$COHORT.raw
done



echo "#############################################################################################"


echo "
Finished making checksums. Now merging the files...
"

#############################################################################################

out_dir="checkSum_$COHORT"

# Getting all possible IDs
# The ids are streamed straight into sort/uniq, so nothing is appended to a
# scratch file that might still hold rows from an interrupted earlier run.
for ((i = 1; i <= 130; i++)); do
  awk '{print $1}' "$out_dir/checkSum_$i.$COHORT.txt"
done | sort | uniq > "$out_dir/ID.list"

# Replacing missings with NULLs
# One two-column file per list: a header row "ID List<i>" followed by every
# id with either its checksum or NULL when the id is absent from that list.
merge_inputs=()
for ((i = 1; i <= 130; i++)); do
  {
    echo "ID List$i "
    join -t ' ' -a 1 -a 2 -1 1 -2 1 -e NULL -o 0,2.2 \
      "$out_dir/ID.list" <(sort "$out_dir/checkSum_$i.$COHORT.txt")
  } > "$out_dir/temp$i"
  merge_inputs+=("$out_dir/temp$i")
done

#Merging
# awk appends the second column of every file to the row keyed by the first
# column, in file order; rows come out in arbitrary order, so the header is
# written first and the sample rows are sorted after it.
awk '$0 !~ /#/{arr[$1]=arr[$1] " " $2}END{for(i in arr)print i,arr[i]}' \
  "${merge_inputs[@]}" > "$out_dir/merged"
# The header row is the only one whose key is exactly "ID" (sample keys have
# the form FID:IID). Matching it at the start of the line keeps samples whose
# ids contain the letters "ID" out of the header section.
grep '^ID ' "$out_dir/merged" > "$out_dir/${COHORT}_Merged_checksums.txt"
grep -v '^ID ' "$out_dir/merged" | sort >> "$out_dir/${COHORT}_Merged_checksums.txt"

# Remove the scratch files, including names used by earlier versions of this
# step (building, file.list) in case an old run left them behind.
rm -f -- "$out_dir"/temp* "$out_dir/building" "$out_dir/merged" \
  "$out_dir/ID.list" "$out_dir/file.list"

echo "
Finished merging. Now tidying up...
"

out_dir="checkSum_$COHORT"
upload_zip="$out_dir/${COHORT}_checkSums_ready_to_upload.zip"

# Move the intermediate files into an archive that stays on site. zip's
# listing goes to temp.txt to keep the screen output short. A problem here
# does not affect the upload file, so it is reported but not fatal.
if ! zip -mTr "$out_dir/checksum_buildingblocks_do_not_upload.zip" \
  "$out_dir"/CheckSum.* "$out_dir"/snps_for_checksum* "$out_dir"/checkSum_*.txt \
  > "$out_dir/temp.txt"; then
  echo "
  WARNING: The intermediate files could not be archived - see $out_dir/temp.txt"
fi
#rm  checkSum_$COHORT/temp.txt

# Build the archive to upload. Do not announce success unless zip succeeded
# and the archive is really there.
if ! zip -mTr "$upload_zip" "$out_dir/${COHORT}_Merged_checksums.txt" \
  "$out_dir/checksum_extraction.logs" || [[ ! -f "$upload_zip" ]]; then
  echo "
  ERROR exiting now: The zip file $upload_zip could not be created - Please check that zip is installed and that $out_dir is writable"
  exit 1
fi

echo "
Your zip file ($upload_zip) is now ready for upload.
Thank you for running these analyses.

#############################################################################################

"

