#!/bin/bash # TwistAllianceClinicalResearchExome_Covered_Targets_hg38.interval_list was provided by Edyta from this local dir: /broad/tag_working/edytam/dragen/exome_low-pass-genome/bed/original # and was converted from bed to interval list with picard # Clinvar code below was adapted from Ben Weisburd in 2019 picard=/Users/mshand/Github/picard/build/libs/picard.jar gatk=/Users/mshand/Github/gatk/build/libs/gatk.jar gsutil cp gs://gcp-public-data--broad-references/hg38/v0/exome_calling_regions.v1.1.interval_list . wget ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz mv clinvar.vcf.gz clinvar_20230121.vcf.gz #remove variants that are not on the canonical autosomes and filter for just pathogenic gzcat clinvar_20230121.vcf.gz | grep -i '^#\|pathogenic' | grep -v '^MT' | sed $'s/^/chr/' | sed $'s/chr[#]/#/' | bgzip > clinvar_20230121.GRCh38.pathogenic.vcf.gz zgrep '^#' clinvar_20230121.GRCh38.pathogenic.vcf.gz | grep -v CHROM > clinvarHeader.orig.vcf zgrep '^#CHROM' clinvar_20230121.GRCh38.pathogenic.vcf.gz > clinvarHeaderTail.vcf zgrep '^[^#]' clinvar_20230121.GRCh38.pathogenic.vcf.gz > clinvarVariants.txt cat clinvarHeader.orig.vcf hg38_vcf_contigs.txt clinvarHeaderTail.vcf clinvarVariants.txt > clinvar.fixed.vcf #Limit clinvar variants to only non-coding (outside of what is already covered by exomev1.1 and Twist) #Remove deletions longer than 500 bases since we won't be able to call them from BGE short reads java -jar $gatk SelectVariants -XL TwistAllianceClinicalResearchExome_Covered_Targets_hg38.interval_list \ -XL exome_calling_regions.v1.1.interval_list -V clinvar.fixed.vcf -O noncoding_no_long_dels_pathogenic.vcf \ -select '!(vc.isIndel()&&vc.isBiallelic()&&vc.getIndelLengths().get(0)<-500)' #exomev1.1 already has padding, but Twist and clinvar need 50bp of padding added java -jar $picard IntervalListTools --PADDING 50 -I TwistAllianceClinicalResearchExome_Covered_Targets_hg38.interval_list \ -O TwistAllianceClinicalResearchExome_Covered_Targets_hg38_50padding.interval_list java -jar $picard IntervalListTools --PADDING 50 -I noncoding_no_long_dels_pathogenic.vcf \ -O clinvar_20230121_noncoding_non_long_deletion_pathogenic_50padding.interval_list #Add names to twist and clinvar interval lists, exomev1.1 already has this #Name the twist intervals with an incremented number so they are unique grep -v '^@' TwistAllianceClinicalResearchExome_Covered_Targets_hg38_50padding.interval_list | awk 'sub(/\./,"twist_target_"c++)' > TwistNames.intervals grep '^@' TwistAllianceClinicalResearchExome_Covered_Targets_hg38_50padding.interval_list > twistHeader.txt cat twistHeader.txt TwistNames.intervals > TwistAllianceClinicalResearchExome_Covered_Targets_hg38_50padding_named.interval_list #Clinvar has a number in the name column (5th column) so leave it as is but add a ClinVar string grep -v '^@' clinvar_20230121_noncoding_non_long_deletion_pathogenic_50padding.interval_list | awk 'BEGIN{OFS="\t"}$5="clinvar_pathogenic_"$5' > clinvarNames.intervals grep '^@' clinvar_20230121_noncoding_non_long_deletion_pathogenic_50padding.interval_list > clinvarHeader.txt cat clinvarHeader.txt clinvarNames.intervals > clinvar_20230121_noncoding_non_long_deletion_pathogenic_50padding_named.interval_list #merge all three lists java -jar $picard IntervalListTools -UNIQUE \ -I exome_calling_regions.v1.1.interval_list \ -I clinvar_20230121_noncoding_non_long_deletion_pathogenic_50padding_named.interval_list \ -I TwistAllianceClinicalResearchExome_Covered_Targets_hg38_50padding_named.interval_list \ -O bge_exome_calling_regions.v1.interval_list #Produced 214211 intervals totalling 96619658 bases