Hi everyone,
I need to analyze exome sequencing data (normal-tumor paired samples) for learning and also for a project. The analysis is simple but I lack experience, so I need help.
To begin with, I followed few tutorials, GATK best practices guidelines, and other internet resources, and ended up with this sequence of commands to run the entire workflow. My goal is to find out mutated/altered/deleted/truncated genes in tumor samples (gastric cancer) and then link them with tumor initiation. Can anyone comment on this sequence whether it is correct, or does it need any modification? Thank you in advance.
My sequence of commands is as (I deleted comments to make it shorter):
#!/bin/bash
#It will stop the file at first error
set -e
dir=/mnt/gc_exom_analysis/resources_broad_gatk2/
#The reference and vcf directories
REF=/mnt/gc_exom_analysis/resources_broad_gatk2/hg38_v0_Homo_sapiens_assembly38_1.fasta
snpEff=/home/fsbserver/snpEff/snpEff.jar
SnpSift=/home/fsbserver/snpEff/SnpSift.jar
#For variant; list is long
VCF1=/mnt/gc_exom_analysis/resources_broad_gatk2/hg38_v0_Homo_sapiens_assembly38.dbsnp138.vcf
#Dummy read group info can be added to bypass error;
# ID = ReadGroup identifier; PU =plateform Unit SM = sample
# PL = plateform technology used to produce the read
# LB DNA preparation library identifier
# to add dummy read groups
RG="@RG\tID:sample_1\tLB:sample_1\tPL:ILLUMINA\tPM:HISEQ\tSM:sample_1"
#bwa mem2 alignment
bwa-mem2 mem -t 12 -M -R $RG $REF *1_N.fastq.gz *2_N.fastq.gz > normal.sam
bwa-mem2 mem -t 12 -M -R $RG $REF *_2_T.fastq.gz > tumor.sam
# sort the alignment by coordinates
picard SortSam INPUT=normal.sam OUTPUT=normal_sorted.bam SORT_ORDER=coordinate VALIDATION_STRINGENCY=SILENT TMP_DIR=/mnt/temp_dir
picard SortSam INPUT=tumor.sam OUTPUT=tumor_sorted.bam SORT_ORDER=coordinate VALIDATION_STRINGENCY=SILENT TMP_DIR=/mnt/temp_dir
# computing statistics to see how well the reads aligned to the reference genome.
samtools flagstat normal_sorted.bam > align_mat_normal.txt
samtools flagstat tumor_sorted.bam > align_mat_tumor.txt
picard MarkDuplicates INPUT=normal_sorted.bam \
OUTPUT=normal_sort_marked.bam \
METRICS_FILE=align_mat_normal.txt \
ASSUME_SORTED=true \
VALIDATION_STRINGENCY=SILENT
picard MarkDuplicates INPUT=tumor_sorted.bam \
OUTPUT=tumor_sort_marked.bam \
METRICS_FILE=align_mat_tumor.txt \
ASSUME_SORTED=true \
VALIDATION_STRINGENCY=SILENT
#index generation
samtools index normal_sort_marked.bam
samtools index tumor_sort_marked.bam
#Base Recalibrator has been divided into two steps:
# (1) calculate base frequencies using BaseRecalibrator
# (2) apply base recalibration using "GATK ApplyBQSR"
#This tool performs the second pass in a two-stage process called Base Quality Score Recalibration (BQSR).
#Specifically, it recalibrates the base qualities of the input reads based on the recalibration table produced by the BaseRecalibrator tool,
#and outputs a recalibrated BAM or CRAM file.
gatk BaseRecalibrator -R $REF --known-sites $VCF1 \
-I normal_sort_marked.bam -O normal_bqsr.table
gatk BaseRecalibrator -R $REF --known-sites $VCF1 \
-I tumor_sort_marked.bam -O tumor_bqsr.table
gatk ApplyBQSR -R $REF -I normal_sort_marked.bam --bqsr-recal-file normal_bqsr.table -O normal_bqsr.bam
gatk ApplyBQSR -R $REF -I tumor_sort_marked.bam --bqsr-recal-file tumor_bqsr.table -O tumor_bqsr.bam
#PrintRead is to filter reads based on various criteria
#The default filter is "WellFormedReadFilter" and along with this, additional filter can be applied
gatk PrintReads -R $REF -I normal_bqsr.bam --read-filter NotDuplicateReadFilter -O normal_filter_PR.bam
gatk PrintReads -R $REF -I tumor_bqsr.bam --read-filter NotDuplicateReadFilter -O tumor_filter_PR.bam
# Call somatic short mutations via local assembly of haplotypes.
# Short mutations include single nucleotide (SNA) and insertion and deletion (indel) alterations.
# The caller uses a Bayesian somatic genotyping model that differs from the original
# MuTect by Cibulskis et al., 2013 and uses the assembly-based machinery of HaplotypeCaller. Of note, Mutect2 v4.1.0.0 onwards enables joint analysis of multiple samples.
gatk Mutect2 -R $REF -I normal_filter_PR.bam \
-I tumor_filter_PR.bam \
-germline-resource $dir/somatic-hg38_af-only-gnomad.hg38.vcf.gz \
-pon $dir/somatic-hg38_1000g_pon.hg38.vcf.gz \
--f1r2-tar-gz NandT_f1r2.tar.gz -O NandT_raw.vcf
#Read orientation
gatk LearnReadOrientationModel -I NandT_f1r2.tar.gz -O NandT_read-orientation-model.tar.gz
#
gatk GetPileupSummaries -I tumor_filter_PR.bam \
-V $dir/somatic-hg38_small_exac_common_3.hg38.vcf.gz \
-L $dir/somatic-hg38_small_exac_common_3.hg38.vcf.gz \
-O tumor_pileup.table
gatk GetPileupSummaries -I normal_filter_PR.bam \
-V $dir/somatic-hg38_small_exac_common_3.hg38.vcf.gz \
-L $dir/somatic-hg38_small_exac_common_3.hg38.vcf.gz \
-O normal_pileup.table
#cross contamination or technical contamination
gatk CalculateContamination -I tumor_pileup.table -matched normal_pileup.table -O NandT_contamination.table
gatk FilterMutectCalls -R $REF -V NandT_raw.vcf --contamination-table NandT_contamination.table \
-ob-priors NandT_read-orientation-model.tar.gz -O NandT_no_contamination.vcf
# Separate vaiants into SNP and INDELs
gatk SelectVariants -R $REF -V NandT_no_contamination.vcf -select-type-to-include SNP -O NandT_no_contamination_SNP.vcf
gatk SelectVariants -R $REF -V NandT_no_contamination.vcf -select-type-to-include INDEL -O NandT_no_contamination_INDEL.vcf
# Filter variants absed on read quality; if the requested filter is absent, it will raise a warning
gatk VariantFiltration -R $REF -V NandT_no_contamination_SNP.vcf -O NandT_filtered_SNP.vcf \
--filter-name "QD_filter" -filter "QD < 2.0 " \
--filter-name "FS_filter" -filter "FS > 60.0 " \
--filter-name "MQ_filter" -filter "MQ < 40.0 " \
--filter-name "SOR_filter" -filter "SOR > 10.0 "
gatk VariantFiltration -R $REF -V NandT_no_contamination_INDEL.vcf -O NandT_filtered_INDEL.vcf \
--filter-name "QD_filter" -filter "QD < 2.0 " \
--filter-name "FS_filter" -filter "FS > 200.0 " \
--filter-name "SOR_filter" -filter "SOR > 10.0 "
#this add the SNP Ids from dbSNP
java -Xmx8g -jar $SnpSift annotate $VCF1 NandT_filtered_SNP.vcf > NandT_SNP_dbSNP.vcf
java -Xmx8g -jar $SnpSift annotate $VCF1 NandT_filtered_INDEL.vcf > NandT_INDEL_dbSNP.vcf
# SnpEff annotate the filtered variants
java -Xmx8g -jar $snpEff -s NandT_SNP_snpEFF.html -v GRCh38.86 NandT_SNP_dbSNP.vcf > NandT_SNP_ann_snpEff.vcf
java -Xmx8g -jar $snpEff -s NandT_INDEL_snpEFF.html -v GRCh38.86 NandT_INDEL_dbSNP.vcf > NandT_INDEL_ann_snpEff.vcf