18 April 2022 0 8K Report

Hi everyone,

I need to analyze exome sequencing data (normal-tumor paired samples) for learning and also for a project. The analysis is simple but I lack experience, so I need help.

To begin with, I followed few tutorials, GATK best practices guidelines, and other internet resources, and ended up with this sequence of commands to run the entire workflow. My goal is to find out mutated/altered/deleted/truncated genes in tumor samples (gastric cancer) and then link them with tumor initiation. Can anyone comment on this sequence whether it is correct, or does it need any modification? Thank you in advance.

My sequence of commands is as (I deleted comments to make it shorter):

#!/bin/bash

#It will stop the file at first error

set -e

dir=/mnt/gc_exom_analysis/resources_broad_gatk2/

#The reference and vcf directories

REF=/mnt/gc_exom_analysis/resources_broad_gatk2/hg38_v0_Homo_sapiens_assembly38_1.fasta

snpEff=/home/fsbserver/snpEff/snpEff.jar

SnpSift=/home/fsbserver/snpEff/SnpSift.jar

#For variant; list is long

VCF1=/mnt/gc_exom_analysis/resources_broad_gatk2/hg38_v0_Homo_sapiens_assembly38.dbsnp138.vcf

#Dummy read group info can be added to bypass error;

# ID = ReadGroup identifier; PU =plateform Unit SM = sample

# PL = plateform technology used to produce the read

# LB DNA preparation library identifier

# to add dummy read groups

RG="@RG\tID:sample_1\tLB:sample_1\tPL:ILLUMINA\tPM:HISEQ\tSM:sample_1"

#bwa mem2 alignment

bwa-mem2 mem -t 12 -M -R $RG $REF *1_N.fastq.gz *2_N.fastq.gz > normal.sam

bwa-mem2 mem -t 12 -M -R $RG $REF *_2_T.fastq.gz > tumor.sam

# sort the alignment by coordinates

picard SortSam INPUT=normal.sam OUTPUT=normal_sorted.bam SORT_ORDER=coordinate VALIDATION_STRINGENCY=SILENT TMP_DIR=/mnt/temp_dir

picard SortSam INPUT=tumor.sam OUTPUT=tumor_sorted.bam SORT_ORDER=coordinate VALIDATION_STRINGENCY=SILENT TMP_DIR=/mnt/temp_dir

# computing statistics to see how well the reads aligned to the reference genome.

samtools flagstat normal_sorted.bam > align_mat_normal.txt

samtools flagstat tumor_sorted.bam > align_mat_tumor.txt

picard MarkDuplicates INPUT=normal_sorted.bam \

OUTPUT=normal_sort_marked.bam \

METRICS_FILE=align_mat_normal.txt \

ASSUME_SORTED=true \

VALIDATION_STRINGENCY=SILENT

picard MarkDuplicates INPUT=tumor_sorted.bam \

OUTPUT=tumor_sort_marked.bam \

METRICS_FILE=align_mat_tumor.txt \

ASSUME_SORTED=true \

VALIDATION_STRINGENCY=SILENT

#index generation

samtools index normal_sort_marked.bam

samtools index tumor_sort_marked.bam

#Base Recalibrator has been divided into two steps:

# (1) calculate base frequencies using BaseRecalibrator

# (2) apply base recalibration using "GATK ApplyBQSR"

#This tool performs the second pass in a two-stage process called Base Quality Score Recalibration (BQSR).

#Specifically, it recalibrates the base qualities of the input reads based on the recalibration table produced by the BaseRecalibrator tool,

#and outputs a recalibrated BAM or CRAM file.

gatk BaseRecalibrator -R $REF --known-sites $VCF1 \

-I normal_sort_marked.bam -O normal_bqsr.table

gatk BaseRecalibrator -R $REF --known-sites $VCF1 \

-I tumor_sort_marked.bam -O tumor_bqsr.table

gatk ApplyBQSR -R $REF -I normal_sort_marked.bam --bqsr-recal-file normal_bqsr.table -O normal_bqsr.bam

gatk ApplyBQSR -R $REF -I tumor_sort_marked.bam --bqsr-recal-file tumor_bqsr.table -O tumor_bqsr.bam

#PrintRead is to filter reads based on various criteria

#The default filter is "WellFormedReadFilter" and along with this, additional filter can be applied

gatk PrintReads -R $REF -I normal_bqsr.bam --read-filter NotDuplicateReadFilter -O normal_filter_PR.bam

gatk PrintReads -R $REF -I tumor_bqsr.bam --read-filter NotDuplicateReadFilter -O tumor_filter_PR.bam

# Call somatic short mutations via local assembly of haplotypes.

# Short mutations include single nucleotide (SNA) and insertion and deletion (indel) alterations.

# The caller uses a Bayesian somatic genotyping model that differs from the original

# MuTect by Cibulskis et al., 2013 and uses the assembly-based machinery of HaplotypeCaller. Of note, Mutect2 v4.1.0.0 onwards enables joint analysis of multiple samples.

gatk Mutect2 -R $REF -I normal_filter_PR.bam \

-I tumor_filter_PR.bam \

-germline-resource $dir/somatic-hg38_af-only-gnomad.hg38.vcf.gz \

-pon $dir/somatic-hg38_1000g_pon.hg38.vcf.gz \

--f1r2-tar-gz NandT_f1r2.tar.gz -O NandT_raw.vcf

#Read orientation

gatk LearnReadOrientationModel -I NandT_f1r2.tar.gz -O NandT_read-orientation-model.tar.gz

#

gatk GetPileupSummaries -I tumor_filter_PR.bam \

-V $dir/somatic-hg38_small_exac_common_3.hg38.vcf.gz \

-L $dir/somatic-hg38_small_exac_common_3.hg38.vcf.gz \

-O tumor_pileup.table

gatk GetPileupSummaries -I normal_filter_PR.bam \

-V $dir/somatic-hg38_small_exac_common_3.hg38.vcf.gz \

-L $dir/somatic-hg38_small_exac_common_3.hg38.vcf.gz \

-O normal_pileup.table

#cross contamination or technical contamination

gatk CalculateContamination -I tumor_pileup.table -matched normal_pileup.table -O NandT_contamination.table

gatk FilterMutectCalls -R $REF -V NandT_raw.vcf --contamination-table NandT_contamination.table \

-ob-priors NandT_read-orientation-model.tar.gz -O NandT_no_contamination.vcf

# Separate vaiants into SNP and INDELs

gatk SelectVariants -R $REF -V NandT_no_contamination.vcf -select-type-to-include SNP -O NandT_no_contamination_SNP.vcf

gatk SelectVariants -R $REF -V NandT_no_contamination.vcf -select-type-to-include INDEL -O NandT_no_contamination_INDEL.vcf

# Filter variants absed on read quality; if the requested filter is absent, it will raise a warning

gatk VariantFiltration -R $REF -V NandT_no_contamination_SNP.vcf -O NandT_filtered_SNP.vcf \

--filter-name "QD_filter" -filter "QD < 2.0 " \

--filter-name "FS_filter" -filter "FS > 60.0 " \

--filter-name "MQ_filter" -filter "MQ < 40.0 " \

--filter-name "SOR_filter" -filter "SOR > 10.0 "

gatk VariantFiltration -R $REF -V NandT_no_contamination_INDEL.vcf -O NandT_filtered_INDEL.vcf \

--filter-name "QD_filter" -filter "QD < 2.0 " \

--filter-name "FS_filter" -filter "FS > 200.0 " \

--filter-name "SOR_filter" -filter "SOR > 10.0 "

#this add the SNP Ids from dbSNP

java -Xmx8g -jar $SnpSift annotate $VCF1 NandT_filtered_SNP.vcf > NandT_SNP_dbSNP.vcf

java -Xmx8g -jar $SnpSift annotate $VCF1 NandT_filtered_INDEL.vcf > NandT_INDEL_dbSNP.vcf

# SnpEff annotate the filtered variants

java -Xmx8g -jar $snpEff -s NandT_SNP_snpEFF.html -v GRCh38.86 NandT_SNP_dbSNP.vcf > NandT_SNP_ann_snpEff.vcf

java -Xmx8g -jar $snpEff -s NandT_INDEL_snpEFF.html -v GRCh38.86 NandT_INDEL_dbSNP.vcf > NandT_INDEL_ann_snpEff.vcf

More Ayaz Anwar's questions See All
Similar questions and discussions