Skip to content

Latest commit

 

History

History
283 lines (241 loc) · 7.89 KB

20200923_snp.md

File metadata and controls

283 lines (241 loc) · 7.89 KB

snp

remove adapter

cutadapt_function(){
echo cutadapt -m 20 -a ${adapt1} -A ${adapt2} -o ${data_path}/out1.fastq -p ${data_path}/out2.fastq ${data_path}/${file1} ${data_path}/${file2}
cutadapt -m 20 -a ${adapt1} -A ${adapt2} -o ${data_path}/out1.fastq -p ${data_path}/out2.fastq ${data_path}/${file1} ${data_path}/${file2}

}


# sample1
data_path=/home/xugang/data/wangjydata/1.rawdata/0_FKDL202596126-1a/
adapt1=GGACTCC
adapt2=TATCCTCT
file1=0_FKDL202596126-1a_1.fq.gz
file2=0_FKDL202596126-1a_2.fq.gz
cutadapt_function

#sample2
data_path=/home/xugang/data/wangjydata/1.rawdata/0-jia_FKDL202596128-1a/
adapt1=TAGGCATG
adapt2=TATCCTCT
file1=0-jia_FKDL202596128-1a_1.fq.gz
file2=0-jia_FKDL202596128-1a_2.fq.gz
cutadapt_function
#sample3
data_path=/home/xugang/data/wangjydata/1.rawdata/1_FKDL202596127-1a/
adapt1=TAAGGCGA
adapt2=AGAGTAGA
file1=1_FKDL202596127-1a_1.fq.gz
file2=1_FKDL202596127-1a_2.fq.gz
cutadapt_function
#sample4
data_path=/home/xugang/data/wangjydata/1.rawdata/1-jia_FKDL202596129-1a/
adapt1=CGTACTAG
adapt2=AGAGTAGA
file1=1-jia_FKDL202596129-1a_1.fq.gz
file2=1-jia_FKDL202596129-1a_2.fq.gz
cutadapt_function

bowtie map

indexp=/home/xugang/data/wangjydata/ref/bowtie_index/emx1
indexp2=/home/xugang/data/wangjydata/ref/bowtie2_index/emx1


bowtief2(){
echo "bowtie2 $name";
bowtie2 -p 6 -N 1 -x ${indexp2} -1 ${datap}/out1.fastq -2 ${datap}/out2.fastq --seed 0 --very-sensitive -S ${datap}/${name}.sam > ${datap}/log.${name}.log

}

bowtief(){
echo "bowtie $name";
bowtie -p 6  ${indexp} -1 ${datap}/out1.fastq -2 ${datap}/out2.fastq -n 3 -S ${datap}/${name}.bowtie1.sam
}

datap=/home/xugang/data/wangjydata/1.rawdata/0_FKDL202596126-1a
name=0_fkd
bowtief
bowtief2

datap=/home/xugang/data/wangjydata/1.rawdata/0-jia_FKDL202596128-1a/
name=0-jia
bowtief
bowtief2

datap=/home/xugang/data/wangjydata/1.rawdata/1_FKDL202596127-1a/
name=1_fkd
bowtief
bowtief2

datap=/home/xugang/data/wangjydata/1.rawdata/1-jia_FKDL202596129-1a/
name=1-jia
bowtief
bowtief2

bam sort and index

sort(){
samtools view -S -b ${datap}/${name} > ${datap}/${name}.bam
samtools sort ${datap}/${name}.bam > ${datap}/${name}.sort.bam
rm ${datap}/${name}.bam
samtools index ${datap}/${name}.sort.bam
}

# file path
datap=/home/xugang/data/wangjydata/1.rawdata/0_FKDL202596126-1a/
# file name
name=0_fkd.sam
# function
sort
name=0_fkd.bowtie1.sam
# function
sort

# file path
datap=/home/xugang/data/wangjydata/1.rawdata/0-jia_FKDL202596128-1a
# file name
name=0-jia.sam
# function
sort
# file name
name=0-jia.bowtie1.sam
# function
sort

# file path
datap=/home/xugang/data/wangjydata/1.rawdata/1_FKDL202596127-1a
# file name
name=1_fkd.sam
# function
sort
# file name
name=1_fkd.bowtie1.sam
# function
sort

# file path
datap=/home/xugang/data/wangjydata/1.rawdata/1-jia_FKDL202596129-1a
# file name
name=1-jia.sam
# function
sort
# file name
name=1-jia.bowtie1.sam
# function
sort
snp(){
[ -d ${outp}/ ] || mkdir -p ${outp}/
freebayes -f ${genome} ${datap}/${name} >${outp}/${name}.vcf
}


# do not change
genome=/home/xugang/data/wangjydata/ref/emx1.fa
outp=/home/xugang/data/wangjydata/3.var.new

# file path
datap=/home/xugang/data/wangjydata/1.rawdata/0_FKDL202596126-1a/
# file name
name=0_fkd.sam.sort.bam
# function
snp
name=0_fkd.bowtie1.sam.sort.bam
# function
snp

# file path
datap=/home/xugang/data/wangjydata/1.rawdata/0-jia_FKDL202596128-1a
# file name
name=0-jia.sam.sort.bam
# function
snp
# file name
name=0-jia.bowtie1.sam.sort.bam
# function
snp

# file path
datap=/home/xugang/data/wangjydata/1.rawdata/1_FKDL202596127-1a
# file name
name=1_fkd.sam.sort.bam
# function
snp
# file name
name=1_fkd.bowtie1.sam.sort.bam
# function
snp

# file path
datap=/home/xugang/data/wangjydata/1.rawdata/1-jia_FKDL202596129-1a
# file name
name=1-jia.sam.sort.bam
# function
snp
# file name
name=1-jia.bowtie1.sam.sort.bam
# function
snp

Calling variants: from fastq to VCF

You've sequenced some samples. You have a reference genome or assembled set of contigs, and you'd like to determine reference-relative variants in your samples. You can use freebayes to detect the variants, following these steps:

  • Align your reads to a suitable reference (e.g. with bwa or MOSAIK)
  • Ensure your alignments have read groups attached so their sample may be identified by freebayes. Aligners allow you to do this, but you can also use bamaddrg to do so post-alignment.
  • Sort the alignments (e.g. sambamba sort).
  • Mark duplicates, for instance with sambamba markdup (if PCR was used in the preparation of your sequencing library)
  • Run freebayes on all your alignment data simultaneously, generating a VCF. The default settings should work for most use cases, but if your samples are not diploid, set the --ploidy and adjust the --min-alternate-fraction suitably.
  • Filter the output e.g. using reported QUAL and/or depth (DP) or observation count (AO).
  • Interpret your results.
  • (possibly, Iterate the variant detection process in response to insight gained from your interpretation)
bamcount(){
[ -d ${outp} ] || mkdir -p ${outp}
bam-readcount -f ${genome} ${datap}/${name} > ${outp}/${name}.txt 2>${outp}/${name}.log

}


# do not change
genome=/home/xugang/data/wangjydata/ref/emx1.fa
outp=/home/xugang/data/wangjydata/4.bam_count

# file path
datap=/home/xugang/data/wangjydata/1.rawdata/0_FKDL202596126-1a/
# file name
name=0_fkd.bowtie1.sam.sort.bam
# function
bamcount
name=0_fkd.sam.sort.bam
# function
bamcount

# file path
datap=/home/xugang/data/wangjydata/1.rawdata/0-jia_FKDL202596128-1a
# file name
name=0-jia.sam.sort.bam
# function
bamcount
# file name
name=0-jia.bowtie1.sam.sort.bam
# function
bamcount

# file path
datap=/home/xugang/data/wangjydata/1.rawdata/1_FKDL202596127-1a
# file name
name=1_fkd.sam.sort.bam
# function
bamcount
# file name
name=1_fkd.bowtie1.sam.sort.bam
# function
bamcount

# file path
datap=/home/xugang/data/wangjydata/1.rawdata/1-jia_FKDL202596129-1a
# file name
name=1-jia.sam.sort.bam
# function
bamcount
# file name
name=1-jia.bowtie1.sam.sort.bam
# function
bamcount

Normal Output

The output format prints to standard out as follows:

chr	position	reference_base	depth	base:count:avg_mapping_quality:avg_basequality:avg_se_mapping_quality:num_plus_strand:num_minus_strand:avg_pos_as_fraction:avg_num_mismatches_as_fraction:avg_sum_mismatch_qualities:num_q2_containing_reads:avg_distance_to_q2_start_in_q2_reads:avg_clipped_length:avg_distance_to_effective_3p_end   ...

Each tab separated field contains the following information, each one is tab-separated:

  • base → the base that all reads following in this field contain at the reported position i.e. C
  • count → the number of reads containing the base
  • avg_mapping_quality → the mean mapping quality of reads containing the base
  • avg_basequality → the mean base quality for these reads
  • avg_se_mapping_quality → mean single ended mapping quality
  • num_plus_strand → number of reads on the plus/forward strand
  • num_minus_strand → number of reads on the minus/reverse strand
  • avg_pos_as_fraction → average position on the read as a fraction (calculated with respect to the length after clipping). This value is normalized to the center of the read (bases occurring strictly at the center of the read have a value of 1, those occurring strictly at the ends should approach a value of 0)
  • avg_num_mismatches_as_fraction → average number of mismatches on these reads per base
  • avg_sum_mismatch_qualities → average sum of the base qualities of mismatches in the reads
  • num_q2_containing_reads → number of reads with q2 runs at the 3’ end
  • avg_distance_to_q2_start_in_q2_reads → average distance of position (as fraction of unclipped read length) to the start of the q2 run
  • avg_clipped_length → average clipped read length of reads
  • avg_distance_to_effective_3p_end → average distance to the 3’ prime end of the read (as fraction of unclipped read length)