forked from ParkerLab/bioinf545
-
Notifications
You must be signed in to change notification settings - Fork 0
/
commands
executable file
·97 lines (71 loc) · 4.13 KB
/
commands
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
#!/bin/bash
set -e # stop at any error
LAB_DIR="${HOME}/bioinf545/labs/atac-seq"
mkdir -p $LAB_DIR
cd $LAB_DIR
LAB_DATA="${1:-/class/data/bio545/atac-seq-lab}"
REF_DIR="${2:-${LAB_DATA}/data}"
REF="${3:-hg19}"
PICARD_JAR="${4:-${LAB_DATA}/bin/picard.jar}"
export R_LIBS_SITE=${LAB_DATA}/R/%p/%v
# Copy lab scripts and input data
cp ${LAB_DATA}/data/*.gz ${LAB_DIR}
cp ${LAB_DATA}/data/*.meme ${LAB_DIR}
export PATH=${LAB_DATA}/bin:${LAB_DATA}/ve/bin:$PATH
# Run FastQC on the file of first reads
fastqc SRR891268.1.fq.gz
# check FastQC output
# firefox SRR891268.1_fastqc.html
# Trim adapter content from the reads
trim_adapters SRR891268.1.fq.gz SRR891268.2.fq.gz
# Check the results of trimming
# zdiff -u SRR891268.1.fq.gz SRR891268.1.trimmed.fastq.gz | less
# Align the trimmed reads to the human reference genome
bwa mem -t 4 ${REF_DIR}/${REF} SRR891268.1.trimmed.fastq.gz SRR891268.2.trimmed.fastq.gz | samtools sort -@ 4 -O bam -T SRR891268.tmp -o SRR891268.bam -
# Mark duplicate aligned reads
java -Xmx8g -jar ${PICARD_JAR} MarkDuplicates I=SRR891268.bam O=SRR891268.md.bam ASSUME_SORTED=true METRICS_FILE=SRR891268.markdup.metrics VALIDATION_STRINGENCY=LENIENT
# Index the BAM file of aligned reads with duplicates marked
samtools index SRR891268.md.bam
# Filter the aligned reads to produce a file of high-quality, properly paired and mapped unique alignments to autosomal references
export CHROMOSOMES=$(samtools view -H SRR891268.md.bam | grep '^@SQ' | cut -f 2 | grep -v -e _ -e chrM -e chrX -e chrY -e 'VN:' | sed 's/SN://' | xargs echo); samtools view -b -h -f 3 -F 4 -F 8 -F 256 -F 1024 -F 2048 -q 30 SRR891268.md.bam $CHROMOSOMES > SRR891268.pruned.bam
# Index the pruned BAM
samtools index SRR891268.pruned.bam
# Call peaks -- find regions enriched for ATAC-seq transpositions
macs2 callpeak -t SRR891268.pruned.bam -n SRR891268.broad -g hs -q 0.05 --nomodel --shift -100 --extsize 200 -B --broad
# Create a bigWig file from the MACS2 output, so we can look at the peaks in the UCSC Genome Browser
LC_COLLATE=C sort -k1,1 -k2,2n SRR891268.broad_treat_pileup.bdg > SRR891268.broad_treat_pileup.sorted.bdg
bedGraphToBigWig SRR891268.broad_treat_pileup.sorted.bdg ${REF_DIR}/${REF}.chrom_sizes SRR891268.broad_peaks.bw
#
# Create background file for chromosome 20
#
zcat chr20.fa.gz | fasta-get-markov /dev/stdin chr20.bg
#
# Find CTCF motifs on chromosome 20
#
zcat chr20.fa.gz | fimo -bgfile chr20.bg CTCF_known2.meme /dev/stdin
#
# Convert FIMO output GFF to a BED file
#
gff2bed < fimo_out/fimo.gff | awk 'BEGIN {IFS="\t"; OFS="\t";} {print $1,$2,$3,$4,$5,$6}' | gzip > NA12878.CTCF_known2.chr20.bed.gz
# Create signal matrix for CENTIPEDE. The make_cut_matrix program
# takes a BAM file containing aligned ATAC-seq reads and a BED file
# containing occurrences of a given motif in the reference genome, as
# found by the FIMO program from the MEME suite. FIMO finds these
# locations given a transcription factor-specific position weight
# matrix that matches sequence motifs to which the TF is known to
# bind. make_cut_matrix measures ATAC-seq transposition events around
# the motif locations, which CENTIPEDE will use in determining the
# probability that the locations are bound by the transcription
# factor.
make_cut_matrix -v -d -p 2 -f 3 -F 4 -F 8 -q 30 SRR891268.pruned.bam NA12878.CTCF_known2.chr1.bed.gz | gzip -c > NA12878.CTCF_known2.matrix.gz
# Run CENTIPEDE with the matrix just generated (which contains a row
# of ATAC-seq transposition events around each motif location) and the
# BED file fed to make_cut_matrix. The FIMO score is in field 8 of the
# input BED file.
run_centipede.R NA12878.CTCF_known2.matrix.gz NA12878.CTCF_known2.chr1.bed.gz NA12878.CTCF_known2.centipede.bed.gz 8
# Extract the probably-bound motifs -- those to which CENTIPEDE
# assigned a posterior probability of at least 0.99. Its score is in
# the last column of its output. The awk command filters CENTIPEDE
# output for the 0.99 threshold, printing only the fields necessary to
# create a BED track in the UCSC Genome Browser.
zcat NA12878.CTCF_known2.centipede.bed.gz | awk 'BEGIN{IFS="\t"; OFS="\t"} $NF > 0.99 {print $1,$2,$3,$4,$5,$6,$NF}' | gzip > NA12878.CTCF_known2.bound.bed.gz