From a7b95618d62a23386c5ccfabf41b0583474a3959 Mon Sep 17 00:00:00 2001 From: Stephanie LE GRAS Date: Mon, 6 Nov 2023 23:13:37 +0100 Subject: [PATCH] edit picard command line --- 2023/ebaiin1/chip-seq/hands-on/hands-on.Rmd | 100 ++++++++----- 2023/ebaiin1/chip-seq/hands-on/hands-on.html | 147 +++++++++++-------- 2 files changed, 154 insertions(+), 93 deletions(-) diff --git a/2023/ebaiin1/chip-seq/hands-on/hands-on.Rmd b/2023/ebaiin1/chip-seq/hands-on/hands-on.Rmd index 13f37de..abe1700 100644 --- a/2023/ebaiin1/chip-seq/hands-on/hands-on.Rmd +++ b/2023/ebaiin1/chip-seq/hands-on/hands-on.Rmd @@ -24,8 +24,6 @@ bibliography: references.bib knitr::opts_chunk$set(echo = TRUE, root.dir="~/Documents/Formations/2023/EBAIIn1/EBAII/2023/ebaiin1/chip-seq/hands-on") ``` -$^1$ GenomEast platform, IGBMC - # - Introduction ## - Goal @@ -83,8 +81,7 @@ SRA stores sequences in a FASTQ format. # - Connect to the server and set up your environment ## - Sign in to [Jupyterhub](https://jupyterhub.cluster.france-bioinformatique.fr) and open a Terminal 1. Go to [Jupyterhub](https://jupyterhub.cluster.france-bioinformatique.fr) -2. select reservation: **2325_ebaii_0**, CPUS: **4** and Memory (in GB): **10** -![alt text][jupebai] +2. select CPUS: **4** and Memory (in GB): **10** 3. Click on "Start" 4. In the launcher, click on "Terminal" in "Other" section. You should be in your home directory by default. Check it: ```{bash eval=FALSE} @@ -242,7 +239,9 @@ bowtie-build 5. Build the index for bowtie ```{bash eval=FALSE} ## Creating genome index : provide the path to the genome file and the name to give to the index (Escherichia_coli_K12) -bowtie-build ../../data/Escherichia_coli_K12.fasta Escherichia_coli_K12 +bowtie-build \ + ../../data/Escherichia_coli_K12.fasta \ + Escherichia_coli_K12 ``` 6. Go back to upper directory i.e 02-Mapping ```{bash eval=FALSE} @@ -336,7 +335,6 @@ cd /shared/projects//EBAII2023_chipseq/02-Mapping/bam * INPUT: input file name to mark for duplicate reads * OUTPUT: output file name * METRICS: file with duplicates marking statistics - * VALIDATION_STRINGENCY: Validation stringency for all SAM files read by picard. ```{bash eval=FALSE} ## Load picard @@ -344,18 +342,16 @@ module load picard/2.23.5 ## Run picard picard MarkDuplicates \ -CREATE_INDEX=true \ -INPUT=SRR576933.bam \ -OUTPUT=Marked_SRR576933.bam \ -METRICS_FILE=metrics_SRR576933.txt \ -VALIDATION_STRINGENCY=STRICT + -CREATE_INDEX true \ + -INPUT SRR576933.bam \ + -OUTPUT Marked_SRR576933.bam \ + -METRICS_FILE metrics_SRR576933.txt picard MarkDuplicates \ -CREATE_INDEX=true \ -INPUT=SRR576938.bam \ -OUTPUT=Marked_SRR576938.bam \ -METRICS_FILE=metrics_SRR576938.txt \ -VALIDATION_STRINGENCY=STRICT + -CREATE_INDEX true \ + -INPUT SRR576938.bam \ + -OUTPUT Marked_SRR576938.bam \ + -METRICS_FILE metrics_SRR576938.txt ``` To determine the number of duplicated reads marked by Picard, we can run the `samtools flagstat` command: @@ -396,7 +392,12 @@ cd 03-ChIPQualityControls ## Load deeptools in your environment module load deeptools/3.5.0 ## Run deeptools fingerprint -plotFingerprint --numberOfSamples 10000 -b ../02-Mapping/bam/SRR576933.bam ../02-Mapping/bam/SRR576934.bam ../02-Mapping/bam/SRR576938.bam -plot fingerprint_10000.png +plotFingerprint \ + --numberOfSamples 10000 \ + -b ../02-Mapping/bam/SRR576933.bam \ + ../02-Mapping/bam/SRR576934.bam \ + ../02-Mapping/bam/SRR576938.bam \ + -plot fingerprint_10000.png ``` 4. If plotFingerprint takes ages to run. Take the file that has already been prepared for the training. ```{bash eval=FALSE} @@ -496,8 +497,13 @@ Your directory structure should be like this: * --ignoreDuplicates: reads that have the same orientation and start position will be considered only once ```{bash eval=FALSE} bamCoverage --bam ../02-Mapping/bam/Marked_SRR576933.bam \ ---outFileName SRR576933_nodup.bw --outFileFormat bigwig --effectiveGenomeSize 4639675 \ ---normalizeUsing RPGC --skipNonCoveredRegions --extendReads 200 --ignoreDuplicates + --outFileName SRR576933_nodup.bw \ + --outFileFormat bigwig \ + --effectiveGenomeSize 4639675 \ + --normalizeUsing RPGC \ + --skipNonCoveredRegions \ + --extendReads 200 \ + --ignoreDuplicates ``` 5. Do it for the replicate and the control. @@ -562,10 +568,16 @@ This prints the help of the program. * &> MACS.out will output the verbosity (=information) in the file MACS.out ```{bash eval=FALSE} -macs2 callpeak -t ../../02-Mapping/bam/SRR576933.bam \ --c ../../02-Mapping/bam/SRR576938.bam --format BAM \ ---gsize 4639675 --name 'FNR_Anaerobic_A' --bw 400 \ ---fix-bimodal -p 1e-2 &> repA_MACS.out +macs2 callpeak \ + -t ../../02-Mapping/bam/SRR576933.bam \ + -c ../../02-Mapping/bam/SRR576938.bam \ + --format BAM \ + --gsize 4639675 \ + --name 'FNR_Anaerobic_A' \ + --bw 400 \ + --fix-bimodal \ + -p 1e-2 \ + &> repA_MACS.out ``` 3. Run macs2 for replicate A, then go to repB directory and run macs2 for replicate B by changing the treatment file (-t), the output file name (-n) and the log file (&>), this should take a few minutes each. @@ -579,10 +591,17 @@ mkdir pool cd pool # Run macs2 for pooled replicates -macs2 callpeak -t ../../02-Mapping/bam/SRR576933.bam ../../02-Mapping/bam/SRR576934.bam \ --c ../../02-Mapping/bam/SRR576938.bam --format BAM \ ---gsize 4639675 --name 'FNR_Anaerobic_pool' --bw 400 \ ---fix-bimodal -p 1e-2 &> pool_MACS.out +macs2 callpeak \ + -t ../../02-Mapping/bam/SRR576933.bam \ + ../../02-Mapping/bam/SRR576934.bam \ + -c ../../02-Mapping/bam/SRR576938.bam \ + --format BAM \ + --gsize 4639675 \ + --name 'FNR_Anaerobic_pool' \ + --bw 400 \ + --fix-bimodal \ + -p 1e-2 \ + &> pool_MACS.out ``` ## - Analyzing MACS results @@ -639,10 +658,13 @@ idr --help 3. Run idr ```{bash eval=FALSE} -idr --samples ../replicates/FNR_Anaerobic_A_peaks.narrowPeak ../replicates/FNR_Anaerobic_B_peaks.narrowPeak \ ---peak-list ../pool/FNR_Anaerobic_pool_peaks.narrowPeak \ ---input-file-type narrowPeak --output-file FNR_anaerobic_idr_peaks.bed \ ---plot +idr \ + --samples ../replicates/FNR_Anaerobic_A_peaks.narrowPeak \ + ../replicates/FNR_Anaerobic_B_peaks.narrowPeak \ + --peak-list ../pool/FNR_Anaerobic_pool_peaks.narrowPeak \ + --input-file-type narrowPeak \ + --output-file FNR_anaerobic_idr_peaks.bed \ + --plot ``` 4. Remove IDR and MACS2 from your environment and go back to working home directory (i.e /shared/projects//EBAII2023_chipseq) @@ -712,8 +734,10 @@ samtools faidx ../data/Escherichia_coli_K12.fasta ## First load bedtools module load bedtools/2.30.0 ## Extract fasta sequence from genomic coordinate of peaks -bedtools getfasta -fi ../data/Escherichia_coli_K12.fasta \ --bed ../05-PeakCalling/idr/FNR_anaerobic_idr_peaks.bed -fo FNR_anaerobic_idr_peaks.fa +bedtools getfasta \ + -fi ../data/Escherichia_coli_K12.fasta \ + -bed ../05-PeakCalling/idr/FNR_anaerobic_idr_peaks.bed \ + -fo FNR_anaerobic_idr_peaks.fa ``` 4. Download the file FNR_anaerobic_idr_peaks.fa on your computer @@ -737,12 +761,18 @@ bedtools getfasta -fi ../data/Escherichia_coli_K12.fasta \ ## - OPTIONAL : Motif discovery with RSAT (short peaks) 1. Restrict the dataset to the summit of the peaks +/- 100bp using bedtools slop. Using bedtools slop to extend genomic coordinates allow not to go beyond chromosome boundaries as the user give the size of chromosomes as input (see fai file). ```{bash eval=FALSE} -bedtools slop -b 100 -i ../05-PeakCalling/replicates/FNR_Anaerobic_A_summits.bed -g ../data/Escherichia_coli_K12.fasta.fai > FNR_Anaerobic_A_summits+-100.bed +bedtools slop \ + -b 100 \ + -i ../05-PeakCalling/replicates/FNR_Anaerobic_A_summits.bed \ + -g ../data/Escherichia_coli_K12.fasta.fai \ + > FNR_Anaerobic_A_summits+-100.bed ``` 2. Extract the sequences for this BED file ```{bash eval=FALSE} ## Extract fasta sequence from genomic coordinate of peaks -bedtools getfasta -fi ../data/Escherichia_coli_K12.fasta -bed FNR_Anaerobic_A_summits+-100.bed -fo FNR_Anaerobic_A_summits+-100.fa +bedtools getfasta \ + -fi ../data/Escherichia_coli_K12.fasta -bed FNR_Anaerobic_A_summits+-100.bed \ + -fo FNR_Anaerobic_A_summits+-100.fa ## Compress the genome file as we won't need it anymore gzip ../data/Escherichia_coli_K12.fasta diff --git a/2023/ebaiin1/chip-seq/hands-on/hands-on.html b/2023/ebaiin1/chip-seq/hands-on/hands-on.html index b733e6c..a600350 100644 --- a/2023/ebaiin1/chip-seq/hands-on/hands-on.html +++ b/2023/ebaiin1/chip-seq/hands-on/hands-on.html @@ -1517,7 +1517,6 @@

2023 Novembre 7-9

-

\(^1\) GenomEast platform, IGBMC

1 - Introduction

@@ -1642,8 +1641,8 @@

3.1 - Sign in to
  • Go to Jupyterhub
  • -
  • select reservation: 2325_ebaii_0, CPUS: -4 and Memory (in GB): 10 alt text
  • +
  • select CPUS: 4 and Memory (in GB): +10
  • Click on “Start”
  • In the launcher, click on “Terminal” in “Other” section. You should be in your home directory by default. Check it:
  • @@ -1834,7 +1833,9 @@

    5.3 - Prepare the index
  • Build the index for bowtie
  • ## Creating genome index : provide the path to the genome file and the name to give to the index (Escherichia_coli_K12)
    -bowtie-build ../../data/Escherichia_coli_K12.fasta Escherichia_coli_K12
    +bowtie-build \ + ../../data/Escherichia_coli_K12.fasta \ + Escherichia_coli_K12
    1. Go back to upper directory i.e 02-Mapping
    @@ -1959,26 +1960,22 @@

    6 - Estimating the number
  • INPUT: input file name to mark for duplicate reads
  • OUTPUT: output file name
  • METRICS: file with duplicates marking statistics
  • -
  • VALIDATION_STRINGENCY: Validation stringency for all SAM files read -by picard.
  • ## Load picard
     module load picard/2.23.5
     
     ## Run picard
     picard MarkDuplicates \
    -CREATE_INDEX=true \
    -INPUT=SRR576933.bam \
    -OUTPUT=Marked_SRR576933.bam \
    -METRICS_FILE=metrics_SRR576933.txt \
    -VALIDATION_STRINGENCY=STRICT
    +  -CREATE_INDEX true \
    +  -INPUT SRR576933.bam \
    +  -OUTPUT Marked_SRR576933.bam \
    +  -METRICS_FILE metrics_SRR576933.txt
     
     picard MarkDuplicates \
    -CREATE_INDEX=true \
    -INPUT=SRR576938.bam \
    -OUTPUT=Marked_SRR576938.bam \
    -METRICS_FILE=metrics_SRR576938.txt \
    -VALIDATION_STRINGENCY=STRICT
    + -CREATE_INDEX true \ + -INPUT SRR576938.bam \ + -OUTPUT Marked_SRR576938.bam \ + -METRICS_FILE metrics_SRR576938.txt

    To determine the number of duplicated reads marked by Picard, we can run the samtools flagstat command:

    ## Add samtools to your environment
    @@ -2027,7 +2024,12 @@ 

    7.1 - Plot the Lorenz
    ## Load deeptools in your environment
     module load deeptools/3.5.0
     ## Run deeptools fingerprint
    -plotFingerprint --numberOfSamples 10000 -b ../02-Mapping/bam/SRR576933.bam ../02-Mapping/bam/SRR576934.bam ../02-Mapping/bam/SRR576938.bam -plot fingerprint_10000.png
    +plotFingerprint \ + --numberOfSamples 10000 \ + -b ../02-Mapping/bam/SRR576933.bam \ + ../02-Mapping/bam/SRR576934.bam \ + ../02-Mapping/bam/SRR576938.bam \ + -plot fingerprint_10000.png

    1. If plotFingerprint takes ages to run. Take the file that has already been prepared for the training.
    2. @@ -2161,8 +2163,13 @@

      8.3 - Viewing scaled position will be considered only once
      bamCoverage --bam ../02-Mapping/bam/Marked_SRR576933.bam \
      ---outFileName SRR576933_nodup.bw --outFileFormat bigwig --effectiveGenomeSize 4639675 \
      ---normalizeUsing RPGC --skipNonCoveredRegions --extendReads 200 --ignoreDuplicates
      + --outFileName SRR576933_nodup.bw \ + --outFileFormat bigwig \ + --effectiveGenomeSize 4639675 \ + --normalizeUsing RPGC \ + --skipNonCoveredRegions \ + --extendReads 200 \ + --ignoreDuplicates
      1. Do it for the replicate and the control.
      2. Download the three bigwig files you have just generated
      3. @@ -2274,10 +2281,16 @@

        9.2 - Calling the
      4. &> MACS.out will output the verbosity (=information) in the file MACS.out
      5. -
        macs2 callpeak -t ../../02-Mapping/bam/SRR576933.bam \
        --c ../../02-Mapping/bam/SRR576938.bam --format BAM \
        ---gsize 4639675 --name 'FNR_Anaerobic_A' --bw 400 \
        ---fix-bimodal -p 1e-2 &> repA_MACS.out
        +
        macs2 callpeak \
        +  -t ../../02-Mapping/bam/SRR576933.bam \
        +  -c ../../02-Mapping/bam/SRR576938.bam \
        +  --format BAM \
        +  --gsize 4639675 \
        +  --name 'FNR_Anaerobic_A' \
        +  --bw 400 \
        +  --fix-bimodal \
        +  -p 1e-2 \
        +  &> repA_MACS.out
        1. Run macs2 for replicate A, then go to repB directory and run macs2 for replicate B by changing the treatment file (-t), the output @@ -2294,10 +2307,17 @@

          9.2 - Calling the cd pool # Run macs2 for pooled replicates -macs2 callpeak -t ../../02-Mapping/bam/SRR576933.bam ../../02-Mapping/bam/SRR576934.bam \ --c ../../02-Mapping/bam/SRR576938.bam --format BAM \ ---gsize 4639675 --name 'FNR_Anaerobic_pool' --bw 400 \ ---fix-bimodal -p 1e-2 &> pool_MACS.out +macs2 callpeak \ + -t ../../02-Mapping/bam/SRR576933.bam \ + ../../02-Mapping/bam/SRR576934.bam \ + -c ../../02-Mapping/bam/SRR576938.bam \ + --format BAM \ + --gsize 4639675 \ + --name 'FNR_Anaerobic_pool' \ + --bw 400 \ + --fix-bimodal \ + -p 1e-2 \ + &> pool_MACS.out

    9.3 - Analyzing MACS @@ -2362,10 +2382,13 @@

    9.4 - Combine replicate
    1. Run idr
    -
    idr --samples ../replicates/FNR_Anaerobic_A_peaks.narrowPeak ../replicates/FNR_Anaerobic_B_peaks.narrowPeak \
    ---peak-list ../pool/FNR_Anaerobic_pool_peaks.narrowPeak \
    ---input-file-type narrowPeak --output-file FNR_anaerobic_idr_peaks.bed \
    ---plot
    +
    idr \
    +  --samples ../replicates/FNR_Anaerobic_A_peaks.narrowPeak \
    +    ../replicates/FNR_Anaerobic_B_peaks.narrowPeak \
    +  --peak-list ../pool/FNR_Anaerobic_pool_peaks.narrowPeak \
    +  --input-file-type narrowPeak \
    +  --output-file FNR_anaerobic_idr_peaks.bed \
    +  --plot
    1. Remove IDR and MACS2 from your environment and go back to working home directory (i.e @@ -2441,8 +2464,10 @@

      10.1 - Retrieve the peak ## First load bedtools module load bedtools/2.30.0 ## Extract fasta sequence from genomic coordinate of peaks -bedtools getfasta -fi ../data/Escherichia_coli_K12.fasta \ --bed ../05-PeakCalling/idr/FNR_anaerobic_idr_peaks.bed -fo FNR_anaerobic_idr_peaks.fa +bedtools getfasta \ + -fi ../data/Escherichia_coli_K12.fasta \ + -bed ../05-PeakCalling/idr/FNR_anaerobic_idr_peaks.bed \ + -fo FNR_anaerobic_idr_peaks.fa
      1. Download the file FNR_anaerobic_idr_peaks.fa on your computer
      @@ -2510,12 +2535,18 @@

      10.3 - OPTIONAL : Motif not to go beyond chromosome boundaries as the user give the size of chromosomes as input (see fai file).

    -
    bedtools slop -b 100 -i ../05-PeakCalling/replicates/FNR_Anaerobic_A_summits.bed -g ../data/Escherichia_coli_K12.fasta.fai > FNR_Anaerobic_A_summits+-100.bed
    +
    bedtools slop \
    +  -b 100 \
    +  -i ../05-PeakCalling/replicates/FNR_Anaerobic_A_summits.bed \
    +  -g ../data/Escherichia_coli_K12.fasta.fai \
    +  > FNR_Anaerobic_A_summits+-100.bed
    1. Extract the sequences for this BED file
    ## Extract fasta sequence from genomic coordinate of peaks
    -bedtools getfasta -fi ../data/Escherichia_coli_K12.fasta -bed FNR_Anaerobic_A_summits+-100.bed -fo FNR_Anaerobic_A_summits+-100.fa
    +bedtools getfasta \
    +  -fi ../data/Escherichia_coli_K12.fasta -bed FNR_Anaerobic_A_summits+-100.bed \
    +  -fo FNR_Anaerobic_A_summits+-100.fa
     
     ## Compress the genome file as we won't need it anymore
     gzip ../data/Escherichia_coli_K12.fasta
    @@ -2809,8 +2840,8 @@

    12.2.5 - How does the # compute the density of peaks within the promoter regions tagMatrix = getTagMatrix(peaks.limb, windows=promoter) -
    ## >> preparing start_site regions by ... 2023-11-02 15:34:46
    -## >> preparing tag matrix...  2023-11-02 15:34:46
    +
    ## >> preparing start_site regions by ... 2023-11-06 23:07:56
    +## >> preparing tag matrix...  2023-11-06 23:07:56
    # plot the density
     tagHeatmap(tagMatrix, xlim=c(-5000, 5000), color="red")

    @@ -2825,32 +2856,32 @@

    12.3 - Functional peak files with the annotation file of the mouse genome. This function returns a complex object which contains all this information.

    peakAnno.forebrain = annotatePeak(peaks.forebrain, tssRegion=c(-3000, 3000), TxDb=txdb, annoDb="org.Mm.eg.db")
    -
    ## >> preparing features information...      2023-11-02 15:35:11 
    -## >> identifying nearest features...        2023-11-02 15:35:12 
    -## >> calculating distance from peak to TSS...   2023-11-02 15:35:12 
    -## >> assigning genomic annotation...        2023-11-02 15:35:12 
    -## >> adding gene annotation...          2023-11-02 15:35:19
    +
    ## >> preparing features information...      2023-11-06 23:08:21 
    +## >> identifying nearest features...        2023-11-06 23:08:21 
    +## >> calculating distance from peak to TSS...   2023-11-06 23:08:21 
    +## >> assigning genomic annotation...        2023-11-06 23:08:21 
    +## >> adding gene annotation...          2023-11-06 23:08:28
    ## 'select()' returned 1:many mapping between keys and columns
    -
    ## >> assigning chromosome lengths           2023-11-02 15:35:19 
    -## >> done...                    2023-11-02 15:35:19
    +
    ## >> assigning chromosome lengths           2023-11-06 23:08:28 
    +## >> done...                    2023-11-06 23:08:28
    peakAnno.midbrain = annotatePeak(peaks.midbrain, tssRegion=c(-3000, 3000), TxDb=txdb, annoDb="org.Mm.eg.db")
    -
    ## >> preparing features information...      2023-11-02 15:35:19 
    -## >> identifying nearest features...        2023-11-02 15:35:19 
    -## >> calculating distance from peak to TSS...   2023-11-02 15:35:19 
    -## >> assigning genomic annotation...        2023-11-02 15:35:19 
    -## >> adding gene annotation...          2023-11-02 15:35:20
    +
    ## >> preparing features information...      2023-11-06 23:08:28 
    +## >> identifying nearest features...        2023-11-06 23:08:28 
    +## >> calculating distance from peak to TSS...   2023-11-06 23:08:28 
    +## >> assigning genomic annotation...        2023-11-06 23:08:28 
    +## >> adding gene annotation...          2023-11-06 23:08:29
    ## 'select()' returned 1:1 mapping between keys and columns
    -
    ## >> assigning chromosome lengths           2023-11-02 15:35:20 
    -## >> done...                    2023-11-02 15:35:20
    +
    ## >> assigning chromosome lengths           2023-11-06 23:08:29 
    +## >> done...                    2023-11-06 23:08:29
    peakAnno.limb = annotatePeak(peaks.limb, tssRegion=c(-3000, 3000), TxDb=txdb, annoDb="org.Mm.eg.db")
    -
    ## >> preparing features information...      2023-11-02 15:35:20 
    -## >> identifying nearest features...        2023-11-02 15:35:20 
    -## >> calculating distance from peak to TSS...   2023-11-02 15:35:20 
    -## >> assigning genomic annotation...        2023-11-02 15:35:20 
    -## >> adding gene annotation...          2023-11-02 15:35:20
    +
    ## >> preparing features information...      2023-11-06 23:08:29 
    +## >> identifying nearest features...        2023-11-06 23:08:29 
    +## >> calculating distance from peak to TSS...   2023-11-06 23:08:29 
    +## >> assigning genomic annotation...        2023-11-06 23:08:29 
    +## >> adding gene annotation...          2023-11-06 23:08:30
    ## 'select()' returned 1:many mapping between keys and columns
    -
    ## >> assigning chromosome lengths           2023-11-02 15:35:21 
    -## >> done...                    2023-11-02 15:35:21
    +
    ## >> assigning chromosome lengths           2023-11-06 23:08:30 
    +## >> done...                    2023-11-06 23:08:30

    12.3.1 - genomic localization