This repository has been archived by the owner on Dec 18, 2023. It is now read-only.

Commit

Merge branch 'yaml_config' into 'master'
Automatically generate yaml config

See merge request !2
mdehollander committed Mar 13, 2017
2 parents (125b6fa + 628dbc9), commit 0b78dde
Showing 6 changed files with 100 additions and 43 deletions.
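
The substance of the change: the pipeline's config.json is replaced by a config.yaml that the new conf.py generates by scanning a data directory for FASTQ files. As a sketch, with a hypothetical paired-end sample sample-A, the generated file would look roughly like this (keys follow the defaults in make_config; each sample maps to a path list, which is why the Snakefile below switches to ["path"][0] and ["path"][1]):

project: My-Project
adapters_fasta: /data/ngs/adapters/contaminant_list.txt
pandaseq_overlap: '10'
pandaseq_quality: '25'
pandaseq_minlength: '100'
pandaseq_maxlength: '700'
forward_primer: CCTACGGGNGGCWGCAG
reverse_primer: GACTACHVGGGTATCTAATCC
silva_arb: /data/db/Silva/128/SSURef_NR99_128_SILVA_07_09_16_opt.arb
metadata: data/metadata.txt
data:
  sample-A:
    path:
    - /path/to/data/sample-A_R1.fastq
    - /path/to/data/sample-A_R2.fastq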
Snakefile (7 additions, 43 deletions)
@@ -1,6 +1,6 @@
from snakemake.utils import R

configfile: "config.json"
configfile: "config.yaml"

PROJECT = config["project"] + "/"

@@ -13,8 +13,8 @@ rule final:

rule unpack_and_rename:
    input:
-        forward = lambda wildcards: config["data"][wildcards.data]['forward'],
-        reverse = lambda wildcards: config["data"][wildcards.data]['reverse']
+        forward = lambda wildcards: config["data"][wildcards.data]["path"][0],
+        reverse = lambda wildcards: config["data"][wildcards.data]["path"][1]
    output:
        forward="{project}/gunzip/{data}_R1.fastq",
        reverse="{project}/gunzip/{data}_R2.fastq"
@@ -57,7 +57,7 @@ rule pandaseq:
    log: "{project}/pandaseq/{data}_pandaseq.stdout"
    threads: 1
    conda: "envs/pandaseq.yaml"
-    shell: "pandaseq -N -f {input.forward} -r {input.reverse} -p {params.forward_primer} -q {params.reverse_primer} -A rdp_mle -T {threads} -w {output.fasta} -g {log}"
+    shell: "pandaseq -N -f {input.forward} -r {input.reverse} -p {params.forward_primer} -q {params.reverse_primer} -T {threads} -w {output.fasta} -g {log}"

rule fastqc_pandaseq:
    input:
@@ -73,29 +73,6 @@ rule fastqc_pandaseq:
    shell: "fastqc -q -t {threads} --contaminants {params.adapters} --outdir {params.dir} {input.fastq} > {params.dir}/{log}"


-#Obsolete
-rule primer_matching:
-    input:
-        "{project}/pandaseq/{data}.fastq"
-    output:
-        "{project}/flexbar/{data}.fastq"
-    params:
-        prefix_forward="flexbar_{data}_forward",
-        prefix_reverse="flexbar_{data}_reverse"
-    log: "{project}/trim/flexbar_{data}.log"
-    run:
-        shell("/data/tools/flexbar/2.5/flexbar -t {params.prefix_forward} -b primers.fasta -r {input} --barcode-min-overlap 10 --barcode-threshold 3 --min-read-length 50 >> {log}")
-        shell("cat {params.prefix_forward}* | fastx_reverse_complement | /data/tools/flexbar/2.5/flexbar -t {params.prefix_reverse} -b primers.fasta --reads - --barcode-trim-end LEFT --barcode-min-overlap 10 --barcode-threshold 3 --min-read-length 50 --barcode-unassigned >> {log}")
-        shell("cat {params.prefix_reverse}* > {output}")
-
-#Obsolete
-rule fastq2fasta:
-    input:
-        fastq = "{project}/pandaseq/{data}.fastq"
-    output:
-        fasta = "{project}/pandaseq/{data}.fasta"
-    shell: "fastq_to_fasta -i {input.fastq} -o {output.fasta}"

# Combine per sample files to a single project file
rule mergefiles:
    input:
@@ -172,14 +149,6 @@ rule cluster_fast:
        cmd = "usearch"
        shell("{cmd} --cluster_fast {input} --usersort -centroids {output.otus} --id 0.97 -sizeout")

-# No longer supported since it is not in bioconda
-rule uparse:
-    input:
-        "{project}/{prog}/{ds}.sorted.minsize{minsize}.fasta"
-    output:
-        otus=protected("{project}/{prog}/{ds}.minsize{minsize}.uparse.fasta")
-    shell: "{USEARCH} -cluster_otus {input} -otus {output.otus} -relabel OTU_ -sizeout"


#
# Chimera checking
@@ -246,13 +215,6 @@ rule biom_otu:
#
# Taxonomy
#
-SINA_VERSION=config['sina_version']
-SILVADB = config['silva_db']
-SINA_MIN_SIM = config['sina_min_sim']
-SILVA_ARB = config['silva_arb']
-
-
-
rule sina_parallel_edgar:
    input:
        "{project}/{prog}/otus/{ds}.minsize{minsize}.{clmethod}.fasta"
@@ -261,12 +223,14 @@ rule sina_parallel_edgar:
        #aligncsv="{project}/{prog}/sina/{ds}.minsize{minsize}.{clmethod}.sina.{chunk}.align.csv",
        log="{project}/{prog}/sina/{ds}.minsize{minsize}.{clmethod}.sina.log",
        align="{project}/{prog}/sina/{ds}.minsize{minsize}.{clmethod}.sina.align",
+    params:
+        silva_arb = config["silva_arb"]
    log: "{project}/{prog}/sina/{ds}.minsize{minsize}.{clmethod}.sina.log"
    priority: -1
    threads: 8
    # TODO: turn is set to all to get classification. Reverse the reads in an earlier stage!
    conda: "envs/sina.yaml"
-    shell: "cat {input} | parallel --block 1000K -j{threads} --recstart '>' --pipe sina --log-file {log} -i /dev/stdin -o {output.align} --outtype fasta --meta-fmt csv --ptdb {SILVA_ARB} --overhang remove --turn all --search --search-db {SILVA_ARB} --search-min-sim 0.95 --search-no-fast --search-kmer-len 10 --lca-fields tax_slv"
+    shell: "cat {input} | parallel --block 1000K -j{threads} --recstart '>' --pipe sina --log-file {log} -i /dev/stdin -o {output.align} --outtype fasta --meta-fmt csv --ptdb {params.silva_arb} --overhang remove --turn all --search --search-db {params.silva_arb} --search-min-sim 0.95 --search-no-fast --search-kmer-len 10 --lca-fields tax_slv"

rule sina_get_taxonomy_from_logfile_edgar:
    input:
conf.py (93 additions, 0 deletions)
@@ -0,0 +1,93 @@
import logging
import multiprocessing
import re
import os
import tempfile
import yaml
from collections import OrderedDict

# Adapted from: https://github.com/pnnl/atlas/blob/master/atlas/conf.py

# http://stackoverflow.com/a/3675423
def replace_last(source_string, replace_what, replace_with):
    head, _sep, tail = source_string.rpartition(replace_what)
    if _sep == '':
        return tail
    else:
        return head + replace_with + tail

def get_sample_files(path):
    samples = OrderedDict()
    seen = set()
    for dir_name, sub_dirs, files in os.walk(path):
        for fname in files:

            if ".fastq" in fname or ".fq" in fname:

                sample_id = fname.partition(".fastq")[0]
                if ".fq" in sample_id:
                    sample_id = fname.partition(".fq")[0]

                sample_id = sample_id.replace("_R1", "").replace("_r1", "").replace("_R2", "").replace("_r2", "")
                sample_id = re.sub("_1$", "", sample_id)
                sample_id = re.sub("_2$", "", sample_id)
                sample_id = sample_id.replace("_", "-").replace(" ", "-")

                fq_path = os.path.join(dir_name, fname)
                fastq_paths = [fq_path]

                if fq_path in seen: continue

                if "_R1" in fname or "_r1" in fname or "_1" in fname:
                    fname = replace_last(fname, "_1.", "_2.")
                    r2_path = os.path.join(dir_name, fname.replace("_R1", "_R2").replace("_r1", "_r2"))
                    if not r2_path == fq_path:
                        seen.add(r2_path)
                        fastq_paths.append(r2_path)

                if "_R2" in fname or "_r2" in fname or "_2" in fname:
                    fname = replace_last(fname, "_2.", "_1.")
                    r1_path = os.path.join(dir_name, fname.replace("_R2", "_R1").replace("_r2", "_r1"))
                    if not r1_path == fq_path:
                        seen.add(r1_path)
                        fastq_paths.insert(0, r1_path)

                if sample_id in samples:
                    logging.warn("Duplicate sample %s was found after renaming; skipping..." % sample_id)
                    continue
                samples[sample_id] = {'path': fastq_paths}
    return samples


def make_config(config, path):
    """Write the file `config` and complete the sample names and paths for all files in `path`."""
    represent_dict_order = lambda self, data: self.represent_mapping('tag:yaml.org,2002:map', data.items())
    yaml.add_representer(OrderedDict, represent_dict_order)
    path = os.path.realpath(path)

    conf = OrderedDict()
    samples = get_sample_files(path)

    logging.info("Found %d samples under %s" % (len(samples), path))
    conf["project"] = "My-Project"
    conf["adapters_fasta"] = "/data/ngs/adapters/contaminant_list.txt"
    conf["pandaseq_overlap"] = "10"
    conf["pandaseq_quality"] = "25"
    conf["pandaseq_minlength"] = "100"
    conf["pandaseq_maxlength"] = "700"

    conf["forward_primer"] = "CCTACGGGNGGCWGCAG"
    conf["reverse_primer"] = "GACTACHVGGGTATCTAATCC"

    conf["silva_arb"] = "/data/db/Silva/128/SSURef_NR99_128_SILVA_07_09_16_opt.arb"
    conf["metadata"] = "data/metadata.txt"

    conf["data"] = samples

    with open(config, "w") as f:
        print(yaml.dump(conf, default_flow_style=False), file=f)
    logging.info("Configuration file written to %s" % config)

if __name__ == "__main__":
    make_config(config="config.yaml", path="data")
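
Beyond the __main__ entry point, make_config can also be called from Python; a minimal usage sketch (the data/ layout and sample name here are hypothetical):

# Hypothetical input layout:
#   data/sample-A_R1.fastq
#   data/sample-A_R2.fastq
from conf import make_config

# Scans data/, pairs the _R1/_R2 reads into a single "sample-A" entry,
# and writes config.yaml, which the Snakefile loads via its configfile directive.
make_config(config="config.yaml", path="data")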

Binary file removed uparse_scripts/die.pyc
Binary file removed uparse_scripts/fasta.pyc
Binary file removed uparse_scripts/progress.pyc
Binary file removed uparse_scripts/uc.pyc
