-
Notifications
You must be signed in to change notification settings - Fork 29
/
umi.wdl
107 lines (92 loc) · 4.65 KB
/
umi.wdl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
version 1.0
# Copyright (c) 2022 Leiden University Medical Center
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
task BamReadNameToUmiTag {
# This task processes a bam file with reads that have been extracted with
# umi-tools extract. The UMI is extracted from the read name again and put
# in the bam file again with umiTag (default RX)
input {
File inputBam
String outputPath = "output.bam"
String umiTag = "RX"
String separatorChar = "_"
String memory = "2GiB"
Int timeMinutes = 1 + ceil(size([inputBam], "GiB") * 10)
String dockerImage = "quay.io/biocontainers/pysam:0.17.0--py39h051187c_0"
}
String bamIndexPath = sub(select_first([outputPath]), "\.bam$", ".bai")
command <<<
python <<CODE
import pysam
import sys
import os
from typing import Tuple
def split_umi_from_name(name, separator_char = "_") -> Tuple[str, str]:
id_and_rest = name.split(maxsplit=1)
id = id_and_rest[0]
# If there was no whitespace id_and_rest will have length 1
other_parts = id_and_rest[1] if len(id_and_rest) == 2 else ""
underscore_index = id.rfind(separator_char)
umi = id[underscore_index + 1:]
new_id = id[:underscore_index]
if other_parts:
return " ".join([new_id, other_parts]), umi
return new_id, umi
def annotate_umis(in_file, out_file, bam_tag="RX", separator_char = "_"):
in_bam = pysam.AlignmentFile(in_file, "rb")
os.makedirs(os.path.dirname(out_file), exist_ok=True)
out_bam = pysam.AlignmentFile(out_file, "wb", template=in_bam)
# Encode bam_tag as bytes. Otherwise pysam converts it to bytes anyway.
encoded_bam_tag = bam_tag.encode('ascii')
for segment in in_bam: # type: pysam.AlignedSegment
new_name, umi = split_umi_from_name(segment.query_name, separator_char)
segment.query_name = new_name
# Encode umi as ascii. Otherwise pysam encodes it to bytes anyway.
# Value type has to be a string though, otherwise pysam crashes.
segment.set_tag(encoded_bam_tag, umi.encode('ascii'), value_type="Z")
out_bam.write(segment)
if __name__ == "__main__":
annotate_umis("~{inputBam}", "~{outputPath}", "~{umiTag}", "~{separatorChar}")
pysam.index("~{outputPath}", "~{bamIndexPath}", b=True)
CODE
>>>
output {
File outputBam = outputPath
File outputBamIndex = bamIndexPath
}
runtime {
memory: memory
time_minutes: timeMinutes
docker: dockerImage
}
parameter_meta {
# inputs
inputBam: {description: "The input SAM file.", category: "required"}
outputPath: {description: "Output directory path + output file.", category: "common"}
umiTag: {description: "The tag used for UMIs in the output BAM file.", category: "common"}
separatorChar: {description: "Character used to separate the UMIs from the read name.", category: "common"}
memory: {description: "The amount of memory available to the job.", category: "advanced"}
timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"}
# outputs
outputBam: {description: "Sorted BAM file."}
outputBamIndex: {description: "Sorted BAM file index."}
}
}