Skip to content

Commit

Permalink
Merge pull request #18204 from martenson/tight-axt
Browse files (browse the repository at this point in the history)
[24.1] Tighten axt sniffer
  • Loading branch information
mvdbeek authored May 29, 2024
2 parents 8ed752a + 706ae5f commit f0d4f52
Show file tree
Hide file tree
Showing 4 changed files with 75 additions and 83 deletions.
2 changes: 1 addition & 1 deletion lib/galaxy/config/sample/datatypes_conf.xml.sample
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
<datatype extension="gfa2" auto_compressed_types="gz" type="galaxy.datatypes.text:Gfa2" mimetype="text/plain" display_in_upload="true"/>
<datatype extension="asn1" type="galaxy.datatypes.data:GenericAsn1" mimetype="text/plain" display_in_upload="true"/>
<datatype extension="asn1-binary" type="galaxy.datatypes.binary:GenericAsn1Binary" mimetype="application/octet-stream" display_in_upload="true"/>
<datatype extension="axt" type="galaxy.datatypes.sequence:Axt" display_in_upload="true" description="blastz pairwise alignment format. Each alignment block in an axt file contains three lines: a summary line and 2 sequence lines. Blocks are separated from one another by blank lines. The summary line contains chromosomal position and size information about the alignment. It consists of 9 required fields." description_url="https://wiki.galaxyproject.org/Learn/Datatypes#Axt"/>
<datatype extension="axt" type="galaxy.datatypes.sequence:Axt" display_in_upload="true" description="A pairwise alignment format." description_url="https://genome.ucsc.edu/goldenPath/help/axt.html"/>
<datatype extension="fli" type="galaxy.datatypes.tabular:FeatureLocationIndex" display_in_upload="false"/>
<datatype extension="bam" type="galaxy.datatypes.binary:Bam" mimetype="application/octet-stream" display_in_upload="true" description="A binary file compressed in the BGZF format with a '.bam' file extension." description_url="https://wiki.galaxyproject.org/Learn/Datatypes#BAM">
<converter file="bam_to_bai.xml" target_datatype="bai"/>
Expand Down
106 changes: 42 additions & 64 deletions lib/galaxy/datatypes/chain.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from galaxy.datatypes.sniff import (
build_sniff_from_prefix,
FilePrefix,
get_headers,
)
from galaxy.util import (
commaify,
Expand Down Expand Up @@ -91,41 +92,31 @@ def sniff_prefix(self, file_prefix: FilePrefix) -> bool:
>>> fname = get_test_fname( '1.chain' )
>>> Chain().sniff( fname )
True
>>> fname = get_test_fname( '2.chain' )
>>> Chain().sniff( fname )
True
>>>
"""
fh = file_prefix.string_io()
for line in fh:
line = line.strip()
if line: # first non-empty line
if line.startswith("chain"):
tokens = line.split()
if not (
len(tokens) in [12, 13]
and tokens[4] in self.strands
and tokens[9] in self.strands
and tokens[3].isdigit()
and tokens[5].isdigit()
and tokens[6].isdigit()
):
return False
prior_token_len = 0
for line in fh:
line = line.strip()
if line == "":
break
tokens = line.split()
if prior_token_len == 1:
return False
if len(tokens) not in [1, 3]:
return False
if not all(token.isdigit() for token in tokens):
return False
prior_token_len = len(tokens)
if prior_token_len == 1:
return True
else:
return False
return False
headers = get_headers(file_prefix, None, count=2, comment_designator="#")
if not (
len(headers) == 2
and len(headers[0]) in [12, 13]
and headers[0][0] == "chain"
and headers[0][1].isdecimal()
and headers[0][3].isdecimal()
and headers[0][4] in self.strands
and headers[0][5].isdecimal()
and headers[0][6].isdecimal()
and headers[0][8].isdecimal()
and headers[0][9] in self.strands
and headers[0][10].isdecimal()
and headers[0][11].isdecimal()
and headers[1][0].isdecimal()
and len(headers[1]) in [1, 3]
):
return False
else:
return True


@build_sniff_from_prefix
Expand Down Expand Up @@ -161,34 +152,21 @@ def sniff_prefix(self, file_prefix: FilePrefix) -> bool:
allowed_classes = ["fill", "gap"]
strands = ["+", "-"]

fh = file_prefix.string_io()
for line in fh:
line = line.strip()
if line: # first non-empty line
if line.startswith("net"):
tokens = line.split()
if not (len(tokens) == 3 and tokens[2].isdigit()):
return False
for line in fh:
if line[0] != " ": # children are indented one space
return False
line = line.strip()
if line == "":
break
tokens = line.split()
if not (
len(tokens) >= 7 # seven fixed fields
and len(tokens) <= 41 # plus seventeen optional name/value pairs
and tokens[0] in allowed_classes
and tokens[1].isdigit()
and tokens[2].isdigit()
and tokens[4] in strands
and tokens[5].isdigit()
and tokens[6].isdigit()
):
return False
else:
return True
else:
return False
return False
headers = get_headers(file_prefix, None, count=2, comment_designator="#")
if not (
len(headers) == 2
and len(headers[0]) == 3
and headers[0][0] == "net"
and headers[0][2].isdecimal()
and len(headers[1]) >= 7 # seven fixed fields
and len(headers[1]) <= 41 # plus seventeen optional name/value pairs
and headers[1][0] in allowed_classes
and headers[1][1].isdecimal()
and headers[1][2].isdecimal()
and headers[1][4] in strands
and headers[1][5].isdecimal()
and headers[1][6].isdecimal()
):
return False
else:
return True
40 changes: 22 additions & 18 deletions lib/galaxy/datatypes/sequence.py
Original file line number Diff line number Diff line change
Expand Up @@ -1205,26 +1205,30 @@ def sniff_prefix(self, file_prefix: FilePrefix) -> bool:
>>> fname = get_test_fname( 'alignment.lav' )
>>> Axt().sniff( fname )
False
>>> fname = get_test_fname( '2.chain' )
>>> Axt().sniff( fname )
False
"""
headers = get_headers(file_prefix, None)
if len(headers) < 4:
headers = get_headers(file_prefix, None, count=4, comment_designator="#")
if not (
len(headers) >= 3
and len(headers[0]) == 9
and headers[0][0] == "0"
and headers[0][2].isdecimal()
and headers[0][3].isdecimal()
and headers[0][5].isdecimal()
and headers[0][6].isdecimal()
and headers[0][7] in data.valid_strand
and headers[0][8].isdecimal()
and len(headers[1]) == 1
and len(headers[2]) == 1
):
return False
for hdr in headers:
if len(hdr) > 0 and hdr[0].startswith("##matrix=axt"):
return True
if len(hdr) > 0 and not hdr[0].startswith("#"):
if len(hdr) != 9:
return False
try:
for _ in (hdr[0], hdr[2], hdr[3], hdr[5], hdr[6], hdr[8]):
int(_)
except ValueError:
return False
if hdr[7] not in data.valid_strand:
return False
else:
return True
return False
# the optional fourth non-comment line has to be empty
if len(headers) == 4 and not headers[3] == []:
return False
else:
return True


@build_sniff_from_prefix
Expand Down
10 changes: 10 additions & 0 deletions lib/galaxy/datatypes/test/2.chain
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
##matrix=axtChain 16 91,-114,-31,-123,-114,100,-125,-31,-31,-125,100,-114,-123,-31,-114,91
##gapPenalties=axtChain O=400 E=30
chain 67224 chr22 50818468 + 26560645 26561468 chr19 61431566 - 54838449 54839272 1
823

chain 48985 chr22 50818468 + 26560497 26561116 chr19 61431566 + 29160089 29160708 2
619

chain 46902 chr22 50818468 + 19792341 19793000 chr19 61431566 + 59180700 59181359 3
659

0 comments on commit f0d4f52

Please sign in to comment.