Skip to content

Commit

Permalink
parser: support regex style begin-end anchors (^, $)
Browse files Browse the repository at this point in the history
These are the only anchors supported by Hyperscan, making it easier to
write handler patterns that should match exclusively at the beginning
or at the end of a stream.

The exception mapping has also been changed in `hexstring2regex`, as
anchors are processed during lexing, and it raises a different
exception than what is raised during parsing. Fortunately lark has a
nice exception base class to use.
  • Loading branch information
vlaci committed Jan 25, 2024
1 parent 3c61aba commit 86a90c4
Show file tree
Hide file tree
Showing 3 changed files with 23 additions and 5 deletions.
4 changes: 4 additions & 0 deletions docs/development.md
Original file line number Diff line number Diff line change
Expand Up @@ -443,6 +443,10 @@ PATTERNS = [
]
```

In addition, start-of-input and end-of-input anchors (`^` and `$`, as in regular
expressions) can be used to restrict a match to the beginning or the end of
the input file.

### DirectoryPatterns

The `DirectoryHandler` uses these patterns to identify the starting/main file of a given
Expand Down
16 changes: 14 additions & 2 deletions tests/test_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@
rb"\x00(\x01|\x02(\x03|\x04))\x05",
id="nested-alternative",
),
pytest.param("^ 00", rb"^\x00", id="start-anchor"),
pytest.param("00 $", rb"\x00$", id="end-anchor"),
],
)
def test_simple_convert(hex_string, expected_regex):
Expand Down Expand Up @@ -72,6 +74,16 @@ def test_single_comment():
assert regex == rb"\x01\x02"


def test_invalid_hexstring():
@pytest.mark.parametrize(
"pattern",
[
pytest.param("invalid hexstring", id="invalid"),
pytest.param("00 ^", id="start-anchor-at-end"),
pytest.param("00 ^ 01", id="start-anchor-at-middle"),
pytest.param("$ 00", id="end-anchor-at-start"),
pytest.param("00 $ 01", id="end-anchor-at-middle"),
],
)
def test_invalid_hexstring(pattern):
with pytest.raises(InvalidHexString):
hexstring2regex("invalid hexstring")
hexstring2regex(pattern)
8 changes: 5 additions & 3 deletions unblob/parser.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import itertools

from lark.exceptions import UnexpectedCharacters
from lark.exceptions import UnexpectedInput
from lark.lark import Lark
from lark.visitors import Transformer

Expand All @@ -16,7 +16,7 @@
%ignore COMMENT
start: item+
start: START_ANCHOR? item+ END_ANCHOR?
item: LITERAL -> literal
| WILDCARD -> wildcard
Expand All @@ -29,6 +29,8 @@
alternative: "(" item+ (ALTERNATIVE_SEPARATOR item+)+ ")"
ALTERNATIVE_SEPARATOR: "|"
LITERAL: HEXDIGIT HEXDIGIT
START_ANCHOR: "^"
END_ANCHOR: "$"
WILDCARD: "??"
FIRSTNIBLE: "?" HEXDIGIT
SECONDNIBLE: HEXDIGIT "?"
Expand Down Expand Up @@ -89,6 +91,6 @@ class InvalidHexString(ValueError):
def hexstring2regex(hexastr):
try:
parsed = _hex_string_parser.parse(hexastr)
except UnexpectedCharacters as e:
except UnexpectedInput as e:
raise InvalidHexString(str(e)) from e
return _HexStringToRegex().transform(parsed)

0 comments on commit 86a90c4

Please sign in to comment.