Skip to content

Commit

Permalink
fix(handlers): add support for unix-compatible (aka v7) tar files.
Browse files Browse the repository at this point in the history
v7 tar headers do not have the 'ustar' magic that we match on for modern
tar files. In order to match on those v7 archives, we build a regular
expression that matches on the mode, uid, gid, mtime, size fields given their
properties:

- fixed size (e.g. 8 for mode)
- optionally prepended by whitespaces
- suffixed by null bytes (null terminated)
- ASCII encoded octal digits (0x30 to 0x37)

In order to build a pattern that can be handled by hyperscan without
using a notation such as '[\w]{1,99}' for path name (see below for
detailed explanation), we rely on a utility function to build a regular
expression to match all possible combinations using the OR (|) operator.

Note: hyperscan will yield a "Pattern is too large" exception when
trying to use '{1,99}' notation. Even though we found out that using
'.*' works, it would have a significant performance impact on pattern
matching. That's why we decided to go with the OR operator approach,
enumerating the possible combinations.

See https://intel.github.io/hyperscan/dev-reference/compilation.html for
more information about this.
  • Loading branch information
qkaiser committed Oct 23, 2023
1 parent aee811b commit 96a4aff
Show file tree
Hide file tree
Showing 9 changed files with 174 additions and 14 deletions.
80 changes: 77 additions & 3 deletions tests/handlers/archive/test_tar.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,11 @@
from helpers import unhex

from unblob.file_utils import File
from unblob.handlers.archive.tar import TarHandler, _get_tar_end_offset
from unblob.handlers.archive.tar import (
TarUnixHandler,
TarUstarHandler,
_get_tar_end_offset,
)

GNU_TAR_CONTENTS = unhex(
"""\
Expand Down Expand Up @@ -120,6 +124,58 @@
"""
)

# Hexdump fixture of a v7 (unix-compatible) tar archive, decoded by unhex().
# It holds three members: the "fruits/" directory, "fruits/apple.txt"
# (content "apple\n") and "fruits/cherry.txt" (content "cherry\n").
# The header bytes at offset 257, where modern tar files carry the "ustar"
# magic, are all zero here. The dump ends at offset 0x2800.
UNIX_TAR_CONTENT = unhex(
"""\
00000000 66 72 75 69 74 73 2f 00 00 00 00 00 00 00 00 00 |fruits/.........|
00000010 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
*
00000060 00 00 00 00 30 30 30 30 37 37 35 00 30 30 30 31 |....0000775.0001|
00000070 37 35 30 00 30 30 30 31 37 35 30 00 30 30 30 30 |750.0001750.0000|
00000080 30 30 30 30 30 30 30 00 31 34 35 30 34 32 36 32 |0000000.14504262|
00000090 30 37 37 00 30 30 37 34 30 34 00 20 35 00 00 00 |077.007404. 5...|
000000a0 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
*
00000140 00 00 00 00 00 00 00 00 00 30 30 30 30 30 30 30 |.........0000000|
00000150 00 30 30 30 30 30 30 30 00 00 00 00 00 00 00 00 |.0000000........|
00000160 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
*
00000200 66 72 75 69 74 73 2f 61 70 70 6c 65 2e 74 78 74 |fruits/apple.txt|
00000210 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
*
00000260 00 00 00 00 30 30 30 30 36 36 34 00 30 30 30 31 |....0000664.0001|
00000270 37 35 30 00 30 30 30 31 37 35 30 00 30 30 30 30 |750.0001750.0000|
00000280 30 30 30 30 30 30 36 00 31 34 35 30 34 32 36 32 |0000006.14504262|
00000290 30 37 31 00 30 31 31 31 35 34 00 20 00 00 00 00 |071.011154. ....|
000002a0 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
*
00000340 00 00 00 00 00 00 00 00 00 30 30 30 30 30 30 30 |.........0000000|
00000350 00 30 30 30 30 30 30 30 00 00 00 00 00 00 00 00 |.0000000........|
00000360 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
*
00000400 61 70 70 6c 65 0a 00 00 00 00 00 00 00 00 00 00 |apple...........|
00000410 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
*
00000600 66 72 75 69 74 73 2f 63 68 65 72 72 79 2e 74 78 |fruits/cherry.tx|
00000610 74 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |t...............|
00000620 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
*
00000660 00 00 00 00 30 30 30 30 36 36 34 00 30 30 30 31 |....0000664.0001|
00000670 37 35 30 00 30 30 30 31 37 35 30 00 30 30 30 30 |750.0001750.0000|
00000680 30 30 30 30 30 30 37 00 31 34 35 30 34 32 36 32 |0000007.14504262|
00000690 30 37 37 00 30 31 31 33 35 36 00 20 00 00 00 00 |077.011356. ....|
000006a0 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
*
00000740 00 00 00 00 00 00 00 00 00 30 30 30 30 30 30 30 |.........0000000|
00000750 00 30 30 30 30 30 30 30 00 00 00 00 00 00 00 00 |.0000000........|
00000760 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
*
00000800 63 68 65 72 72 79 0a 00 00 00 00 00 00 00 00 00 |cherry..........|
00000810 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
*
00002800
"""
)

PADDING_TO_DEFAULT_BLOCKING_FACTOR = unhex(
"""\
00000400 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
Expand Down Expand Up @@ -303,12 +359,30 @@ def test_different_blocking_factor():
pytest.param(b"some prefix ", id="nonzero-prefix"),
],
)
def test_calculate_chunk(prefix):
def test_calculate_chunk_ustar(prefix):
tar_file = File.from_bytes(prefix + GNU_TAR_CONTENTS)
handler = TarHandler()
handler = TarUstarHandler()

chunk = handler.calculate_chunk(tar_file, len(prefix))

assert chunk is not None
assert chunk.start_offset == len(prefix)
assert chunk.end_offset == len(prefix) + len(GNU_TAR_CONTENTS)


@pytest.mark.parametrize(
    "prefix",
    [
        pytest.param(b"", id="zero-prefix"),
        pytest.param(b"some prefix ", id="nonzero-prefix"),
    ],
)
def test_calculate_chunk_unix(prefix):
    """Check that a v7 tar chunk is found with exact bounds after ``prefix``."""
    archive_start = len(prefix)
    expected_end = archive_start + len(UNIX_TAR_CONTENT)

    blob = File.from_bytes(prefix + UNIX_TAR_CONTENT)
    unix_handler = TarUnixHandler()

    # The handler is pointed at the first byte after the prefix.
    found = unix_handler.calculate_chunk(blob, archive_start)

    assert found is not None
    assert found.start_offset == archive_start
    assert found.end_offset == expected_end
3 changes: 3 additions & 0 deletions tests/integration/archive/tar/__input__/cherry.v7.tar
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
1 change: 0 additions & 1 deletion unblob/extractors/command.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,6 @@ def _make_extract_command(self, inpath: Path, outdir: Path):
raise InvalidCommandTemplate("Invalid template placeholder", t) from k
except ValueError as v:
raise InvalidCommandTemplate("The template is malformed", t) from v

return args

def get_dependencies(self) -> List[str]:
Expand Down
3 changes: 2 additions & 1 deletion unblob/handlers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,8 @@
arc.ARCHandler,
arj.ARJHandler,
cab.CABHandler,
tar.TarHandler,
tar.TarUstarHandler,
tar.TarUnixHandler,
cpio.PortableASCIIHandler,
cpio.PortableASCIIWithCRCHandler,
cpio.PortableOldASCIIHandler,
Expand Down
89 changes: 80 additions & 9 deletions unblob/handlers/archive/tar.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
ExtractResult,
File,
HexString,
Regex,
StructHandler,
ValidChunk,
)
Expand Down Expand Up @@ -98,17 +99,10 @@ def extract(self, inpath: Path, outdir: Path):
return ExtractResult(reports=tarfile.reports)


class TarHandler(StructHandler):
class _TarHandler(StructHandler):
NAME = "tar"

PATTERNS = [
HexString("75 73 74 61 72 20 20 00"),
HexString("75 73 74 61 72 00 30 30"),
]

# Since the magic is at 257, we have to subtract that from the match offset
# to get to the start of the file.
PATTERN_MATCH_OFFSET = -MAGIC_OFFSET
PATTERNS = []

C_DEFINITIONS = r"""
typedef struct posix_header
Expand Down Expand Up @@ -146,3 +140,80 @@ def calculate_chunk(self, file: File, start_offset: int) -> Optional[ValidChunk]
if end_offset == -1:
return None
return ValidChunk(start_offset=start_offset, end_offset=end_offset)


class TarUstarHandler(_TarHandler):
    # The patterns below match the magic field, which sits MAGIC_OFFSET (257)
    # bytes into the header. Shifting the match back by that amount makes the
    # chunk start at the beginning of the archive instead of at the magic.
    PATTERN_MATCH_OFFSET = -MAGIC_OFFSET

    PATTERNS = [
        HexString("75 73 74 61 72 20 20 00"),  # "ustar", two spaces, NUL
        HexString("75 73 74 61 72 00 30 30"),  # "ustar", NUL, "00"
    ]


def _re_frame(regexp: str):
    """Parenthesize ``regexp`` so it survives concatenation with other regexes.

    Appending ``c`` to the bare alternation ``a|b`` gives ``a|bc``, which does
    not match "ac". The framed form ``(a|b)`` followed by ``c`` gives
    ``(a|b)c``, which matches "ac" as intended.
    """
    return "({})".format(regexp)


def _re_alternatives(regexps):
    """Return a single framed regex matching any one of ``regexps``.

    Every alternative is framed on its own before being joined with ``|``,
    and the joined alternation is framed again so the result can be
    concatenated with further regex fragments.
    """
    framed_options = [_re_frame(option) for option in regexps]
    alternation = "|".join(framed_options)
    return _re_frame(alternation)


def _padded_field(
    re_content_char: str,
    size: int,
    leftpad_re: str = " ",
    rightpad_re: str = r"[ \x00]",
) -> str:
    r"""Build a regex matching a fixed-width, optionally padded header field.

    The result is an alternation of every layout whose total width is exactly
    ``size``: some left padding, at least one content character, then some
    right padding. Explicit alternatives are generated instead of ranged
    quantifiers on the padding.

    Args:
        re_content_char: regex matching one content character (e.g. ``[0-7]``).
        size: total width of the field in bytes.
        leftpad_re: regex matching one left-padding character (a space by
            default).
        rightpad_re: regex matching one right-padding character (a space or a
            NUL byte by default).

    Returns:
        The framed alternation, as produced by ``_re_alternatives``.

    Note:
        The default right padding used to be ``[ \0x00]``. Regex engines read
        that as an octal NUL escape followed by the literal characters ``x``
        and ``0``, so ``x`` and ``0`` were wrongly accepted as padding. It is
        now ``[ \x00]``, i.e. only space and NUL.
    """
    field_regexes = []

    # padsize is the total number of padding bytes; the rest is content, so
    # content always keeps at least one character (padsize < size).
    for padsize in range(size):
        content_re = f"{re_content_char}{{{size - padsize}}}"

        # Split the padding between the left and the right side in every
        # possible way.
        for leftpadsize in range(padsize + 1):
            rightpadsize = padsize - leftpadsize

            # A zero-width side is omitted rather than written as "{0}".
            left_re = f"{leftpad_re}{{{leftpadsize}}}" if leftpadsize else ""
            right_re = f"{rightpad_re}{{{rightpadsize}}}" if rightpadsize else ""

            field_regexes.append(f"{left_re}{content_re}{right_re}")

    return _re_alternatives(field_regexes)


class TarUnixHandler(_TarHandler):
    # v7 headers have no 'ustar' magic, so the pattern matches the run of
    # fixed-width octal fields that follows the name instead. The match
    # therefore starts right after name[100].
    PATTERN_MATCH_OFFSET = -100  # go back to beginning of skipped name

    PATTERNS = [
        Regex(
            "".join(
                (
                    # char name[100] is left out (pattern would be too big)
                    _padded_field(r"[0-7]", 8),  # char mode[8]
                    _padded_field(r"[0-7]", 8),  # char uid[8]
                    _padded_field(r"[0-7]", 8),  # char gid[8]
                    _padded_field(r"[0-7]", 12),  # char size[12]
                    _padded_field(r"[0-7]", 12),  # char mtime[12]
                    _padded_field(r"[0-7]", 8),  # char chksum[8]
                    r"[0-7\x00]",  # char typeflag[1] - no extensions
                    # Extending or dropping the typeflag pattern would cover
                    # all tar formats; r"[0-7xgA-Z\x00]" would probably match
                    # all current major implementations.
                    # Info on the values for typeflag:
                    # - https://en.wikipedia.org/wiki/Tar_(computing)
                    # - https://www.gnu.org/software/tar/manual/html_node/Standard.html
                    # - https://github.com/openbsd/src/blob/master/bin/pax/tar.h
                    # - https://codebrowser.dev/glibc/glibc/posix/tar.h.html
                    # - https://www.ibm.com/docs/el/aix/7.2?topic=files-tarh-file
                    # Values 'A'-'Z' are reserved for custom implementations.
                    # All other values are reserved for future POSIX.1
                    # revisions. Several places mention custom extensions and
                    # how they extract them; the IBM link above is quite
                    # explicit. Since the possible values are somewhat vague,
                    # it might be better still to not include this field in
                    # the pattern at all.
                )
            )
        ),
    ]

0 comments on commit 96a4aff

Please sign in to comment.