From 96a4aff117d5035a6519f267de01ad0d4783d44b Mon Sep 17 00:00:00 2001 From: Quentin Kaiser Date: Mon, 25 Sep 2023 13:10:50 +0200 Subject: [PATCH 1/2] fix(handlers): add support for unix-compatible (aka v7) tar files. v7 tar headers do not have the 'ustar' magic that we match on for modern tar files. In order to match on those v7 archives, we build a regular expression that matches on the mode, uid, gid, mtime and size fields given their properties: - fixed size (e.g. 8 for mode) - optionally preceded by whitespace - suffixed by null bytes (null terminated) - ASCII-encoded octal digits (0x30 to 0x37) In order to build a pattern that can be handled by hyperscan without using a notation such as '[\w]{1,99}' for the path name (see below for a detailed explanation), we rely on utility functions to build a regular expression that matches all possible combinations using the OR (|) operator. Note: hyperscan will yield a "Pattern is too large" exception when trying to use the '{1,99}' notation. Even though we found out that using '.*' works, it would have a significant performance impact on pattern matching. That's why we decided to go with the OR operator approach with combinations. See https://intel.github.io/hyperscan/dev-reference/compilation.html for more information about this. 
--- tests/handlers/archive/test_tar.py | 80 ++++++++++++++++- .../archive/tar/__input__/cherry.v7.tar | 3 + .../cherry.v7.tar_extract/fruits/cherry1.txt | 3 + .../cherry.v7.tar_extract/fruits/cherry2.txt | 3 + .../cherry.v7.tar_extract/fruits/cherry3.txt | 3 + .../cherry.v7.tar_extract/fruits/cherry4.txt | 3 + unblob/extractors/command.py | 1 - unblob/handlers/__init__.py | 3 +- unblob/handlers/archive/tar.py | 89 +++++++++++++++++-- 9 files changed, 174 insertions(+), 14 deletions(-) create mode 100644 tests/integration/archive/tar/__input__/cherry.v7.tar create mode 100644 tests/integration/archive/tar/__output__/cherry.v7.tar_extract/fruits/cherry1.txt create mode 100644 tests/integration/archive/tar/__output__/cherry.v7.tar_extract/fruits/cherry2.txt create mode 100644 tests/integration/archive/tar/__output__/cherry.v7.tar_extract/fruits/cherry3.txt create mode 100644 tests/integration/archive/tar/__output__/cherry.v7.tar_extract/fruits/cherry4.txt diff --git a/tests/handlers/archive/test_tar.py b/tests/handlers/archive/test_tar.py index cefa8706f8..9ed985361c 100644 --- a/tests/handlers/archive/test_tar.py +++ b/tests/handlers/archive/test_tar.py @@ -2,7 +2,11 @@ from helpers import unhex from unblob.file_utils import File -from unblob.handlers.archive.tar import TarHandler, _get_tar_end_offset +from unblob.handlers.archive.tar import ( + TarUnixHandler, + TarUstarHandler, + _get_tar_end_offset, +) GNU_TAR_CONTENTS = unhex( """\ @@ -120,6 +124,58 @@ """ ) +UNIX_TAR_CONTENT = unhex( + """\ +00000000 66 72 75 69 74 73 2f 00 00 00 00 00 00 00 00 00 |fruits/.........| +00000010 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................| +* +00000060 00 00 00 00 30 30 30 30 37 37 35 00 30 30 30 31 |....0000775.0001| +00000070 37 35 30 00 30 30 30 31 37 35 30 00 30 30 30 30 |750.0001750.0000| +00000080 30 30 30 30 30 30 30 00 31 34 35 30 34 32 36 32 |0000000.14504262| +00000090 30 37 37 00 30 30 37 34 30 34 00 20 35 00 00 00 |077.007404. 
5...| +000000a0 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................| +* +00000140 00 00 00 00 00 00 00 00 00 30 30 30 30 30 30 30 |.........0000000| +00000150 00 30 30 30 30 30 30 30 00 00 00 00 00 00 00 00 |.0000000........| +00000160 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................| +* +00000200 66 72 75 69 74 73 2f 61 70 70 6c 65 2e 74 78 74 |fruits/apple.txt| +00000210 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................| +* +00000260 00 00 00 00 30 30 30 30 36 36 34 00 30 30 30 31 |....0000664.0001| +00000270 37 35 30 00 30 30 30 31 37 35 30 00 30 30 30 30 |750.0001750.0000| +00000280 30 30 30 30 30 30 36 00 31 34 35 30 34 32 36 32 |0000006.14504262| +00000290 30 37 31 00 30 31 31 31 35 34 00 20 00 00 00 00 |071.011154. ....| +000002a0 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................| +* +00000340 00 00 00 00 00 00 00 00 00 30 30 30 30 30 30 30 |.........0000000| +00000350 00 30 30 30 30 30 30 30 00 00 00 00 00 00 00 00 |.0000000........| +00000360 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................| +* +00000400 61 70 70 6c 65 0a 00 00 00 00 00 00 00 00 00 00 |apple...........| +00000410 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................| +* +00000600 66 72 75 69 74 73 2f 63 68 65 72 72 79 2e 74 78 |fruits/cherry.tx| +00000610 74 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |t...............| +00000620 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................| +* +00000660 00 00 00 00 30 30 30 30 36 36 34 00 30 30 30 31 |....0000664.0001| +00000670 37 35 30 00 30 30 30 31 37 35 30 00 30 30 30 30 |750.0001750.0000| +00000680 30 30 30 30 30 30 37 00 31 34 35 30 34 32 36 32 |0000007.14504262| +00000690 30 37 37 00 30 31 31 33 35 36 00 20 00 00 00 00 |077.011356. 
....| +000006a0 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................| +* +00000740 00 00 00 00 00 00 00 00 00 30 30 30 30 30 30 30 |.........0000000| +00000750 00 30 30 30 30 30 30 30 00 00 00 00 00 00 00 00 |.0000000........| +00000760 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................| +* +00000800 63 68 65 72 72 79 0a 00 00 00 00 00 00 00 00 00 |cherry..........| +00000810 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................| +* +00002800 +""" +) + PADDING_TO_DEFAULT_BLOCKING_FACTOR = unhex( """\ 00000400 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................| @@ -303,12 +359,30 @@ def test_different_blocking_factor(): pytest.param(b"some prefix ", id="nonzero-prefix"), ], ) -def test_calculate_chunk(prefix): +def test_calculate_chunk_ustar(prefix): tar_file = File.from_bytes(prefix + GNU_TAR_CONTENTS) - handler = TarHandler() + handler = TarUstarHandler() chunk = handler.calculate_chunk(tar_file, len(prefix)) assert chunk is not None assert chunk.start_offset == len(prefix) assert chunk.end_offset == len(prefix) + len(GNU_TAR_CONTENTS) + + +@pytest.mark.parametrize( + "prefix", + [ + pytest.param(b"", id="zero-prefix"), + pytest.param(b"some prefix ", id="nonzero-prefix"), + ], +) +def test_calculate_chunk_unix(prefix): + tar_file = File.from_bytes(prefix + UNIX_TAR_CONTENT) + handler = TarUnixHandler() + + chunk = handler.calculate_chunk(tar_file, len(prefix)) + + assert chunk is not None + assert chunk.start_offset == len(prefix) + assert chunk.end_offset == len(prefix) + len(UNIX_TAR_CONTENT) diff --git a/tests/integration/archive/tar/__input__/cherry.v7.tar b/tests/integration/archive/tar/__input__/cherry.v7.tar new file mode 100644 index 0000000000..2252cc0d91 --- /dev/null +++ b/tests/integration/archive/tar/__input__/cherry.v7.tar @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f799945e335bcd22cae7f2a53033781b8a181a266a12bd04d6ab3ed3f5ba1fd2 +size 10240 diff --git 
a/tests/integration/archive/tar/__output__/cherry.v7.tar_extract/fruits/cherry1.txt b/tests/integration/archive/tar/__output__/cherry.v7.tar_extract/fruits/cherry1.txt new file mode 100644 index 0000000000..16555f8308 --- /dev/null +++ b/tests/integration/archive/tar/__output__/cherry.v7.tar_extract/fruits/cherry1.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7592083f2355ad7e207557efabb3594bf62c9e39677298e8265766a37d835c39 +size 8 diff --git a/tests/integration/archive/tar/__output__/cherry.v7.tar_extract/fruits/cherry2.txt b/tests/integration/archive/tar/__output__/cherry.v7.tar_extract/fruits/cherry2.txt new file mode 100644 index 0000000000..19462226ea --- /dev/null +++ b/tests/integration/archive/tar/__output__/cherry.v7.tar_extract/fruits/cherry2.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb6f92894363eceff37d58287d2b8b37bb17a00b320312ed59924a8ec07004a6 +size 8 diff --git a/tests/integration/archive/tar/__output__/cherry.v7.tar_extract/fruits/cherry3.txt b/tests/integration/archive/tar/__output__/cherry.v7.tar_extract/fruits/cherry3.txt new file mode 100644 index 0000000000..5d1ed100f4 --- /dev/null +++ b/tests/integration/archive/tar/__output__/cherry.v7.tar_extract/fruits/cherry3.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4d09282b5ac47cc58aa5dc4fe3e8d6829ac737adfe49ee32efee9cf4bf6cdf3 +size 8 diff --git a/tests/integration/archive/tar/__output__/cherry.v7.tar_extract/fruits/cherry4.txt b/tests/integration/archive/tar/__output__/cherry.v7.tar_extract/fruits/cherry4.txt new file mode 100644 index 0000000000..328350b1a3 --- /dev/null +++ b/tests/integration/archive/tar/__output__/cherry.v7.tar_extract/fruits/cherry4.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7843a44cca6d57497113ed505d027b9f5ffac78f1de7809f81ae9b314d943e79 +size 8 diff --git a/unblob/extractors/command.py b/unblob/extractors/command.py index 21647ded83..77e80029c8 
100644 --- a/unblob/extractors/command.py +++ b/unblob/extractors/command.py @@ -96,7 +96,6 @@ def _make_extract_command(self, inpath: Path, outdir: Path): raise InvalidCommandTemplate("Invalid template placeholder", t) from k except ValueError as v: raise InvalidCommandTemplate("The template is malformed", t) from v - return args def get_dependencies(self) -> List[str]: diff --git a/unblob/handlers/__init__.py b/unblob/handlers/__init__.py index d06fe54061..6b60d73293 100644 --- a/unblob/handlers/__init__.py +++ b/unblob/handlers/__init__.py @@ -70,7 +70,8 @@ arc.ARCHandler, arj.ARJHandler, cab.CABHandler, - tar.TarHandler, + tar.TarUstarHandler, + tar.TarUnixHandler, cpio.PortableASCIIHandler, cpio.PortableASCIIWithCRCHandler, cpio.PortableOldASCIIHandler, diff --git a/unblob/handlers/archive/tar.py b/unblob/handlers/archive/tar.py index 92d3a76309..1cdb2920ff 100644 --- a/unblob/handlers/archive/tar.py +++ b/unblob/handlers/archive/tar.py @@ -12,6 +12,7 @@ ExtractResult, File, HexString, + Regex, StructHandler, ValidChunk, ) @@ -98,17 +99,10 @@ def extract(self, inpath: Path, outdir: Path): return ExtractResult(reports=tarfile.reports) -class TarHandler(StructHandler): +class _TarHandler(StructHandler): NAME = "tar" - PATTERNS = [ - HexString("75 73 74 61 72 20 20 00"), - HexString("75 73 74 61 72 00 30 30"), - ] - - # Since the magic is at 257, we have to subtract that from the match offset - # to get to the start of the file. 
- PATTERN_MATCH_OFFSET = -MAGIC_OFFSET + PATTERNS = [] C_DEFINITIONS = r""" typedef struct posix_header @@ -146,3 +140,80 @@ def calculate_chunk(self, file: File, start_offset: int) -> Optional[ValidChunk] if end_offset == -1: return None return ValidChunk(start_offset=start_offset, end_offset=end_offset) + + +class TarUstarHandler(_TarHandler): + PATTERNS = [ + HexString("75 73 74 61 72 20 20 00"), + HexString("75 73 74 61 72 00 30 30"), + ] + + # Since the magic is at 257, we have to subtract that from the match offset + # to get to the start of the file. + PATTERN_MATCH_OFFSET = -MAGIC_OFFSET + + +def _re_frame(regexp: str): + """Wrap regexp to ensure its integrity from concatenation. + + E.g.: when the regex + a|b + is naively appended by regex c, the result + a|bc + will not match "ac", while + (a|b)c + will match "ac" as intended. + """ + return f"({regexp})" + + +def _re_alternatives(regexps): + return _re_frame("|".join(_re_frame(regexp) for regexp in regexps)) + + +def _padded_field(re_content_char, size, leftpad_re=" ", rightpad_re=r"[ \0x00]"): + field_regexes = [] + + for padsize in range(size): + content_re = f"{re_content_char}{{{size-padsize}}}" + + for leftpadsize in range(padsize + 1): + rightpadsize = padsize - leftpadsize + + left_re = f"{leftpad_re}{{{leftpadsize}}}" if leftpadsize else "" + right_re = f"{rightpad_re}{{{rightpadsize}}}" if rightpadsize else "" + + field_regexes.append(f"{left_re}{content_re}{right_re}") + + return _re_alternatives(field_regexes) + + +class TarUnixHandler(_TarHandler): + PATTERNS = [ + Regex( + r"" + # (pattern would be too big) char name[100] + + _padded_field(r"[0-7]", 8) # char mode[8] + + _padded_field(r"[0-7]", 8) # char uid[8] + + _padded_field(r"[0-7]", 8) # char gid[8] + + _padded_field(r"[0-7]", 12) # char size[12] + + _padded_field(r"[0-7]", 12) # char mtime[12] + + _padded_field(r"[0-7]", 8) # char chksum[8] + + r"[0-7\x00]" # char typeflag[1] - no extensions + # Extending/dropping typeflag pattern 
would cover all tar formats, + # r"[0-7xgA-Z\x00]" would probably match all current major implementations. + # Info on the values for typeflag: + # - https://en.wikipedia.org/wiki/Tar_(computing) + # - https://www.gnu.org/software/tar/manual/html_node/Standard.html + # - https://github.com/openbsd/src/blob/master/bin/pax/tar.h + # - https://codebrowser.dev/glibc/glibc/posix/tar.h.html + # - https://www.ibm.com/docs/el/aix/7.2?topic=files-tarh-file + # Values 'A'-'Z' are reserved for custom implementations. + # All other values are reserved for future POSIX.1 revisions. + # Several places mention custom extensions and how they extract it, + # e.g. the IBM link above is quite explicit. + # Since its possible values are somewhat vague, + # it might be better still to not include this field in the pattern at all. + ), + ] + PATTERN_MATCH_OFFSET = -100 # go back to beginning of skipped name From 0166d74309d3cf173d8432bbf7dbee06dad25704 Mon Sep 17 00:00:00 2001 From: Krisztian Fekete <1246751+e3krisztian@users.noreply.github.com> Date: Thu, 28 Sep 2023 22:37:30 +0200 Subject: [PATCH 2/2] feat(handlers): check header checksum in tar handler The unix v7 old-style tar handler's pattern is not strict enough to prevent false positives, so checking the checksum might prevent these false matches. The header chksum is an octal representation of the sum of header bytes as (unsigned) integers (while summing, the chksum field itself is counted as 8 spaces), followed by a null and a space (there are tar files with these bytes reversed). Multiple header checksums are calculated, as the old header is much shorter than the newer headers. Wikipedia also mentions some historic implementations using signed sums. The potential match is discarded if the header checksum is not one of the calculated checksums. 
--- unblob/handlers/archive/tar.py | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/unblob/handlers/archive/tar.py b/unblob/handlers/archive/tar.py index 1cdb2920ff..876b5c1a6d 100644 --- a/unblob/handlers/archive/tar.py +++ b/unblob/handlers/archive/tar.py @@ -136,6 +136,37 @@ def calculate_chunk(self, file: File, start_offset: int) -> Optional[ValidChunk] header_size = snull(header.size) decode_int(header_size, 8) + def signed_sum(octets) -> int: + return sum(b if b < 128 else 256 - b for b in octets) + + if header.chksum[6:8] not in (b"\x00 ", b" \x00"): + logger.error( + "Invalid checksum format", + actual_last_2_bytes=header.chksum[6:8], + handler=self.NAME, + ) + return None + checksum = decode_int(header.chksum[:6], 8) + header_bytes_for_checksum = ( + file[start_offset : start_offset + 148] + + b" " * 8 # chksum field is replaced with "blanks" + + file[start_offset + 156 : start_offset + 257] + ) + extended_header_bytes = file[start_offset + 257 : start_offset + 500] + calculated_checksum_unsigned = sum(header_bytes_for_checksum) + calculated_checksum_signed = signed_sum(header_bytes_for_checksum) + checksums = ( + calculated_checksum_unsigned, + calculated_checksum_unsigned + sum(extended_header_bytes), + # signed is of historical interest, calculating for the extended header is not needed + calculated_checksum_signed, + ) + if checksum not in checksums: + logger.error( + "Tar header checksum mismatch", expected=str(checksum), actual=checksums + ) + return None + end_offset = _get_tar_end_offset(file, start_offset) if end_offset == -1: return None