diff --git a/tests/handlers/archive/test_tar.py b/tests/handlers/archive/test_tar.py index cefa8706f8..9ed985361c 100644 --- a/tests/handlers/archive/test_tar.py +++ b/tests/handlers/archive/test_tar.py @@ -2,7 +2,11 @@ from helpers import unhex from unblob.file_utils import File -from unblob.handlers.archive.tar import TarHandler, _get_tar_end_offset +from unblob.handlers.archive.tar import ( + TarUnixHandler, + TarUstarHandler, + _get_tar_end_offset, +) GNU_TAR_CONTENTS = unhex( """\ @@ -120,6 +124,58 @@ """ ) +UNIX_TAR_CONTENT = unhex( + """\ +00000000 66 72 75 69 74 73 2f 00 00 00 00 00 00 00 00 00 |fruits/.........| +00000010 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................| +* +00000060 00 00 00 00 30 30 30 30 37 37 35 00 30 30 30 31 |....0000775.0001| +00000070 37 35 30 00 30 30 30 31 37 35 30 00 30 30 30 30 |750.0001750.0000| +00000080 30 30 30 30 30 30 30 00 31 34 35 30 34 32 36 32 |0000000.14504262| +00000090 30 37 37 00 30 30 37 34 30 34 00 20 35 00 00 00 |077.007404. 5...| +000000a0 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................| +* +00000140 00 00 00 00 00 00 00 00 00 30 30 30 30 30 30 30 |.........0000000| +00000150 00 30 30 30 30 30 30 30 00 00 00 00 00 00 00 00 |.0000000........| +00000160 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................| +* +00000200 66 72 75 69 74 73 2f 61 70 70 6c 65 2e 74 78 74 |fruits/apple.txt| +00000210 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................| +* +00000260 00 00 00 00 30 30 30 30 36 36 34 00 30 30 30 31 |....0000664.0001| +00000270 37 35 30 00 30 30 30 31 37 35 30 00 30 30 30 30 |750.0001750.0000| +00000280 30 30 30 30 30 30 36 00 31 34 35 30 34 32 36 32 |0000006.14504262| +00000290 30 37 31 00 30 31 31 31 35 34 00 20 00 00 00 00 |071.011154. 
....| +000002a0 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................| +* +00000340 00 00 00 00 00 00 00 00 00 30 30 30 30 30 30 30 |.........0000000| +00000350 00 30 30 30 30 30 30 30 00 00 00 00 00 00 00 00 |.0000000........| +00000360 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................| +* +00000400 61 70 70 6c 65 0a 00 00 00 00 00 00 00 00 00 00 |apple...........| +00000410 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................| +* +00000600 66 72 75 69 74 73 2f 63 68 65 72 72 79 2e 74 78 |fruits/cherry.tx| +00000610 74 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |t...............| +00000620 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................| +* +00000660 00 00 00 00 30 30 30 30 36 36 34 00 30 30 30 31 |....0000664.0001| +00000670 37 35 30 00 30 30 30 31 37 35 30 00 30 30 30 30 |750.0001750.0000| +00000680 30 30 30 30 30 30 37 00 31 34 35 30 34 32 36 32 |0000007.14504262| +00000690 30 37 37 00 30 31 31 33 35 36 00 20 00 00 00 00 |077.011356. 
....| +000006a0 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................| +* +00000740 00 00 00 00 00 00 00 00 00 30 30 30 30 30 30 30 |.........0000000| +00000750 00 30 30 30 30 30 30 30 00 00 00 00 00 00 00 00 |.0000000........| +00000760 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................| +* +00000800 63 68 65 72 72 79 0a 00 00 00 00 00 00 00 00 00 |cherry..........| +00000810 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................| +* +00002800 +""" +) + PADDING_TO_DEFAULT_BLOCKING_FACTOR = unhex( """\ 00000400 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................| @@ -303,12 +359,30 @@ def test_different_blocking_factor(): pytest.param(b"some prefix ", id="nonzero-prefix"), ], ) -def test_calculate_chunk(prefix): +def test_calculate_chunk_ustar(prefix): tar_file = File.from_bytes(prefix + GNU_TAR_CONTENTS) - handler = TarHandler() + handler = TarUstarHandler() chunk = handler.calculate_chunk(tar_file, len(prefix)) assert chunk is not None assert chunk.start_offset == len(prefix) assert chunk.end_offset == len(prefix) + len(GNU_TAR_CONTENTS) + + +@pytest.mark.parametrize( + "prefix", + [ + pytest.param(b"", id="zero-prefix"), + pytest.param(b"some prefix ", id="nonzero-prefix"), + ], +) +def test_calculate_chunk_unix(prefix): + tar_file = File.from_bytes(prefix + UNIX_TAR_CONTENT) + handler = TarUnixHandler() + + chunk = handler.calculate_chunk(tar_file, len(prefix)) + + assert chunk is not None + assert chunk.start_offset == len(prefix) + assert chunk.end_offset == len(prefix) + len(UNIX_TAR_CONTENT) diff --git a/tests/integration/archive/tar/__input__/cherry.v7.tar b/tests/integration/archive/tar/__input__/cherry.v7.tar new file mode 100644 index 0000000000..2252cc0d91 --- /dev/null +++ b/tests/integration/archive/tar/__input__/cherry.v7.tar @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f799945e335bcd22cae7f2a53033781b8a181a266a12bd04d6ab3ed3f5ba1fd2 +size 10240 diff --git 
a/tests/integration/archive/tar/__output__/cherry.v7.tar_extract/fruits/cherry1.txt b/tests/integration/archive/tar/__output__/cherry.v7.tar_extract/fruits/cherry1.txt new file mode 100644 index 0000000000..16555f8308 --- /dev/null +++ b/tests/integration/archive/tar/__output__/cherry.v7.tar_extract/fruits/cherry1.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7592083f2355ad7e207557efabb3594bf62c9e39677298e8265766a37d835c39 +size 8 diff --git a/tests/integration/archive/tar/__output__/cherry.v7.tar_extract/fruits/cherry2.txt b/tests/integration/archive/tar/__output__/cherry.v7.tar_extract/fruits/cherry2.txt new file mode 100644 index 0000000000..19462226ea --- /dev/null +++ b/tests/integration/archive/tar/__output__/cherry.v7.tar_extract/fruits/cherry2.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb6f92894363eceff37d58287d2b8b37bb17a00b320312ed59924a8ec07004a6 +size 8 diff --git a/tests/integration/archive/tar/__output__/cherry.v7.tar_extract/fruits/cherry3.txt b/tests/integration/archive/tar/__output__/cherry.v7.tar_extract/fruits/cherry3.txt new file mode 100644 index 0000000000..5d1ed100f4 --- /dev/null +++ b/tests/integration/archive/tar/__output__/cherry.v7.tar_extract/fruits/cherry3.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4d09282b5ac47cc58aa5dc4fe3e8d6829ac737adfe49ee32efee9cf4bf6cdf3 +size 8 diff --git a/tests/integration/archive/tar/__output__/cherry.v7.tar_extract/fruits/cherry4.txt b/tests/integration/archive/tar/__output__/cherry.v7.tar_extract/fruits/cherry4.txt new file mode 100644 index 0000000000..328350b1a3 --- /dev/null +++ b/tests/integration/archive/tar/__output__/cherry.v7.tar_extract/fruits/cherry4.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7843a44cca6d57497113ed505d027b9f5ffac78f1de7809f81ae9b314d943e79 +size 8 diff --git a/unblob/extractors/command.py b/unblob/extractors/command.py index 21647ded83..77e80029c8 
100644 --- a/unblob/extractors/command.py +++ b/unblob/extractors/command.py @@ -96,7 +96,6 @@ def _make_extract_command(self, inpath: Path, outdir: Path): raise InvalidCommandTemplate("Invalid template placeholder", t) from k except ValueError as v: raise InvalidCommandTemplate("The template is malformed", t) from v - return args def get_dependencies(self) -> List[str]: diff --git a/unblob/handlers/__init__.py b/unblob/handlers/__init__.py index d06fe54061..6b60d73293 100644 --- a/unblob/handlers/__init__.py +++ b/unblob/handlers/__init__.py @@ -70,7 +70,8 @@ arc.ARCHandler, arj.ARJHandler, cab.CABHandler, - tar.TarHandler, + tar.TarUstarHandler, + tar.TarUnixHandler, cpio.PortableASCIIHandler, cpio.PortableASCIIWithCRCHandler, cpio.PortableOldASCIIHandler, diff --git a/unblob/handlers/archive/tar.py b/unblob/handlers/archive/tar.py index 92d3a76309..876b5c1a6d 100644 --- a/unblob/handlers/archive/tar.py +++ b/unblob/handlers/archive/tar.py @@ -12,6 +12,7 @@ ExtractResult, File, HexString, + Regex, StructHandler, ValidChunk, ) @@ -98,17 +99,10 @@ def extract(self, inpath: Path, outdir: Path): return ExtractResult(reports=tarfile.reports) -class TarHandler(StructHandler): +class _TarHandler(StructHandler): NAME = "tar" - PATTERNS = [ - HexString("75 73 74 61 72 20 20 00"), - HexString("75 73 74 61 72 00 30 30"), - ] - - # Since the magic is at 257, we have to subtract that from the match offset - # to get to the start of the file. 
- PATTERN_MATCH_OFFSET = -MAGIC_OFFSET + PATTERNS = [] C_DEFINITIONS = r""" typedef struct posix_header @@ -142,7 +136,116 @@ def calculate_chunk(self, file: File, start_offset: int) -> Optional[ValidChunk] header_size = snull(header.size) decode_int(header_size, 8) + def signed_sum(octets) -> int: + # Historic tar implementations summed header bytes as signed chars, so bytes >= 0x80 count as b - 256. + return sum(b if b < 128 else b - 256 for b in octets) + + if header.chksum[6:8] not in (b"\x00 ", b" \x00"): + logger.error( + "Invalid checksum format", + actual_last_2_bytes=header.chksum[6:8], + handler=self.NAME, + ) + return None + checksum = decode_int(header.chksum[:6], 8) + header_bytes_for_checksum = ( + file[start_offset : start_offset + 148] + + b" " * 8 # chksum field is replaced with "blanks" + + file[start_offset + 156 : start_offset + 257] + ) + extended_header_bytes = file[start_offset + 257 : start_offset + 500] + calculated_checksum_unsigned = sum(header_bytes_for_checksum) + calculated_checksum_signed = signed_sum(header_bytes_for_checksum) + checksums = ( + calculated_checksum_unsigned, + calculated_checksum_unsigned + sum(extended_header_bytes), + # signed is of historical interest, calculating for the extended header is not needed + calculated_checksum_signed, + ) + if checksum not in checksums: + logger.error( + "Tar header checksum mismatch", expected=str(checksum), actual=checksums + ) + return None + end_offset = _get_tar_end_offset(file, start_offset) if end_offset == -1: return None return ValidChunk(start_offset=start_offset, end_offset=end_offset) + + +class TarUstarHandler(_TarHandler): + PATTERNS = [ + HexString("75 73 74 61 72 20 20 00"), + HexString("75 73 74 61 72 00 30 30"), + ] + + # Since the magic is at 257, we have to subtract that from the match offset + # to get to the start of the file. + PATTERN_MATCH_OFFSET = -MAGIC_OFFSET + + +def _re_frame(regexp: str): + """Wrap regexp to ensure its integrity from concatenation.
+ + E.g.: when the regex + a|b + is naively appended by regex c, the result + a|bc + will not match "ac", while + (a|b)c + will match "ac" as intended. + """ + return f"({regexp})" + + +def _re_alternatives(regexps): + return _re_frame("|".join(_re_frame(regexp) for regexp in regexps)) + + +def _padded_field(re_content_char, size, leftpad_re=" ", rightpad_re=r"[ \x00]"): + field_regexes = [] + + for padsize in range(size): + content_re = f"{re_content_char}{{{size-padsize}}}" + + for leftpadsize in range(padsize + 1): + rightpadsize = padsize - leftpadsize + + left_re = f"{leftpad_re}{{{leftpadsize}}}" if leftpadsize else "" + right_re = f"{rightpad_re}{{{rightpadsize}}}" if rightpadsize else "" + + field_regexes.append(f"{left_re}{content_re}{right_re}") + + return _re_alternatives(field_regexes) + + +class TarUnixHandler(_TarHandler): + PATTERNS = [ + Regex( + r"" + # (pattern would be too big) char name[100] + + _padded_field(r"[0-7]", 8) # char mode[8] + + _padded_field(r"[0-7]", 8) # char uid[8] + + _padded_field(r"[0-7]", 8) # char gid[8] + + _padded_field(r"[0-7]", 12) # char size[12] + + _padded_field(r"[0-7]", 12) # char mtime[12] + + _padded_field(r"[0-7]", 8) # char chksum[8] + + r"[0-7\x00]" # char typeflag[1] - no extensions + # Extending/dropping typeflag pattern would cover all tar formats, + # r"[0-7xgA-Z\x00]" would probably match all current major implementations. + # Info on the values for typeflag: + # - https://en.wikipedia.org/wiki/Tar_(computing) + # - https://www.gnu.org/software/tar/manual/html_node/Standard.html + # - https://github.com/openbsd/src/blob/master/bin/pax/tar.h + # - https://codebrowser.dev/glibc/glibc/posix/tar.h.html + # - https://www.ibm.com/docs/el/aix/7.2?topic=files-tarh-file + # Values 'A'-'Z' are reserved for custom implementations. + # All other values are reserved for future POSIX.1 revisions. + # Several places mention custom extensions and how they extract it, + # e.g. the IBM link above is quite explicit.
+ # Since its possible values are somewhat vague, + # it might be better still to not include this field in the pattern at all. + ), + ] + PATTERN_MATCH_OFFSET = -100 # go back to beginning of skipped name