Skip to content

Commit

Permalink
Merge pull request #655 from onekey-sec/654-tar-handler
Browse files Browse the repository at this point in the history
fix(handlers): add support for unix-compatible (aka v7) tar files.
  • Loading branch information
qkaiser authored Oct 23, 2023
2 parents aee811b + 0166d74 commit ed3e303
Show file tree
Hide file tree
Showing 9 changed files with 205 additions and 14 deletions.
80 changes: 77 additions & 3 deletions tests/handlers/archive/test_tar.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,11 @@
from helpers import unhex

from unblob.file_utils import File
from unblob.handlers.archive.tar import TarHandler, _get_tar_end_offset
from unblob.handlers.archive.tar import (
TarUnixHandler,
TarUstarHandler,
_get_tar_end_offset,
)

GNU_TAR_CONTENTS = unhex(
"""\
Expand Down Expand Up @@ -120,6 +124,58 @@
"""
)

# Hexdump fixture of a tar archive whose headers carry no "ustar" text at
# offset 257 (0x101); that region is all zero bytes in every header below.
# It holds three entries, visible in the ASCII column: "fruits/",
# "fruits/apple.txt" (data "apple\n") and "fruits/cherry.txt" (data "cherry\n").
# A "*" line follows the `hexdump -C` convention of eliding repeated rows, and
# the final offset 0x2800 gives a total length of 10240 bytes.
UNIX_TAR_CONTENT = unhex(
"""\
00000000 66 72 75 69 74 73 2f 00 00 00 00 00 00 00 00 00 |fruits/.........|
00000010 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
*
00000060 00 00 00 00 30 30 30 30 37 37 35 00 30 30 30 31 |....0000775.0001|
00000070 37 35 30 00 30 30 30 31 37 35 30 00 30 30 30 30 |750.0001750.0000|
00000080 30 30 30 30 30 30 30 00 31 34 35 30 34 32 36 32 |0000000.14504262|
00000090 30 37 37 00 30 30 37 34 30 34 00 20 35 00 00 00 |077.007404. 5...|
000000a0 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
*
00000140 00 00 00 00 00 00 00 00 00 30 30 30 30 30 30 30 |.........0000000|
00000150 00 30 30 30 30 30 30 30 00 00 00 00 00 00 00 00 |.0000000........|
00000160 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
*
00000200 66 72 75 69 74 73 2f 61 70 70 6c 65 2e 74 78 74 |fruits/apple.txt|
00000210 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
*
00000260 00 00 00 00 30 30 30 30 36 36 34 00 30 30 30 31 |....0000664.0001|
00000270 37 35 30 00 30 30 30 31 37 35 30 00 30 30 30 30 |750.0001750.0000|
00000280 30 30 30 30 30 30 36 00 31 34 35 30 34 32 36 32 |0000006.14504262|
00000290 30 37 31 00 30 31 31 31 35 34 00 20 00 00 00 00 |071.011154. ....|
000002a0 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
*
00000340 00 00 00 00 00 00 00 00 00 30 30 30 30 30 30 30 |.........0000000|
00000350 00 30 30 30 30 30 30 30 00 00 00 00 00 00 00 00 |.0000000........|
00000360 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
*
00000400 61 70 70 6c 65 0a 00 00 00 00 00 00 00 00 00 00 |apple...........|
00000410 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
*
00000600 66 72 75 69 74 73 2f 63 68 65 72 72 79 2e 74 78 |fruits/cherry.tx|
00000610 74 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |t...............|
00000620 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
*
00000660 00 00 00 00 30 30 30 30 36 36 34 00 30 30 30 31 |....0000664.0001|
00000670 37 35 30 00 30 30 30 31 37 35 30 00 30 30 30 30 |750.0001750.0000|
00000680 30 30 30 30 30 30 37 00 31 34 35 30 34 32 36 32 |0000007.14504262|
00000690 30 37 37 00 30 31 31 33 35 36 00 20 00 00 00 00 |077.011356. ....|
000006a0 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
*
00000740 00 00 00 00 00 00 00 00 00 30 30 30 30 30 30 30 |.........0000000|
00000750 00 30 30 30 30 30 30 30 00 00 00 00 00 00 00 00 |.0000000........|
00000760 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
*
00000800 63 68 65 72 72 79 0a 00 00 00 00 00 00 00 00 00 |cherry..........|
00000810 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
*
00002800
"""
)

PADDING_TO_DEFAULT_BLOCKING_FACTOR = unhex(
"""\
00000400 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
Expand Down Expand Up @@ -303,12 +359,30 @@ def test_different_blocking_factor():
pytest.param(b"some prefix ", id="nonzero-prefix"),
],
)
def test_calculate_chunk(prefix):
def test_calculate_chunk_ustar(prefix):
tar_file = File.from_bytes(prefix + GNU_TAR_CONTENTS)
handler = TarHandler()
handler = TarUstarHandler()

chunk = handler.calculate_chunk(tar_file, len(prefix))

assert chunk is not None
assert chunk.start_offset == len(prefix)
assert chunk.end_offset == len(prefix) + len(GNU_TAR_CONTENTS)


@pytest.mark.parametrize(
    "prefix",
    [
        pytest.param(b"", id="zero-prefix"),
        pytest.param(b"some prefix ", id="nonzero-prefix"),
    ],
)
def test_calculate_chunk_unix(prefix):
    """The unix tar handler must report a chunk spanning exactly the archive.

    The archive is placed after ``prefix`` and the handler is asked to
    calculate the chunk starting right behind that prefix.
    """
    expected_start = len(prefix)
    expected_end = expected_start + len(UNIX_TAR_CONTENT)
    blob = File.from_bytes(prefix + UNIX_TAR_CONTENT)

    found = TarUnixHandler().calculate_chunk(blob, expected_start)

    assert found is not None
    assert found.start_offset == expected_start
    assert found.end_offset == expected_end
3 changes: 3 additions & 0 deletions tests/integration/archive/tar/__input__/cherry.v7.tar
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
1 change: 0 additions & 1 deletion unblob/extractors/command.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,6 @@ def _make_extract_command(self, inpath: Path, outdir: Path):
raise InvalidCommandTemplate("Invalid template placeholder", t) from k
except ValueError as v:
raise InvalidCommandTemplate("The template is malformed", t) from v

return args

def get_dependencies(self) -> List[str]:
Expand Down
3 changes: 2 additions & 1 deletion unblob/handlers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,8 @@
arc.ARCHandler,
arj.ARJHandler,
cab.CABHandler,
tar.TarHandler,
tar.TarUstarHandler,
tar.TarUnixHandler,
cpio.PortableASCIIHandler,
cpio.PortableASCIIWithCRCHandler,
cpio.PortableOldASCIIHandler,
Expand Down
120 changes: 111 additions & 9 deletions unblob/handlers/archive/tar.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
ExtractResult,
File,
HexString,
Regex,
StructHandler,
ValidChunk,
)
Expand Down Expand Up @@ -98,17 +99,10 @@ def extract(self, inpath: Path, outdir: Path):
return ExtractResult(reports=tarfile.reports)


class TarHandler(StructHandler):
class _TarHandler(StructHandler):
NAME = "tar"

PATTERNS = [
HexString("75 73 74 61 72 20 20 00"),
HexString("75 73 74 61 72 00 30 30"),
]

# Since the magic is at 257, we have to subtract that from the match offset
# to get to the start of the file.
PATTERN_MATCH_OFFSET = -MAGIC_OFFSET
PATTERNS = []

C_DEFINITIONS = r"""
typedef struct posix_header
Expand Down Expand Up @@ -142,7 +136,115 @@ def calculate_chunk(self, file: File, start_offset: int) -> Optional[ValidChunk]
header_size = snull(header.size)
decode_int(header_size, 8)

def signed_sum(octets) -> int:
return sum(b if b < 128 else 256 - b for b in octets)

if header.chksum[6:8] not in (b"\x00 ", b" \x00"):
logger.error(
"Invalid checksum format",
actual_last_2_bytes=header.chksum[6:8],
handler=self.NAME,
)
return None
checksum = decode_int(header.chksum[:6], 8)
header_bytes_for_checksum = (
file[start_offset : start_offset + 148]
+ b" " * 8 # chksum field is replaced with "blanks"
+ file[start_offset + 156 : start_offset + 257]
)
extended_header_bytes = file[start_offset + 257 : start_offset + 500]
calculated_checksum_unsigned = sum(header_bytes_for_checksum)
calculated_checksum_signed = signed_sum(header_bytes_for_checksum)
checksums = (
calculated_checksum_unsigned,
calculated_checksum_unsigned + sum(extended_header_bytes),
# signed is of historical interest, calculating for the extended header is not needed
calculated_checksum_signed,
)
if checksum not in checksums:
logger.error(
"Tar header checksum mismatch", expected=str(checksum), actual=checksums
)
return None

end_offset = _get_tar_end_offset(file, start_offset)
if end_offset == -1:
return None
return ValidChunk(start_offset=start_offset, end_offset=end_offset)


class TarUstarHandler(_TarHandler):
    """Tar handler that is triggered by the "ustar" magic inside the header.

    Everything else is inherited from ``_TarHandler``; this subclass only
    supplies the search patterns and how to get from a pattern match back to
    the start of the header.
    """

    PATTERNS = [
        # b"ustar" followed by two spaces and a NUL byte
        HexString("75 73 74 61 72 20 20 00"),
        # b"ustar" followed by a NUL byte and the characters "00"
        HexString("75 73 74 61 72 00 30 30"),
    ]

    # Since the magic is at 257, we have to subtract that from the match offset
    # to get to the start of the file.
    PATTERN_MATCH_OFFSET = -MAGIC_OFFSET


def _re_frame(regexp: str) -> str:
    """Wrap regexp in a group to ensure its integrity under concatenation.

    E.g.: when the regex
        a|b
    is naively appended by regex c, the result
        a|bc
    will not match "ac", while
        (a|b)c
    will match "ac" as intended.
    """
    return f"({regexp})"


def _re_alternatives(regexps) -> str:
    """Return one framed regex matching any of the given regexes.

    Each alternative is framed on its own, so alternatives that themselves
    contain ``|`` keep their meaning, and the whole alternation is framed
    again so it can be safely concatenated with other patterns.
    """
    return _re_frame("|".join(_re_frame(regexp) for regexp in regexps))


def _padded_field(
    re_content_char: str,
    size: int,
    leftpad_re: str = " ",
    rightpad_re: str = r"[ \x00]",
) -> str:
    r"""Build a regex for a fixed-width field with optional padding.

    The field is exactly ``size`` characters wide and consists of, in order:
    zero or more ``leftpad_re`` characters, at least one ``re_content_char``
    character and zero or more ``rightpad_re`` characters. Every possible
    split of the width is emitted as an explicit alternative.

    :param re_content_char: regex matching a single content character
    :param size: total width of the field in characters
    :param leftpad_re: regex matching a single left padding character
    :param rightpad_re: regex matching a single right padding character;
        defaults to space or NUL. (The previous default ``[ \0x00]`` was
        parsed as octal ``\0`` followed by the literals ``x``, ``0``, ``0``
        and therefore wrongly accepted the letter "x" as padding.)
    :return: a framed regex that is safe to concatenate with other patterns
    """
    field_regexes = []

    # padsize is the total number of padding characters; the content always
    # keeps at least one character because padsize never reaches size.
    for padsize in range(size):
        content_re = f"{re_content_char}{{{size - padsize}}}"

        # Distribute the padding between the left and the right side in
        # every possible way.
        for leftpadsize in range(padsize + 1):
            rightpadsize = padsize - leftpadsize

            left_re = f"{leftpad_re}{{{leftpadsize}}}" if leftpadsize else ""
            right_re = f"{rightpad_re}{{{rightpadsize}}}" if rightpadsize else ""

            field_regexes.append(f"{left_re}{content_re}{right_re}")

    return _re_alternatives(field_regexes)


class TarUnixHandler(_TarHandler):
    """Tar handler for headers that do not carry the "ustar" magic.

    Without a magic to search for, the pattern describes the run of numeric
    header fields that follows the name: padded octal fields followed by the
    one-character typeflag.
    """

    PATTERNS = [
        Regex(
            # char name[100] is left out, its pattern would be too big.
            "".join(
                _padded_field(r"[0-7]", width)
                for width in (
                    8,  # char mode[8]
                    8,  # char uid[8]
                    8,  # char gid[8]
                    12,  # char size[12]
                    12,  # char mtime[12]
                    8,  # char chksum[8]
                )
            )
            + r"[0-7\x00]"  # char typeflag[1] - no extensions
            # Extending or dropping the typeflag pattern would cover all tar
            # formats; r"[0-7xgA-Z\x00]" would probably match all current
            # major implementations.
            # Information on the values of typeflag:
            # - https://en.wikipedia.org/wiki/Tar_(computing)
            # - https://www.gnu.org/software/tar/manual/html_node/Standard.html
            # - https://github.com/openbsd/src/blob/master/bin/pax/tar.h
            # - https://codebrowser.dev/glibc/glibc/posix/tar.h.html
            # - https://www.ibm.com/docs/el/aix/7.2?topic=files-tarh-file
            # Values 'A'-'Z' are reserved for custom implementations.
            # All other values are reserved for future POSIX.1 revisions.
            # Several places mention custom extensions and how they extract
            # them, e.g. the IBM link above is quite explicit.
            # Since the possible values are somewhat vague, it might be better
            # still to not include this field in the pattern at all.
        ),
    ]
    # The pattern starts right after the skipped 100 byte name field, so go
    # back to the beginning of the header.
    PATTERN_MATCH_OFFSET = -100

0 comments on commit ed3e303

Please sign in to comment.