From 0166d74309d3cf173d8432bbf7dbee06dad25704 Mon Sep 17 00:00:00 2001 From: Krisztian Fekete <1246751+e3krisztian@users.noreply.github.com> Date: Thu, 28 Sep 2023 22:37:30 +0200 Subject: [PATCH] feat(handlers): check header checksum in tar handler The unix v7 old-style tar handler's pattern is not strict enough to prevent false positives, so checking the checksum might prevent these false matches. The header chksum is an octal representation of the sum of header bytes as (unsigned) integers (the chksum field itself is counted as 8 spaces), followed by a null and a space (there are tar files with these bytes reversed). Multiple header checksums are calculated, as the old header is much shorter than the newer headers. Wikipedia also mentions some historic implementations using signed sums. The potential match is discarded if the header checksum is not one of the calculated checksums. --- unblob/handlers/archive/tar.py | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/unblob/handlers/archive/tar.py b/unblob/handlers/archive/tar.py index 1cdb2920ff..876b5c1a6d 100644 --- a/unblob/handlers/archive/tar.py +++ b/unblob/handlers/archive/tar.py @@ -136,6 +136,37 @@ def calculate_chunk(self, file: File, start_offset: int) -> Optional[ValidChunk] header_size = snull(header.size) decode_int(header_size, 8) + def signed_sum(octets) -> int: + return sum(b if b < 128 else 256 - b for b in octets) + + if header.chksum[6:8] not in (b"\x00 ", b" \x00"): + logger.error( + "Invalid checksum format", + actual_last_2_bytes=header.chksum[6:8], + handler=self.NAME, + ) + return None + checksum = decode_int(header.chksum[:6], 8) + header_bytes_for_checksum = ( + file[start_offset : start_offset + 148] + + b" " * 8 # chksum field is replaced with "blanks" + + file[start_offset + 156 : start_offset + 257] + ) + extended_header_bytes = file[start_offset + 257 : start_offset + 500] + calculated_checksum_unsigned = sum(header_bytes_for_checksum) + 
calculated_checksum_signed = signed_sum(header_bytes_for_checksum) + checksums = ( + calculated_checksum_unsigned, + calculated_checksum_unsigned + sum(extended_header_bytes), + # signed is of historical interest, calculating for the extended header is not needed + calculated_checksum_signed, + ) + if checksum not in checksums: + logger.error( + "Tar header checksum mismatch", expected=str(checksum), actual=checksums + ) + return None + end_offset = _get_tar_end_offset(file, start_offset) if end_offset == -1: return None