From 2ffcb1bb7f2babb0cb7ef71e520bdfbbe601d0a2 Mon Sep 17 00:00:00 2001 From: Antoine Pecoraro Date: Thu, 25 Jul 2024 10:35:10 +0200 Subject: [PATCH] fix(handler): improve zip64 detection We improve the zip64 detection by trying to parse the zip64 early. If the zip file contains a zip64 header, we are sure it's a zip64. If it doesn't hold a zip64 header, we are sure it's not a zip64 and we fall back to zip32. In order to create the test files, we used these commands ``` cat somefile.txt | zip > zip64.zip zip -F zip64.zip --out zip64-without-cd.zip ``` --- .../zip/zip64/__input__/zip64-without-cd.zip | 3 ++ .../__output__/zip64-without-cd.zip_extract/- | 3 ++ unblob/handlers/archive/zip.py | 39 +++++++++---------- 3 files changed, 25 insertions(+), 20 deletions(-) create mode 100644 tests/integration/archive/zip/zip64/__input__/zip64-without-cd.zip create mode 100644 tests/integration/archive/zip/zip64/__output__/zip64-without-cd.zip_extract/- diff --git a/tests/integration/archive/zip/zip64/__input__/zip64-without-cd.zip b/tests/integration/archive/zip/zip64/__input__/zip64-without-cd.zip new file mode 100644 index 0000000000..a536305240 --- /dev/null +++ b/tests/integration/archive/zip/zip64/__input__/zip64-without-cd.zip @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4ea164d7b08ba3ee6dbea6808e7d5438ffbf317420e28d0c5e42b7090f42851 +size 126 diff --git a/tests/integration/archive/zip/zip64/__output__/zip64-without-cd.zip_extract/- b/tests/integration/archive/zip/zip64/__output__/zip64-without-cd.zip_extract/- new file mode 100644 index 0000000000..fb25b54492 --- /dev/null +++ b/tests/integration/archive/zip/zip64/__output__/zip64-without-cd.zip_extract/- @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:303980bcb9e9e6cdec515230791af8b0ab1aaa244b58a8d99152673aa22197d0 +size 6 diff --git a/unblob/handlers/archive/zip.py b/unblob/handlers/archive/zip.py index 0a63ec09fb..44f945e944 100644 --- a/unblob/handlers/archive/zip.py 
+++ b/unblob/handlers/archive/zip.py @@ -114,9 +114,9 @@ def is_zip64_eocd(end_of_central_directory): or end_of_central_directory.offset_of_cd == 0xFFFFFFFF ) - @staticmethod - def is_zip64_cd_file(file_header): + def has_zip64_tag(self, file): # see https://pkware.cachefly.net/webdocs/casestudies/APPNOTE.TXT section 4.3.9.2 + file_header = self.cparser_le.partial_cd_file_header_t(file) return ( file_header.file_size == 0xFFFFFFFF or file_header.compress_size == 0xFFFFFFFF @@ -144,23 +144,23 @@ def _parse_zip64(self, file: File, start_offset: int, offset: int): "Missing ZIP64 EOCD header record header in ZIP chunk." ) return zip64_eocd - raise InvalidInputFormat( - "Missing ZIP64 EOCD locator record header in ZIP chunk." - ) + return None + + def get_zip64_eocd(self, file, start_offset, offset, end_of_central_directory): + # some values in the CD can be FFFF, indicating its a zip64 + # if the offset of the CD is 0xFFFFFFFF, its definitely one + # otherwise we check every other header indicating zip64 + if self.is_zip64_eocd(end_of_central_directory): + return self._parse_zip64(file, start_offset, offset) - def is_zip64(self, file, start_offset, offset, end_of_central_directory): absolute_offset_of_cd = start_offset + end_of_central_directory.offset_of_cd if 0 < absolute_offset_of_cd < offset: file.seek(absolute_offset_of_cd, io.SEEK_SET) - file_header = self.cparser_le.partial_cd_file_header_t(file) - if self.is_zip64_cd_file(file_header): - return True + if self.has_zip64_tag(file): + return self._parse_zip64(file, start_offset, offset) - # some values in the CD can be FFFF, indicating its a zip64 - # if the offset of the CD is 0xFFFFFFFF, its definitely one - # otherwise we check every other header indicating zip64 - return self.is_zip64_eocd(end_of_central_directory) + return None def calculate_chunk(self, file: File, start_offset: int) -> Optional[ValidChunk]: has_encrypted_files = False @@ -173,9 +173,11 @@ def calculate_chunk(self, file: File, start_offset: 
int) -> Optional[ValidChunk] file.seek(offset, io.SEEK_SET) end_of_central_directory = self.parse_header(file) - if self.is_zip64(file, start_offset, offset, end_of_central_directory): - file.seek(offset, io.SEEK_SET) - end_of_central_directory = self._parse_zip64(file, start_offset, offset) + zip64_eocd = self.get_zip64_eocd( + file, start_offset, offset, end_of_central_directory + ) + if zip64_eocd is not None: + end_of_central_directory = zip64_eocd break # the EOCD offset is equal to the offset of CD + size of CD @@ -188,10 +190,7 @@ def calculate_chunk(self, file: File, start_offset: int) -> Optional[ValidChunk] if offset == end_of_central_directory_offset: break else: - if offset is None: - raise InvalidInputFormat("Missing EOCD record header in ZIP chunk.") - # if we can't find a valid 32bit ZIP EOCD, we fall back to ZIP64 - end_of_central_directory = self._parse_zip64(file, start_offset, offset) + raise InvalidInputFormat("Missing EOCD record header in ZIP chunk.") has_encrypted_files = self.has_encrypted_files( file, start_offset, end_of_central_directory