diff --git a/tests/integration/compression/gzip/__input__/multi-volume-digit-hash.tar b/tests/integration/compression/gzip/__input__/multi-volume-digit-hash.tar new file mode 100644 index 0000000000..7c42d73af9 --- /dev/null +++ b/tests/integration/compression/gzip/__input__/multi-volume-digit-hash.tar @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a10c7419ea665deb88d4f7bd0cc7685e36e071562d8ce047feb91a9d993c9c87 +size 10240 diff --git a/tests/integration/compression/gzip/__input__/multi-volume-digit.tar b/tests/integration/compression/gzip/__input__/multi-volume-digit.tar new file mode 100644 index 0000000000..8e06622d05 --- /dev/null +++ b/tests/integration/compression/gzip/__input__/multi-volume-digit.tar @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5b916eb8cddce9d9f8e9b8ac9d3ea48b4fd73249dc86bf71ea59f83f6dda931 +size 10240 diff --git a/tests/integration/compression/gzip/__input__/multi-volume-split-then-gzip.tar b/tests/integration/compression/gzip/__input__/multi-volume-split-then-gzip.tar new file mode 100644 index 0000000000..fbd8d5f19b --- /dev/null +++ b/tests/integration/compression/gzip/__input__/multi-volume-split-then-gzip.tar @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a329453dfdaf817fa1ff384227575bedc18a5d8ca6c014132dc77ac6849be2d +size 10240 diff --git a/tests/integration/compression/gzip/__input__/multi-volume.tar b/tests/integration/compression/gzip/__input__/multi-volume.tar new file mode 100644 index 0000000000..2f252aa65f --- /dev/null +++ b/tests/integration/compression/gzip/__input__/multi-volume.tar @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d605c55b80ae5c57f6a8268d17a1d41d816dfeda7829ba8a488d62199e7c891 +size 10240 diff --git a/tests/integration/compression/gzip/__output__/multi-volume-digit-hash.tar_extract/multi-part-file.gz.01 
b/tests/integration/compression/gzip/__output__/multi-volume-digit-hash.tar_extract/multi-part-file.gz.01 new file mode 100644 index 0000000000..e94a5d1aab --- /dev/null +++ b/tests/integration/compression/gzip/__output__/multi-volume-digit-hash.tar_extract/multi-part-file.gz.01 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:727525bd9f74dfc7046a7767011b2952bee7031638b1d2fa01c7830beec8f200 +size 50 diff --git a/tests/integration/compression/gzip/__output__/multi-volume-digit-hash.tar_extract/multi-part-file.gz.02 b/tests/integration/compression/gzip/__output__/multi-volume-digit-hash.tar_extract/multi-part-file.gz.02 new file mode 100644 index 0000000000..c1dbca68e6 --- /dev/null +++ b/tests/integration/compression/gzip/__output__/multi-volume-digit-hash.tar_extract/multi-part-file.gz.02 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4bec36068029bb2e2158c0030e062b5350ea23e9acd8ae3f45256057b481401b +size 10 diff --git a/tests/integration/compression/gzip/__output__/multi-volume-digit-hash.tar_extract/multi-part-file.gz.md5 b/tests/integration/compression/gzip/__output__/multi-volume-digit-hash.tar_extract/multi-part-file.gz.md5 new file mode 100644 index 0000000000..b6b445168d --- /dev/null +++ b/tests/integration/compression/gzip/__output__/multi-volume-digit-hash.tar_extract/multi-part-file.gz.md5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c235d418bc607a12389cb89381279d33e56126847512cc05611333d653464345 +size 33 diff --git a/tests/integration/compression/gzip/__output__/multi-volume-digit-hash.tar_extract/multi-part-file.gz_extract/one.txt b/tests/integration/compression/gzip/__output__/multi-volume-digit-hash.tar_extract/multi-part-file.gz_extract/one.txt new file mode 100644 index 0000000000..7d58ca049a --- /dev/null +++ b/tests/integration/compression/gzip/__output__/multi-volume-digit-hash.tar_extract/multi-part-file.gz_extract/one.txt @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:76ff0f745425182c35616510e1ed2781339350102bcd1f0167842248e0649400 +size 47 diff --git a/tests/integration/compression/gzip/__output__/multi-volume-digit.tar_extract/multi-part-file.gz.01 b/tests/integration/compression/gzip/__output__/multi-volume-digit.tar_extract/multi-part-file.gz.01 new file mode 100644 index 0000000000..e94a5d1aab --- /dev/null +++ b/tests/integration/compression/gzip/__output__/multi-volume-digit.tar_extract/multi-part-file.gz.01 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:727525bd9f74dfc7046a7767011b2952bee7031638b1d2fa01c7830beec8f200 +size 50 diff --git a/tests/integration/compression/gzip/__output__/multi-volume-digit.tar_extract/multi-part-file.gz.02 b/tests/integration/compression/gzip/__output__/multi-volume-digit.tar_extract/multi-part-file.gz.02 new file mode 100644 index 0000000000..c1dbca68e6 --- /dev/null +++ b/tests/integration/compression/gzip/__output__/multi-volume-digit.tar_extract/multi-part-file.gz.02 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4bec36068029bb2e2158c0030e062b5350ea23e9acd8ae3f45256057b481401b +size 10 diff --git a/tests/integration/compression/gzip/__output__/multi-volume-digit.tar_extract/multi-part-file.gz_extract/one.txt b/tests/integration/compression/gzip/__output__/multi-volume-digit.tar_extract/multi-part-file.gz_extract/one.txt new file mode 100644 index 0000000000..7d58ca049a --- /dev/null +++ b/tests/integration/compression/gzip/__output__/multi-volume-digit.tar_extract/multi-part-file.gz_extract/one.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76ff0f745425182c35616510e1ed2781339350102bcd1f0167842248e0649400 +size 47 diff --git a/tests/integration/compression/gzip/__output__/multi-volume-split-then-gzip.tar_extract/one.txt.gz.aa b/tests/integration/compression/gzip/__output__/multi-volume-split-then-gzip.tar_extract/one.txt.gz.aa new file mode 100644 index 
0000000000..c1031812c5 --- /dev/null +++ b/tests/integration/compression/gzip/__output__/multi-volume-split-then-gzip.tar_extract/one.txt.gz.aa @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef248dcb11bd9196aeff4f075ec6f10dd44f46d66c4d827a0a07f3b1ede03af6 +size 35 diff --git a/tests/integration/compression/gzip/__output__/multi-volume-split-then-gzip.tar_extract/one.txt.gz.ab b/tests/integration/compression/gzip/__output__/multi-volume-split-then-gzip.tar_extract/one.txt.gz.ab new file mode 100644 index 0000000000..ab32f08213 --- /dev/null +++ b/tests/integration/compression/gzip/__output__/multi-volume-split-then-gzip.tar_extract/one.txt.gz.ab @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3813261d2ae514bc31593c166c53391312d07507a5c952b9d7b059a809ec747a +size 38 diff --git a/tests/integration/compression/gzip/__output__/multi-volume-split-then-gzip.tar_extract/one.txt.gz.ac b/tests/integration/compression/gzip/__output__/multi-volume-split-then-gzip.tar_extract/one.txt.gz.ac new file mode 100644 index 0000000000..4d08b84e4b --- /dev/null +++ b/tests/integration/compression/gzip/__output__/multi-volume-split-then-gzip.tar_extract/one.txt.gz.ac @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f0d4cd80a4203aef29742919f572c9624660864bdb8bd87eb2050575829f386 +size 38 diff --git a/tests/integration/compression/gzip/__output__/multi-volume-split-then-gzip.tar_extract/one.txt.gz_extract/one.txt b/tests/integration/compression/gzip/__output__/multi-volume-split-then-gzip.tar_extract/one.txt.gz_extract/one.txt new file mode 100644 index 0000000000..7296ddbe97 --- /dev/null +++ b/tests/integration/compression/gzip/__output__/multi-volume-split-then-gzip.tar_extract/one.txt.gz_extract/one.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7705c86fc28729ea1f35b4453760a85b5b4e100e64b346d3dd13b9dd9da8f6f +size 21 diff --git 
a/tests/integration/compression/gzip/__output__/multi-volume.tar_extract/multi-part-file.gz.aa b/tests/integration/compression/gzip/__output__/multi-volume.tar_extract/multi-part-file.gz.aa new file mode 100644 index 0000000000..e94a5d1aab --- /dev/null +++ b/tests/integration/compression/gzip/__output__/multi-volume.tar_extract/multi-part-file.gz.aa @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:727525bd9f74dfc7046a7767011b2952bee7031638b1d2fa01c7830beec8f200 +size 50 diff --git a/tests/integration/compression/gzip/__output__/multi-volume.tar_extract/multi-part-file.gz.ab b/tests/integration/compression/gzip/__output__/multi-volume.tar_extract/multi-part-file.gz.ab new file mode 100644 index 0000000000..c1dbca68e6 --- /dev/null +++ b/tests/integration/compression/gzip/__output__/multi-volume.tar_extract/multi-part-file.gz.ab @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4bec36068029bb2e2158c0030e062b5350ea23e9acd8ae3f45256057b481401b +size 10 diff --git a/tests/integration/compression/gzip/__output__/multi-volume.tar_extract/multi-part-file.gz_extract/one.txt b/tests/integration/compression/gzip/__output__/multi-volume.tar_extract/multi-part-file.gz_extract/one.txt new file mode 100644 index 0000000000..7d58ca049a --- /dev/null +++ b/tests/integration/compression/gzip/__output__/multi-volume.tar_extract/multi-part-file.gz_extract/one.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76ff0f745425182c35616510e1ed2781339350102bcd1f0167842248e0649400 +size 47 diff --git a/unblob/handlers/__init__.py b/unblob/handlers/__init__.py index 6b60d73293..df12eb0df7 100644 --- a/unblob/handlers/__init__.py +++ b/unblob/handlers/__init__.py @@ -101,4 +101,7 @@ engenius.EngeniusHandler, ) -BUILTIN_DIR_HANDLERS: DirectoryHandlers = (sevenzip.MultiVolumeSevenZipHandler,) +BUILTIN_DIR_HANDLERS: DirectoryHandlers = ( + sevenzip.MultiVolumeSevenZipHandler, + gzip.MultiVolumeGzipHandler, +) diff --git 
class MultiGZIPExtractor(DirectoryExtractor):
    """Extract a gzip stream that is stored as a set of volume files.

    The actual decompression is delegated to the external ``7z`` binary
    through ``MultiFileCommand``.
    """

    def get_dependencies(self) -> List[str]:
        """Return the external commands this extractor relies on."""
        return ["7z"]

    def extract(self, paths: List[Path], outdir: Path) -> Optional[ExtractResult]:
        """Decompress the volumes in *paths* into *outdir*.

        The output file name is the name embedded in the gzip header of the
        first path, or ``"gzip.uncompressed"`` when no name is embedded.

        :param paths: volume files; the first entry is the one whose header
            is inspected for the embedded name.
        :param outdir: directory handed to the command wrapper.
        :return: whatever ``MultiFileCommand.extract`` returns.
        """
        name = get_gzip_embedded_name(paths[0]) or "gzip.uncompressed"
        # "-so" makes 7z write the decompressed data to stdout; the wrapper is
        # asked to store that stream as `name`.
        # NOTE(review): "-p" with no value presumably supplies an empty
        # password so 7z never blocks on an interactive prompt - confirm.
        extractor = MultiFileCommand(
            "7z", "x", "-p", "-y", "{inpath}", "-so", stdout=name
        )
        return extractor.extract(paths, outdir)


class MultiVolumeGzipHandler(DirectoryHandler):
    """Directory handler that groups split gzip volumes into one ``MultiFile``.

    Candidates are files matching ``*.gz.*`` (for example ``foo.gz.aa`` or
    ``foo.gz.01``). All siblings that share the candidate's stem are grouped
    together and reported once, for the first path in sorted order.
    """

    NAME = "multi-gzip"
    EXTRACTOR = MultiGZIPExtractor()

    PATTERN = Glob("*.gz.*")

    def is_valid_gzip(self, path: Path) -> bool:
        """Return ``True`` when a gzip header can be read from *path*.

        :param path: file to probe.
        :return: ``False`` when the file cannot be opened as a ``File``, when
            ``read_header()`` reports no header, or when it raises
            ``gzip.BadGzipFile``; ``True`` otherwise.
        """
        try:
            file = File.from_path(path)
        except ValueError:
            # NOTE(review): File looks mmap-backed, and mmap refuses to map a
            # zero-length file with ValueError - confirm. Such a file cannot
            # hold a gzip header either way, so it is reported as invalid
            # instead of aborting the directory scan.
            return False

        with file as f:
            try:
                return bool(SingleMemberGzipReader(f).read_header())
            except gzip.BadGzipFile:
                return False

    def calculate_multifile(self, file: Path) -> Optional[MultiFile]:
        """Build the ``MultiFile`` that *file* is the first volume of.

        :param file: a path that matched ``PATTERN``.
        :return: a ``MultiFile`` covering every existing sibling that shares
            the stem of *file*, or ``None`` when *file* is not the first of
            those siblings or does not start with a valid gzip header.
        """
        # Local import: the module-level import block is not part of this
        # block, and the stdlib module must not be confused with `Glob`.
        from glob import escape as glob_escape

        # The stem is user-controlled and may contain glob metacharacters
        # ("[", "*", "?"); without escaping the pattern may not even match
        # `file` itself, leaving an empty list behind.
        pattern = f"{glob_escape(file.stem)}.*"

        # Every sibling sharing the stem is collected, so a sidecar such as
        # "foo.gz.md5" ends up in the list too. Entries whose target does not
        # exist (dangling symlinks) are dropped: they cannot be volumes and
        # would make the stat() call below raise.
        paths = sorted(p for p in file.parent.glob(pattern) if p.exists())

        # we 'discard' paths that are not the first in the ordered list,
        # otherwise we will end up with colliding reports, one for every
        # path in the list.
        if not paths or file != paths[0]:
            return None

        if not self.is_valid_gzip(file):
            return None

        files_size = sum(path.stat().st_size for path in paths)
        logger.debug(
            "Multi-volume files", paths=paths, files_size=files_size, _verbosity=2
        )

        return MultiFile(
            name=paths[0].stem,
            paths=paths,
        )