From 8e2e11b322cb7ce3e121acb9dc7e11ad6b69ae7c Mon Sep 17 00:00:00 2001 From: Quentin Kaiser Date: Sun, 27 Oct 2024 12:47:33 +0100 Subject: [PATCH] =?UTF-8?q?feat(math):=20add=20=CF=87=C2=B2=20probability?= =?UTF-8?q?=20and=20convert=20EntropyReport=20to=20RandomnessReport?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce another randomness measure based on Chi Square probability by using unblob-native's chi_square_probability function. This function returns the Chi Square distribution probability. Chi-square tests are effective for distinguishing compressed from encrypted data because they evaluate the uniformity of byte distributions more rigorously than Shannon entropy. In compressed files, bytes often cluster around certain values due to patterns that still exist (albeit less detectable), resulting in a non-uniform distribution. Encrypted data, by contrast, exhibits nearly perfect uniformity, as each byte value from 0–255 is expected to appear with almost equal frequency, making it harder to detect any discernible patterns. The chi-square distribution is calculated for the stream of bytes in the chunk and expressed as an absolute number and a percentage which indicates how frequently a truly random sequence would exceed the value calculated. The percentage is the only value that is of interest from unblob's perspective, so that's why we only return it. According to ent doc⁰: > We [can] interpret the percentage as the degree to which the > sequence tested is suspected of being non-random. If the percentage is > greater than 99% or less than 1%, the sequence is almost certainly not > random. If the percentage is between 99% and 95% or between 1% and 5%, > the sequence is suspect. Percentages between 90% and 95% and 5% and 10% > indicate the sequence is “almost suspect”. 
[0] - https://www.fourmilab.ch/random/ This randomness measure is introduced by modifying the EntropyReport class so that it contains two RandomnessMeasurements: - shannon: for Shannon entropy, which was already there - chi_square: for Chi Square probability, which we introduce EntropyReport is renamed to RandomnessReport to reflect that not all measurements are entropy related. The format_entropy_plot function has been renamed to format_randomness_plot and adjusted to display two lines within the entropy graph: one for Shannon, the other for Chi Square. This commit breaks the previous API by converting entropy_depth and entropy_plot to randomness_depth and randomness_plot in ExtractionConfig. The '--entropy-depth' CLI option is replaced by '--randomness-depth'. --- docs/guide.md | 104 +++++++++++++++--------------- fuzzing/search_chunks_fuzzer.py | 4 +- tests/test_cleanup.py | 8 +-- tests/test_cli.py | 8 +-- tests/test_processing.py | 102 ++++++++++++++++++----------- tests/test_report.py | 10 +-- unblob/cli.py | 10 +-- unblob/models.py | 14 ++-- unblob/processing.py | 110 ++++++++++++++++++++------------ unblob/report.py | 10 ++- unblob/testing.py | 2 +- 11 files changed, 223 insertions(+), 159 deletions(-) diff --git a/docs/guide.md b/docs/guide.md index 19ec4ed850..ffb6f603e1 100644 --- a/docs/guide.md +++ b/docs/guide.md @@ -114,10 +114,10 @@ $ cat alpine-report.json ] ``` -### Entropy calculation +### Randomness calculation If you are analyzing an unknown file format, it might be useful to know the -entropy of the contained files, so you can quickly see for example whether the +randomness of the contained files, so you can quickly see for example whether the file is **encrypted** or contains some random content. 
Let's make a file with fully random content at the start and end: @@ -128,59 +128,61 @@ $ dd if=/dev/random of=random2.bin bs=10M count=1 $ cat random1.bin alpine-minirootfs-3.16.1-x86_64.tar.gz random2.bin > unknown-file ``` -A nice ASCII entropy plot is drawn on verbose level 3: +A nice ASCII randomness plot is drawn on verbose level 3: ```console $ unblob -vvv unknown-file | grep -C 15 "Entropy distribution" -2022-07-30 07:58.16 [debug ] Ended searching for chunks all_chunks=[0xa00000-0xc96196] pid=19803 -2022-07-30 07:58.16 [debug ] Removed inner chunks outer_chunk_count=1 pid=19803 removed_inner_chunk_count=0 -2022-07-30 07:58.16 [warning ] Found unknown Chunks chunks=[0x0-0xa00000, 0xc96196-0x1696196] pid=19803 -2022-07-30 07:58.16 [info ] Extracting unknown chunk chunk=0x0-0xa00000 path=unknown-file_extract/0-10485760.unknown pid=19803 -2022-07-30 07:58.16 [debug ] Carving chunk path=unknown-file_extract/0-10485760.unknown pid=19803 -2022-07-30 07:58.16 [debug ] Calculating entropy for file path=unknown-file_extract/0-10485760.unknown pid=19803 size=0xa00000 -2022-07-30 07:58.16 [debug ] Entropy calculated highest=99.99 lowest=99.98 mean=99.98 pid=19803 -2022-07-30 07:58.16 [warning ] Drawing plot pid=19803 -2022-07-30 07:58.16 [debug ] Entropy chart chart= - Entropy distribution - ┌---------------------------------------------------------------------------┐ -100┤•••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••│ - 90┤ │ - 80┤ │ - 70┤ │ - 60┤ │ - 50┤ │ - 40┤ │ - 30┤ │ - 20┤ │ - 10┤ │ - 0┤ │ - └┬---┬---┬---─┬--─┬--─┬--─┬--─┬--─┬--─┬--─┬--─┬--─┬--─┬--─┬--─┬--─┬--─┬--─┬--─┬┘ - 1 4 7 12 16 20 24 29 33 37 41 46 50 54 59 63 67 71 76 80 -[y] entropy % [x] mB - pid=19803 -2022-07-30 07:58.16 [info ] Extracting unknown chunk chunk=0xc96196-0x1696196 path=unknown-file_extract/13197718-23683478.unknown pid=19803 -2022-07-30 07:58.16 [debug ] Carving chunk path=unknown-file_extract/13197718-23683478.unknown pid=19803 -2022-07-30 07:58.16 [debug 
] Calculating entropy for file path=unknown-file_extract/13197718-23683478.unknown pid=19803 size=0xa00000 -2022-07-30 07:58.16 [debug ] Entropy calculated highest=99.99 lowest=99.98 mean=99.98 pid=19803 -2022-07-30 07:58.16 [warning ] Drawing plot pid=19803 -2022-07-30 07:58.16 [debug ] Entropy chart chart= - Entropy distribution - ┌---------------------------------------------------------------------------┐ -100┤•••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••│ - 90┤ │ - 80┤ │ - 70┤ │ - 60┤ │ - 50┤ │ - 40┤ │ - 30┤ │ - 20┤ │ - 10┤ │ - 0┤ │ - └┬---┬---┬---─┬--─┬--─┬--─┬--─┬--─┬--─┬--─┬--─┬--─┬--─┬--─┬--─┬--─┬--─┬--─┬--─┬┘ - 1 4 7 12 16 20 24 29 33 37 41 46 50 54 59 63 67 71 76 80 -[y] entropy % [x] mB +2024-10-30 10:52.03 [debug ] Calculating chunk for pattern match handler=arc pid=1963719 real_offset=0x1685f5b start_offset=0x1685f5b +2024-10-30 10:52.03 [debug ] Header parsed header= pid=1963719 +2024-10-30 10:52.03 [debug ] Ended searching for chunks all_chunks=[0xa00000-0xc96196] pid=1963719 +2024-10-30 10:52.03 [debug ] Removed inner chunks outer_chunk_count=1 pid=1963719 removed_inner_chunk_count=0 +2024-10-30 10:52.03 [warning ] Found unknown Chunks chunks=[0x0-0xa00000, 0xc96196-0x1696196] pid=1963719 +2024-10-30 10:52.03 [info ] Extracting unknown chunk chunk=0x0-0xa00000 path=unknown-file_extract/0-10485760.unknown pid=1963719 +2024-10-30 10:52.03 [debug ] Carving chunk path=unknown-file_extract/0-10485760.unknown pid=1963719 +2024-10-30 10:52.03 [debug ] Calculating randomness for file path=unknown-file_extract/0-10485760.unknown pid=1963719 size=0xa00000 +2024-10-30 10:52.03 [debug ] Shannon entropy calculated block_size=0x20000 highest=99.99 lowest=99.98 mean=99.98 path=unknown-file_extract/0-10485760.unknown pid=1963719 size=0xa00000 +2024-10-30 10:52.03 [debug ] Chi square probability calculated block_size=0x20000 highest=97.88 lowest=3.17 mean=52.76 path=unknown-file_extract/0-10485760.unknown pid=1963719 size=0xa00000 
+2024-10-30 10:52.03 [debug ] Entropy chart chart= + Randomness distribution + ┌───────────────────────────────────────────────────────────────────────────┐ +100┤ •• Shannon entropy (%) •••••••••♰••••••••••••••••••••••••••••••••••│ + 90┤ ♰♰ Chi square probability (%) ♰ ♰ ♰♰♰♰ ♰ ♰ ♰ │ + 80┤♰ ♰ ♰♰ ♰♰ ♰♰ ♰ ♰ ♰♰♰♰♰♰♰♰♰ ♰ ♰♰♰♰♰♰ ♰♰ ♰♰ │ + 70┤♰♰♰♰ ♰ ♰ ♰ ♰ ♰♰♰ ♰ ♰ ♰ ♰ ♰♰♰♰♰♰♰♰♰ ♰♰ ♰ ♰ ♰ ♰♰♰ ♰♰♰♰♰♰ │ + 60┤♰♰♰♰ ♰♰ ♰♰ ♰ ♰♰♰♰ ♰ ♰♰ ♰ ♰ ♰ ♰♰♰♰♰♰ ♰♰ ♰ ♰ ♰♰♰♰ ♰ ♰♰♰ ♰♰♰♰♰♰♰ │ + 50┤ ♰♰♰ ♰♰ ♰♰ ♰♰ ♰♰♰♰ ♰♰ ♰ ♰♰♰ ♰♰♰♰♰♰ ♰ ♰ ♰ ♰♰♰♰♰ ♰ ♰♰♰ ♰ ♰♰♰♰♰ ♰ │ + 40┤ ♰♰ ♰♰ ♰ ♰♰ ♰♰♰♰ ♰♰ ♰ ♰♰♰ ♰♰♰♰♰♰ ♰♰ ♰♰ ♰♰♰♰♰♰ ♰ ♰♰♰ ♰ ♰♰♰♰ ♰♰ ♰│ + 30┤ ♰ ♰♰ ♰♰ ♰♰♰♰ ♰ ♰♰ ♰♰ ♰♰ ♰ ♰♰ ♰ ♰ ♰♰♰ ♰ ♰ ♰♰ ♰ ♰♰♰ ♰♰ ♰ │ + 20┤ ♰♰ ♰♰ ♰♰♰ ♰ ♰♰ ♰ ♰♰ ♰ ♰ ♰ ♰ ♰ ♰ ♰♰ │ + 10┤ ♰ ♰ ♰ ♰ ♰ ♰♰ ♰ ♰ ♰♰ │ + 0┤ ♰ ♰ │ + └─┬──┬─┬──┬────┬───┬──┬──┬──┬───┬───┬──┬────┬───┬────┬──┬──┬────┬──┬───┬──┬─┘ + 0 2 5 7 11 16 20 23 27 30 34 38 42 47 51 56 60 63 68 71 76 79 + 131072 bytes + path=unknown-file_extract/0-10485760.unknown pid=1963719 +2024-10-30 10:52.03 [info ] Extracting unknown chunk chunk=0xc96196-0x1696196 path=unknown-file_extract/13197718-23683478.unknown pid=1963719 +2024-10-30 10:52.03 [debug ] Carving chunk path=unknown-file_extract/13197718-23683478.unknown pid=1963719 +2024-10-30 10:52.03 [debug ] Calculating randomness for file path=unknown-file_extract/13197718-23683478.unknown pid=1963719 size=0xa00000 +2024-10-30 10:52.03 [debug ] Shannon entropy calculated block_size=0x20000 highest=99.99 lowest=99.98 mean=99.98 path=unknown-file_extract/13197718-23683478.unknown pid=1963719 size=0xa00000 +2024-10-30 10:52.03 [debug ] Chi square probability calculated block_size=0x20000 highest=99.03 lowest=0.23 mean=42.62 path=unknown-file_extract/13197718-23683478.unknown pid=1963719 size=0xa00000 +2024-10-30 10:52.03 [debug ] Entropy chart chart= + Randomness distribution + ┌───────────────────────────────────────────────────────────────────────────┐ +100┤ •• Shannon entropy (%) •••••••••••••••••••••♰••••••••••••••••••••••│ + 90┤ ♰♰ Chi 
square probability (%) ♰ ♰♰ ♰ │ + 80┤♰♰ ♰♰ ♰♰ ♰ ♰♰ ♰ ♰♰ ♰ ♰♰ │ + 70┤♰ ♰ ♰ ♰ ♰ ♰ ♰ ♰ ♰ ♰ ♰♰ ♰♰ ♰♰♰ ♰ ♰♰ ♰♰ │ + 60┤ ♰ ♰♰ ♰ ♰ ♰ ♰ ♰♰♰♰♰ ♰♰ ♰♰ ♰♰ ♰ ♰ ♰♰♰ ♰♰ ♰ ♰ ♰♰ ♰ │ + 50┤ ♰ ♰♰♰ ♰ ♰ ♰ ♰ ♰ ♰♰♰♰ ♰ ♰♰ ♰ ♰♰♰ ♰ ♰ ♰ ♰♰♰ ♰♰ ♰ ♰ ♰♰ ♰♰ ♰ │ + 40┤ ♰♰♰♰ ♰♰ ♰♰ ♰ ♰ ♰♰ ♰♰♰ ♰♰♰ ♰♰♰ ♰♰ ♰ ♰ ♰ ♰♰ ♰ ♰♰ ♰ ♰ ♰ ♰ ♰♰♰ ♰♰ │ + 30┤ ♰♰♰♰ ♰♰ ♰♰ ♰♰ ♰♰ ♰♰ ♰♰♰♰♰ ♰♰ ♰ ♰ ♰ ♰♰ ♰♰♰ ♰ ♰ ♰ ♰ ♰ ♰ ♰ ♰│ + 20┤ ♰♰♰ ♰ ♰ ♰♰ ♰♰ ♰♰♰♰ ♰♰ ♰ ♰ ♰ ♰♰ ♰♰ ♰ ♰♰ ♰♰ ♰ ♰ │ + 10┤ ♰ ♰ ♰ ♰ ♰ ♰ ♰ ♰♰ ♰ ♰♰ ♰♰ ♰♰ ♰ ♰ ♰ │ + 0┤ ♰ ♰ ♰♰ ♰ ♰♰ │ + └─┬──┬─┬──┬────┬───┬──┬──┬──┬───┬───┬──┬────┬───┬────┬──┬──┬────┬──┬───┬──┬─┘ + 0 2 5 7 11 16 20 23 27 30 34 38 42 47 51 56 60 63 68 71 76 79 + 131072 bytes ``` ### Skip extraction with file magic diff --git a/fuzzing/search_chunks_fuzzer.py b/fuzzing/search_chunks_fuzzer.py index 64861dc4e3..4e0d83a95c 100755 --- a/fuzzing/search_chunks_fuzzer.py +++ b/fuzzing/search_chunks_fuzzer.py @@ -40,8 +40,8 @@ def test_search_chunks(data): config = ExtractionConfig( extract_root=Path("/dev/shm"), # noqa: S108 force_extract=True, - entropy_depth=0, - entropy_plot=False, + randomness_depth=0, + randomness_plot=False, skip_magic=[], skip_extension=[], skip_extraction=False, diff --git a/tests/test_cleanup.py b/tests/test_cleanup.py index 427b38d3d6..30acf7dfd4 100644 --- a/tests/test_cleanup.py +++ b/tests/test_cleanup.py @@ -50,7 +50,7 @@ def test_remove_extracted_chunks(input_file: Path, output_dir: Path): input_file.write_bytes(ZIP_BYTES) config = ExtractionConfig( extract_root=output_dir, - entropy_depth=0, + randomness_depth=0, ) all_reports = process_file(config, input_file) @@ -62,7 +62,7 @@ def test_keep_all_problematic_chunks(input_file: Path, output_dir: Path): input_file.write_bytes(DAMAGED_ZIP_BYTES) config = ExtractionConfig( extract_root=output_dir, - entropy_depth=0, + randomness_depth=0, ) all_reports = process_file(config, input_file) @@ -75,7 +75,7 @@ def test_keep_all_unknown_chunks(input_file: Path, output_dir: Path): input_file.write_bytes(b"unknown1" + ZIP_BYTES + b"unknown2") 
config = ExtractionConfig( extract_root=output_dir, - entropy_depth=0, + randomness_depth=0, ) all_reports = process_file(config, input_file) @@ -97,7 +97,7 @@ def test_keep_chunks_with_null_extractor(input_file: Path, output_dir: Path): input_file.write_bytes(b"some text") config = ExtractionConfig( extract_root=output_dir, - entropy_depth=0, + randomness_depth=0, handlers=(_HandlerWithNullExtractor,), ) all_reports = process_file(config, input_file) diff --git a/tests/test_cli.py b/tests/test_cli.py index 720015859a..9f11d3bbf4 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -184,7 +184,7 @@ def test_dir_for_file(tmp_path: Path): @pytest.mark.parametrize( - "params, expected_depth, expected_entropy_depth, expected_process_num, expected_verbosity, expected_progress_reporter", + "params, expected_depth, expected_randomness_depth, expected_process_num, expected_verbosity, expected_progress_reporter", [ pytest.param( [], @@ -233,7 +233,7 @@ def test_dir_for_file(tmp_path: Path): def test_archive_success( params, expected_depth: int, - expected_entropy_depth: int, + expected_randomness_depth: int, expected_process_num: int, expected_verbosity: int, expected_progress_reporter: Type[ProgressReporter], @@ -263,8 +263,8 @@ def test_archive_success( config = ExtractionConfig( extract_root=tmp_path, max_depth=expected_depth, - entropy_depth=expected_entropy_depth, - entropy_plot=bool(expected_verbosity >= 3), + randomness_depth=expected_randomness_depth, + randomness_plot=bool(expected_verbosity >= 3), process_num=expected_process_num, handlers=BUILTIN_HANDLERS, verbose=expected_verbosity, diff --git a/tests/test_processing.py b/tests/test_processing.py index 7da07c157f..d1534a8db3 100644 --- a/tests/test_processing.py +++ b/tests/test_processing.py @@ -2,6 +2,7 @@ import sys import zipfile from pathlib import Path +from statistics import mean from typing import Collection, List, Optional, Tuple, Type, TypeVar import attr @@ -25,20 +26,21 @@ from unblob.processing 
import ( ExtractionConfig, calculate_block_size, - calculate_entropy, + calculate_randomness, calculate_unknown_chunks, - format_entropy_plot, + format_randomness_plot, process_file, remove_inner_chunks, ) from unblob.report import ( ChunkReport, - EntropyReport, ExtractDirectoryExistsReport, FileMagicReport, HashReport, MultiFileCollisionReport, MultiFileReport, + RandomnessMeasurements, + RandomnessReport, StatReport, UnknownChunkReport, UnknownError, @@ -197,9 +199,16 @@ def test_calculate_block_size( ) -def test_format_entropy_plot_error(): +def test_format_randomness_plot_error(): with pytest.raises(TypeError): - format_entropy_plot(percentages=[], block_size=1024) + format_randomness_plot( + RandomnessReport( + shannon=RandomnessMeasurements(percentages=[], block_size=1024, mean=0), + chi_square=RandomnessMeasurements( + percentages=[], block_size=1024, mean=0 + ), + ) + ) @pytest.mark.parametrize( @@ -213,19 +222,22 @@ def test_format_entropy_plot_error(): pytest.param([100.0] * 100, "None", id="block_size-can-be-anything3"), ], ) -def test_format_entropy_plot_no_exception(percentages: List[float], block_size: int): - assert str(block_size) in format_entropy_plot( - percentages=percentages, - block_size=block_size, +def test_format_randomness_plot_no_exception(percentages: List[float], block_size: int): + assert str(block_size) in format_randomness_plot( + RandomnessReport( + shannon=RandomnessMeasurements( + percentages=percentages, block_size=block_size, mean=mean(percentages) + ), + chi_square=RandomnessMeasurements( + percentages=percentages, block_size=block_size, mean=mean(percentages) + ), + ) ) -def test_calculate_entropy_no_exception(): - report = calculate_entropy(Path(sys.executable)) - format_entropy_plot( - percentages=report.percentages, - block_size=report.block_size, - ) +def test_calculate_randomness_no_exception(): + report = calculate_randomness(Path(sys.executable)) + format_randomness_plot(report) @pytest.mark.parametrize( @@ -250,7 
+262,7 @@ def test_calculate_entropy_no_exception(): def test_ExtractionConfig_get_extract_dir_for( # noqa: N802 extract_root: str, path: str, result: str ): - cfg = ExtractionConfig(extract_root=Path(extract_root), entropy_depth=0) + cfg = ExtractionConfig(extract_root=Path(extract_root), randomness_depth=0) assert cfg.get_extract_dir_for(Path(path)) == Path(result) @@ -310,7 +322,7 @@ def test_process_file_prevents_double_extracts(tmp_path: Path, fw: Path): # ├── hello # └── world fw_extract_root = tmp_path / "fw_extract_root" - config = ExtractionConfig(extract_root=fw_extract_root, entropy_depth=0) + config = ExtractionConfig(extract_root=fw_extract_root, randomness_depth=0) process_result = process_file(config, fw) assert process_result.errors == [] extracted_fw_paths, outsiders = sort_paths( @@ -331,7 +343,9 @@ def test_process_file_prevents_double_extracts(tmp_path: Path, fw: Path): # ├── hello # └── world fw_extract_of_extract_root = tmp_path / "fw_extract_of_extract_root" - config = ExtractionConfig(extract_root=fw_extract_of_extract_root, entropy_depth=0) + config = ExtractionConfig( + extract_root=fw_extract_of_extract_root, randomness_depth=0 + ) process_result = process_file(config, extracted_fw_zip) # we expect exactly 1 problem reported, related to the extraction of "internal.zip" @@ -364,7 +378,7 @@ def test_processing_with_non_posix_paths(tmp_path: Path): file_with_non_unicode_dir.write_bytes(b"content") extract_root = tmp_path / "extract_root" - config = ExtractionConfig(extract_root=extract_root, entropy_depth=0) + config = ExtractionConfig(extract_root=extract_root, randomness_depth=0) for path in (non_unicode_file, file_with_non_unicode_dir): process_result = process_file(config, path) @@ -384,8 +398,8 @@ def test_processing_with_non_posix_paths(tmp_path: Path): ) -def test_entropy_calculation(tmp_path: Path): - """Process a file with unknown chunk and a zip file with entropy calculation enabled. 
+def test_randomness_calculation(tmp_path: Path): + """Process a file with unknown chunk and a zip file with randomness calculation enabled. The input file structure is - zip-chunk @@ -411,8 +425,8 @@ def test_entropy_calculation(tmp_path: Path): config = ExtractionConfig( extract_root=tmp_path / "extract_root", - entropy_depth=100, - entropy_plot=True, + randomness_depth=100, + randomness_plot=True, handlers=(handlers.archive.zip.ZIPHandler,), ) @@ -427,24 +441,36 @@ def get_all(file_name, report_type: Type[ReportType]) -> List[ReportType]: # ** verification - # the unknown chunk report for the second chunk for the input file should have an entropy report + # the unknown chunk report for the second chunk for the input file should have a randomness report # with a percentages (scaled up bits) of 64 items, for 0, 6, 8, 8, ... bits of entropies [unknown_chunk_report] = get_all("input-file", UnknownChunkReport) - unknown_entropy = unknown_chunk_report.entropy + unknown_randomness = unknown_chunk_report.randomness assert ( - unknown_entropy is not None + unknown_randomness is not None ) # removes pyright complaints for the below lines :( - assert unknown_entropy.percentages == [0.0, 75.0] + [100.0] * 62 - assert unknown_entropy.block_size == 1024 - assert round(unknown_entropy.mean, 2) == 98.05 # noqa: PLR2004 - assert unknown_entropy.highest == 100.0 # noqa: PLR2004 - assert unknown_entropy.lowest == 0.0 - - # we should have entropy calculated for files without extractions, except for empty files - assert get_all("empty.txt", EntropyReport) == [] - assert [EntropyReport(percentages=[100.0], block_size=1024, mean=100.0)] == get_all( - "0-255.bin", EntropyReport - ) + assert unknown_randomness.shannon.percentages == [0.0, 75.0] + [100.0] * 62 + assert unknown_randomness.shannon.block_size == 1024 + assert round(unknown_randomness.shannon.mean, 2) == 98.05 # noqa: PLR2004 + assert unknown_randomness.shannon.highest == 100.0 # noqa: PLR2004 + assert 
unknown_randomness.shannon.lowest == 0.0 + assert unknown_randomness.chi_square.percentages == [0.0, 0.0] + [100.0] * 62 + assert unknown_randomness.chi_square.block_size == 1024 + assert round(unknown_randomness.shannon.mean, 2) == 98.05 # noqa: PLR2004 + assert unknown_randomness.chi_square.highest == 100.0 # noqa: PLR2004 + assert unknown_randomness.chi_square.lowest == 0.0 + + # we should have randomness calculated for files without extractions, except for empty files + assert get_all("empty.txt", RandomnessReport) == [] + assert [ + RandomnessReport( + shannon=RandomnessMeasurements( + percentages=[100.0], block_size=1024, mean=100.0 + ), + chi_square=RandomnessMeasurements( + percentages=[100.0], block_size=1024, mean=100.0 + ), + ) + ] == get_all("0-255.bin", RandomnessReport) @pytest.mark.parametrize( @@ -580,7 +606,7 @@ def extraction_root(tmp_path: Path): def multi_file_extraction_config(extraction_root: Path): return ExtractionConfig( extract_root=extraction_root, - entropy_depth=0, + randomness_depth=0, handlers=(handlers.archive.zip.ZIPHandler, DummyTestHandler), dir_handlers=(SplitDirHandler,), ) diff --git a/tests/test_report.py b/tests/test_report.py index 0a2ee0d39d..8cc16565f1 100644 --- a/tests/test_report.py +++ b/tests/test_report.py @@ -40,7 +40,7 @@ def test_process_file_report_output_is_valid_json( ): assert not report_file.exists() - config = ExtractionConfig(extract_root=extract_root, entropy_depth=0) + config = ExtractionConfig(extract_root=extract_root, randomness_depth=0) process_file(config, input_file, report_file) # output must be a valid json file, that is not empty @@ -125,14 +125,14 @@ def hello_kitty_task_results( start_offset=0, end_offset=6, size=6, - entropy=None, + randomness=None, ), UnknownChunkReport( id=ANY, start_offset=131, end_offset=138, size=7, - entropy=None, + randomness=None, ), ChunkReport( id=padding_id, @@ -284,7 +284,7 @@ def hello_kitty_task_results( def test_flat_report_structure(hello_kitty: Path, 
extract_root): - config = ExtractionConfig(extract_root=extract_root, entropy_depth=0) + config = ExtractionConfig(extract_root=extract_root, randomness_depth=0) process_result = process_file(config, hello_kitty) task_results = get_normalized_task_results(process_result) @@ -408,7 +408,7 @@ def hello_kitty_container(tmp_path: Path, hello_kitty: Path) -> Path: def test_chunk_in_chunk_report_structure(hello_kitty_container: Path, extract_root): - config = ExtractionConfig(extract_root=extract_root, entropy_depth=0) + config = ExtractionConfig(extract_root=extract_root, randomness_depth=0) process_result = process_file(config, hello_kitty_container) task_results = get_normalized_task_results(process_result) diff --git a/unblob/cli.py b/unblob/cli.py index cb275e809c..72c7d81fd6 100755 --- a/unblob/cli.py +++ b/unblob/cli.py @@ -150,12 +150,12 @@ def __init__( ) @click.option( "-n", - "--entropy-depth", + "--randomness-depth", type=click.IntRange(0), default=1, show_default=True, help=( - "Entropy calculation depth. How deep should we calculate entropy for unknown files? " + "Entropy calculation depth. How deep should we calculate randomness for unknown files? " "1 means input files only, 0 turns it off." 
), ) @@ -257,7 +257,7 @@ def cli( force: bool, # noqa: FBT001 process_num: int, depth: int, - entropy_depth: int, + randomness_depth: int, skip_magic: Iterable[str], skip_extension: Iterable[str], clear_skip_magics: bool, # noqa: FBT001 @@ -285,8 +285,8 @@ def cli( extract_root=extract_root, force_extract=force, max_depth=depth, - entropy_depth=entropy_depth, - entropy_plot=bool(verbose >= 3), + randomness_depth=randomness_depth, + randomness_plot=bool(verbose >= 3), skip_extraction=skip_extraction, skip_magic=skip_magic, skip_extension=skip_extension, diff --git a/unblob/models.py b/unblob/models.py index 800d9d4779..52e5afcc8e 100644 --- a/unblob/models.py +++ b/unblob/models.py @@ -14,9 +14,9 @@ from .parser import hexstring2regex from .report import ( ChunkReport, - EntropyReport, ErrorReport, MultiFileReport, + RandomnessReport, Report, UnknownChunkReport, ) @@ -134,19 +134,19 @@ class UnknownChunk(Chunk): r"""Gaps between valid chunks or otherwise unknown chunks. Important for manual analysis, and analytical certainty: for example - entropy, other chunks inside it, metadata, etc. + randomness, other chunks inside it, metadata, etc. These are not extracted, just logged for information purposes and further analysis, - like most common bytes (like \x00 and \xFF), ASCII strings, high entropy, etc. + like most common bytes (like \x00 and \xFF), ASCII strings, high randomness, etc. """ - def as_report(self, entropy: Optional[EntropyReport]) -> UnknownChunkReport: + def as_report(self, randomness: Optional[RandomnessReport]) -> UnknownChunkReport: return UnknownChunkReport( id=self.id, start_offset=self.start_offset, end_offset=self.end_offset, size=self.size, - entropy=entropy, + randomness=randomness, ) @@ -155,12 +155,12 @@ class PaddingChunk(Chunk): r"""Gaps between valid chunks or otherwise unknown chunks. Important for manual analysis, and analytical certanity: for example - entropy, other chunks inside it, metadata, etc. 
+ randomness, other chunks inside it, metadata, etc. """ def as_report( self, - entropy: Optional[EntropyReport], # noqa: ARG002 + randomness: Optional[RandomnessReport], # noqa: ARG002 ) -> ChunkReport: return ChunkReport( id=self.id, diff --git a/unblob/processing.py b/unblob/processing.py index 4b95ea7651..393807842d 100644 --- a/unblob/processing.py +++ b/unblob/processing.py @@ -34,11 +34,12 @@ from .pool import make_pool from .report import ( CalculateMultiFileExceptionReport, - EntropyReport, ExtractDirectoryExistsReport, FileMagicReport, HashReport, MultiFileCollisionReport, + RandomnessMeasurements, + RandomnessReport, Report, StatReport, UnknownError, @@ -85,8 +86,8 @@ class ExtractionConfig: extract_root: Path = attr.field(converter=lambda value: value.resolve()) force_extract: bool = False - entropy_depth: int - entropy_plot: bool = False + randomness_depth: int + randomness_plot: bool = False max_depth: int = DEFAULT_DEPTH skip_magic: Iterable[str] = DEFAULT_SKIP_MAGIC skip_extension: Iterable[str] = DEFAULT_SKIP_EXTENSION @@ -527,10 +528,10 @@ def process(self): self._process_chunks(file, outer_chunks, unknown_chunks) else: # we don't consider whole files as unknown chunks, but we still want to - # calculate entropy for whole files which produced no valid chunks - entropy = self._calculate_entropy(self.task.path) - if entropy: - self.result.add_report(entropy) + # calculate randomness for whole files which produced no valid chunks + randomness = self._calculate_randomness(self.task.path) + if randomness: + self.result.add_report(randomness) def _process_chunks( self, @@ -543,28 +544,27 @@ def _process_chunks( if self.config.skip_extraction: for chunk in unknown_chunks: - self.result.add_report(chunk.as_report(entropy=None)) + self.result.add_report(chunk.as_report(randomness=None)) for chunk in outer_chunks: self.result.add_report(chunk.as_report(extraction_reports=[])) return for chunk in unknown_chunks: carved_unknown_path = 
carve_unknown_chunk(self.carve_dir, file, chunk) - entropy = self._calculate_entropy(carved_unknown_path) - self.result.add_report(chunk.as_report(entropy=entropy)) + randomness = self._calculate_randomness(carved_unknown_path) + self.result.add_report(chunk.as_report(randomness=randomness)) for chunk in outer_chunks: self._extract_chunk(file, chunk) - def _calculate_entropy(self, path: Path) -> Optional[EntropyReport]: - if self.task.depth < self.config.entropy_depth: - report = calculate_entropy(path) - if self.config.entropy_plot: + def _calculate_randomness(self, path: Path) -> Optional[RandomnessReport]: + if self.task.depth < self.config.randomness_depth: + report = calculate_randomness(path) + if self.config.randomness_plot: logger.debug( - "Entropy chart", + "Randomness chart", # New line so that chart title will be aligned correctly in the next line - chart="\n" - + format_entropy_plot(report.percentages, report.block_size), + chart="\n" + format_randomness_plot(report), path=path, _verbosity=3, ) @@ -701,15 +701,20 @@ def calculate_unknown_chunks( return unknown_chunks -def calculate_entropy(path: Path) -> EntropyReport: +def calculate_randomness(path: Path) -> RandomnessReport: """Calculate and log shannon entropy divided by 8 for the file in chunks. Shannon entropy returns the amount of information (in bits) of some numeric sequence. We calculate the average entropy of byte chunks, which in theory can contain 0-8 bits of entropy. We normalize it for visualization to a 0-100% scale, to make it easier to interpret the graph. + + The chi square distribution is calculated for the stream of bytes in the + chunk and expressed as an absolute number and a percentage which indicates + how frequently a truly random sequence would exceed the value calculated. 
""" - percentages = [] + shannon_percentages = [] + chi_square_percentages = [] # We could use the chunk size instead of another syscall, # but we rely on the actual file size written to the disk @@ -725,28 +730,50 @@ def calculate_entropy(path: Path) -> EntropyReport: max_limit=1024 * 1024, ) - entropy_sum = 0.0 + shannon_entropy_sum = 0.0 + chisquare_probability_sum = 0.0 with File.from_path(path) as file: for chunk in iterate_file(file, 0, file_size, buffer_size=block_size): - entropy = mt.shannon_entropy(chunk) - entropy_percentage = round(entropy / 8 * 100, 2) - percentages.append(entropy_percentage) - entropy_sum += entropy * len(chunk) - - report = EntropyReport( - percentages=percentages, - block_size=block_size, - mean=entropy_sum / file_size / 8 * 100, + shannon_entropy = mt.shannon_entropy(chunk) + shannon_entropy_percentage = round(shannon_entropy / 8 * 100, 2) + shannon_percentages.append(shannon_entropy_percentage) + shannon_entropy_sum += shannon_entropy * len(chunk) + + chi_square_probability = mt.chi_square_probability(chunk) + chisquare_probability_percentage = round(chi_square_probability * 100, 2) + chi_square_percentages.append(chisquare_probability_percentage) + chisquare_probability_sum += chi_square_probability * len(chunk) + + report = RandomnessReport( + shannon=RandomnessMeasurements( + percentages=shannon_percentages, + block_size=block_size, + mean=shannon_entropy_sum / file_size / 8 * 100, + ), + chi_square=RandomnessMeasurements( + percentages=chi_square_percentages, + block_size=block_size, + mean=chisquare_probability_sum / file_size * 100, + ), ) logger.debug( - "Entropy calculated", + "Shannon entropy calculated", + path=path, + size=file_size, + block_size=report.shannon.block_size, + mean=round(report.shannon.mean, 2), + highest=round(report.shannon.highest, 2), + lowest=round(report.shannon.lowest, 2), + ) + logger.debug( + "Chi square probability calculated", path=path, size=file_size, - block_size=report.block_size, - 
mean=round(report.mean, 2), - highest=round(report.highest, 2), - lowest=round(report.lowest, 2), + block_size=report.chi_square.block_size, + mean=round(report.chi_square.mean, 2), + highest=round(report.chi_square.highest, 2), + lowest=round(report.chi_square.lowest, 2), ) return report @@ -763,22 +790,25 @@ def calculate_block_size( return block_size # noqa: RET504 -def format_entropy_plot(percentages: List[float], block_size: int): +def format_randomness_plot(report: RandomnessReport): # start from scratch plt.clear_figure() # go colorless plt.clear_color() plt.title("Entropy distribution") - # plt.xlabel(humanize.naturalsize(block_size)) - plt.xlabel(f"{block_size} bytes") - plt.ylabel("entropy %") + plt.xlabel(f"{report.shannon.block_size} bytes") - plt.scatter(percentages, marker="dot") + plt.plot(report.shannon.percentages, label="Shannon entropy (%)", marker="dot") + plt.plot( + report.chi_square.percentages, + label="Chi square probability (%)", + marker="cross", + ) # 16 height leaves no gaps between the lines plt.plot_size(100, 16) plt.ylim(0, 100) # Draw ticks every 1Mb on the x axis. 
- plt.xticks(range(len(percentages) + 1)) + plt.xticks(range(len(report.shannon.percentages) + 1)) # Always show 0% and 100% plt.yticks(range(0, 101, 10)) diff --git a/unblob/report.py b/unblob/report.py index 2d241661d3..4c38abee4a 100644 --- a/unblob/report.py +++ b/unblob/report.py @@ -191,7 +191,7 @@ class FileMagicReport(Report): @attr.define(kw_only=True, frozen=True) -class EntropyReport(Report): +class RandomnessMeasurements: percentages: List[float] block_size: int mean: float @@ -205,6 +205,12 @@ def lowest(self): return min(self.percentages) +@attr.define(kw_only=True, frozen=True) +class RandomnessReport(Report): + shannon: RandomnessMeasurements + chi_square: RandomnessMeasurements + + @final @attr.define(kw_only=True, frozen=True) class ChunkReport(Report): @@ -224,7 +230,7 @@ class UnknownChunkReport(Report): start_offset: int end_offset: int size: int - entropy: Optional[EntropyReport] + randomness: Optional[RandomnessReport] @final diff --git a/unblob/testing.py b/unblob/testing.py index 75a786df34..82c83123b5 100644 --- a/unblob/testing.py +++ b/unblob/testing.py @@ -57,7 +57,7 @@ def gather_integration_tests(test_data_path: Path): def extraction_config(tmp_path: Path): config = ExtractionConfig( extract_root=tmp_path, - entropy_depth=0, + randomness_depth=0, keep_extracted_chunks=True, )