From d8b7d5ce98e51b7f559759fde7ba397f603d5215 Mon Sep 17 00:00:00 2001 From: insolor <2442833+insolor@users.noreply.github.com> Date: Fri, 8 Nov 2024 00:09:15 +0300 Subject: [PATCH 01/11] Some optimizations in extract_strings_from_raw_bytes --- .../extract_strings/from_raw_bytes.py | 51 ++++++++----------- tests/test_extract_strings.py | 7 +-- 2 files changed, 26 insertions(+), 32 deletions(-) diff --git a/dfint64_patch/extract_strings/from_raw_bytes.py b/dfint64_patch/extract_strings/from_raw_bytes.py index e143ed9..db1053f 100644 --- a/dfint64_patch/extract_strings/from_raw_bytes.py +++ b/dfint64_patch/extract_strings/from_raw_bytes.py @@ -20,38 +20,24 @@ def is_allowed(x: int) -> bool: return x in allowed or (ord(" ") <= x <= ASCII_MAX_CODE and x not in forbidden) -def possible_to_decode(c: bytes, encoding: str) -> bool: - try: - c.decode(encoding=encoding) - except UnicodeDecodeError: - return False - else: - return True - - -def check_string(buf: bytes | memoryview, encoding: str) -> tuple[int, int]: +def check_string(buf: bytes | memoryview) -> int | None: """ - Try to decode bytes as a string in the given encoding + Check that the buffer contain letters and doesn't contain forbidden characters + :param buf: byte buffer - :param encoding: string encoding - :return: (string_length: int, number_of_letters: int) + :return: number_of_letters: int """ - string_length = 0 number_of_letters = 0 for i, c in enumerate(buf): - if c == 0: - string_length = i - break - current_byte = bytes(buf[i : i + 1]) - if not is_allowed(c) or not possible_to_decode(current_byte, encoding): - break + if not is_allowed(c): + return None if current_byte.isalpha(): number_of_letters += 1 - return string_length, number_of_letters + return number_of_letters class ExtractedStringInfo(NamedTuple): @@ -76,13 +62,20 @@ def extract_strings_from_raw_bytes( view = memoryview(bytes_block) i = 0 - while i < len(view): - buffer_part = view[i:] - string_len, letters = check_string(buffer_part, 
encoding) - if string_len and letters: - string = bytes(view[i : i + string_len]).decode(encoding) - yield ExtractedStringInfo(Rva(base_address + i), string) - i += (string_len // alignment + 1) * alignment + while i < len(bytes_block): + if bytes_block[i] == b"\0": + i += alignment continue - i += alignment + end_index = bytes_block.index(b"\0", i) + string_len = end_index - i + buffer_part = view[i:end_index] + + if check_string(buffer_part): + try: + string = bytes(buffer_part).decode(encoding) + yield ExtractedStringInfo(Rva(base_address + i), string) + except UnicodeDecodeError: + pass + + i += (string_len // alignment + 1) * alignment diff --git a/tests/test_extract_strings.py b/tests/test_extract_strings.py index 5633e79..819c0d4 100644 --- a/tests/test_extract_strings.py +++ b/tests/test_extract_strings.py @@ -21,12 +21,13 @@ @pytest.mark.parametrize( ("test_data", "encoding", "expected"), [ - (b"12345\0", "cp437", (5, 0)), - (b"12345\xff\0", "utf-8", (0, 0)), + (b"12345", "cp437", 0), + (b"12345\xff", "utf-8", None), + (b"1234abc5", "utf-8", 3), ], ) def test_check_string(test_data: bytes, encoding: str, expected: tuple[int, int]): - assert check_string(test_data, encoding) == expected + assert check_string(test_data) == expected @pytest.mark.parametrize( From 91a440125c06545e7cc65cc8360745bad2401654 Mon Sep 17 00:00:00 2001 From: insolor <2442833+insolor@users.noreply.github.com> Date: Fri, 8 Nov 2024 00:22:21 +0300 Subject: [PATCH 02/11] Refactor extract_strings_from_raw_bytes code --- .../extract_strings/from_raw_bytes.py | 33 +++++++------------ tests/test_extract_strings.py | 10 +++--- 2 files changed, 16 insertions(+), 27 deletions(-) diff --git a/dfint64_patch/extract_strings/from_raw_bytes.py b/dfint64_patch/extract_strings/from_raw_bytes.py index db1053f..d869420 100644 --- a/dfint64_patch/extract_strings/from_raw_bytes.py +++ b/dfint64_patch/extract_strings/from_raw_bytes.py @@ -10,34 +10,23 @@ from dfint64_patch.type_aliases import RVA0, Rva 
-forbidden: set[int] = set(b"$^@") -allowed: set[int] = set() +forbidden: str = "$^@" -ASCII_MAX_CODE = 127 +ASCII_MAX_CHAR = chr(127) -def is_allowed(x: int) -> bool: - return x in allowed or (ord(" ") <= x <= ASCII_MAX_CODE and x not in forbidden) +def is_allowed(c: str) -> bool: + return " " <= c <= ASCII_MAX_CHAR and c not in forbidden -def check_string(buf: bytes | memoryview) -> int | None: +def check_string(buf: str) -> bool: """ Check that the buffer contain letters and doesn't contain forbidden characters :param buf: byte buffer :return: number_of_letters: int """ - - number_of_letters = 0 - for i, c in enumerate(buf): - current_byte = bytes(buf[i : i + 1]) - if not is_allowed(c): - return None - - if current_byte.isalpha(): - number_of_letters += 1 - - return number_of_letters + return any(c.isalpha() for c in buf) and all(is_allowed(c) for c in buf) class ExtractedStringInfo(NamedTuple): @@ -71,11 +60,11 @@ def extract_strings_from_raw_bytes( string_len = end_index - i buffer_part = view[i:end_index] - if check_string(buffer_part): - try: - string = bytes(buffer_part).decode(encoding) + try: + string = bytes(buffer_part).decode(encoding) + if check_string(string): yield ExtractedStringInfo(Rva(base_address + i), string) - except UnicodeDecodeError: - pass + except UnicodeDecodeError: + pass i += (string_len // alignment + 1) * alignment diff --git a/tests/test_extract_strings.py b/tests/test_extract_strings.py index 819c0d4..1636dba 100644 --- a/tests/test_extract_strings.py +++ b/tests/test_extract_strings.py @@ -19,14 +19,14 @@ @pytest.mark.parametrize( - ("test_data", "encoding", "expected"), + ("test_data", "expected"), [ - (b"12345", "cp437", 0), - (b"12345\xff", "utf-8", None), - (b"1234abc5", "utf-8", 3), + ("12345", False), + ("12345\xff", False), + ("1234abc5", True), ], ) -def test_check_string(test_data: bytes, encoding: str, expected: tuple[int, int]): +def test_check_string(test_data: str, expected: bool): assert check_string(test_data) == 
expected From a0ad351a49398de0654efe497f15ffe9018f5c45 Mon Sep 17 00:00:00 2001 From: insolor <2442833+insolor@users.noreply.github.com> Date: Fri, 8 Nov 2024 00:24:45 +0300 Subject: [PATCH 03/11] Remove memoryview usage --- dfint64_patch/cross_references/cross_references_relative.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/dfint64_patch/cross_references/cross_references_relative.py b/dfint64_patch/cross_references/cross_references_relative.py index b4cd588..6b214cc 100644 --- a/dfint64_patch/cross_references/cross_references_relative.py +++ b/dfint64_patch/cross_references/cross_references_relative.py @@ -22,14 +22,13 @@ def find_relative_cross_references( (e.g. `range(0x11000, 0x12000)`) or dict object. :return: Mapping[object_rva: Rva, cross_references: List[Rva]] """ - view = memoryview(bytes_block) result = defaultdict(list) if not isinstance(addresses, range | dict): addresses = set(addresses) for i in tqdm(range(len(bytes_block) - REFERENCE_SIZE + 1), desc="find_relative_cross_references"): - relative_offset = int.from_bytes(bytes(view[i : i + REFERENCE_SIZE]), byteorder="little", signed=True) + relative_offset = int.from_bytes(bytes_block[i : i + REFERENCE_SIZE], byteorder="little", signed=True) destination = Rva(base_address + i + REFERENCE_SIZE + relative_offset) From 679188fe50834d8e8246e241e1cadcf15b326d32 Mon Sep 17 00:00:00 2001 From: insolor <2442833+insolor@users.noreply.github.com> Date: Fri, 8 Nov 2024 00:39:52 +0300 Subject: [PATCH 04/11] Remove casts to Rva in find_relative_cross_references --- dfint64_patch/cross_references/cross_references_relative.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dfint64_patch/cross_references/cross_references_relative.py b/dfint64_patch/cross_references/cross_references_relative.py index 6b214cc..6709815 100644 --- a/dfint64_patch/cross_references/cross_references_relative.py +++ b/dfint64_patch/cross_references/cross_references_relative.py @@ -30,10 +30,10
@@ def find_relative_cross_references( for i in tqdm(range(len(bytes_block) - REFERENCE_SIZE + 1), desc="find_relative_cross_references"): relative_offset = int.from_bytes(bytes_block[i : i + REFERENCE_SIZE], byteorder="little", signed=True) - destination = Rva(base_address + i + REFERENCE_SIZE + relative_offset) + destination = base_address + i + REFERENCE_SIZE + relative_offset if destination in addresses: - result[destination].append(Rva(base_address + i)) + result[destination].append(base_address + i) return result From d235d4db3f3c493995ec773544616a61af1c0d5a Mon Sep 17 00:00:00 2001 From: insolor <2442833+insolor@users.noreply.github.com> Date: Fri, 8 Nov 2024 00:52:21 +0300 Subject: [PATCH 05/11] Move some code into a separate find_relative_cross_references_low function --- .../cross_references_relative.py | 23 +++++++++++++------ 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/dfint64_patch/cross_references/cross_references_relative.py b/dfint64_patch/cross_references/cross_references_relative.py index 6709815..a188832 100644 --- a/dfint64_patch/cross_references/cross_references_relative.py +++ b/dfint64_patch/cross_references/cross_references_relative.py @@ -9,6 +9,17 @@ REFERENCE_SIZE = 4 +def find_relative_cross_references_low( + bytes_block: bytes, base_address: Rva, addresses: range | dict | set +) -> Iterator[tuple[int, int]]: + for i in range(len(bytes_block) - REFERENCE_SIZE + 1): + relative_offset = int.from_bytes(bytes_block[i : i + REFERENCE_SIZE], byteorder="little", signed=True) + destination = base_address + i + REFERENCE_SIZE + relative_offset + + if destination in addresses: + yield destination, base_address + i + + def find_relative_cross_references( bytes_block: bytes, base_address: Rva, @@ -27,13 +38,11 @@ def find_relative_cross_references( if not isinstance(addresses, range | dict): addresses = set(addresses) - for i in tqdm(range(len(bytes_block) - REFERENCE_SIZE + 1), desc="find_relative_cross_references"): - 
relative_offset = int.from_bytes(bytes_block[i : i + REFERENCE_SIZE], byteorder="little", signed=True) - - destination = base_address + i + REFERENCE_SIZE + relative_offset - - if destination in addresses: - result[destination].append(base_address + i) + for destination, source in tqdm( + find_relative_cross_references_low(bytes_block, base_address, addresses), + desc="find_relative_cross_references", + ): + result[destination].append(source) return result From a390cb604fd917ee19d12eee3cebf87823016e34 Mon Sep 17 00:00:00 2001 From: insolor <2442833+insolor@users.noreply.github.com> Date: Fri, 8 Nov 2024 00:59:16 +0300 Subject: [PATCH 06/11] Remove memoryview usage --- dfint64_patch/extract_strings/from_raw_bytes.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/dfint64_patch/extract_strings/from_raw_bytes.py b/dfint64_patch/extract_strings/from_raw_bytes.py index d869420..aa02b23 100644 --- a/dfint64_patch/extract_strings/from_raw_bytes.py +++ b/dfint64_patch/extract_strings/from_raw_bytes.py @@ -48,8 +48,6 @@ def extract_strings_from_raw_bytes( :param encoding: string encoding :return: Iterator[ExtractedStringInfo] """ - view = memoryview(bytes_block) - i = 0 while i < len(bytes_block): if bytes_block[i] == b"\0": @@ -58,7 +56,7 @@ def extract_strings_from_raw_bytes( end_index = bytes_block.index(b"\0", i) string_len = end_index - i - buffer_part = view[i:end_index] + buffer_part = bytes_block[i:end_index] try: string = bytes(buffer_part).decode(encoding) From 6e15a67f64de6a44f72ef33e0ee71dfbec2663dc Mon Sep 17 00:00:00 2001 From: insolor <2442833+insolor@users.noreply.github.com> Date: Fri, 8 Nov 2024 01:01:14 +0300 Subject: [PATCH 07/11] Move string_len calculation --- dfint64_patch/extract_strings/from_raw_bytes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dfint64_patch/extract_strings/from_raw_bytes.py b/dfint64_patch/extract_strings/from_raw_bytes.py index aa02b23..bf81ea9 100644 --- 
a/dfint64_patch/extract_strings/from_raw_bytes.py +++ b/dfint64_patch/extract_strings/from_raw_bytes.py @@ -55,7 +55,6 @@ def extract_strings_from_raw_bytes( continue end_index = bytes_block.index(b"\0", i) - string_len = end_index - i buffer_part = bytes_block[i:end_index] try: @@ -65,4 +64,5 @@ def extract_strings_from_raw_bytes( except UnicodeDecodeError: pass + string_len = end_index - i i += (string_len // alignment + 1) * alignment From a6d05703366e26013f0c4abc3ca08733168275b8 Mon Sep 17 00:00:00 2001 From: insolor <2442833+insolor@users.noreply.github.com> Date: Fri, 8 Nov 2024 01:03:27 +0300 Subject: [PATCH 08/11] Fix mypy error --- dfint64_patch/cross_references/cross_references_relative.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dfint64_patch/cross_references/cross_references_relative.py b/dfint64_patch/cross_references/cross_references_relative.py index a188832..fafe1d9 100644 --- a/dfint64_patch/cross_references/cross_references_relative.py +++ b/dfint64_patch/cross_references/cross_references_relative.py @@ -10,7 +10,7 @@ def find_relative_cross_references_low( - bytes_block: bytes, base_address: Rva, addresses: range | dict | set + bytes_block: bytes, base_address: Rva, addresses: Iterable[int] ) -> Iterator[tuple[int, int]]: for i in range(len(bytes_block) - REFERENCE_SIZE + 1): relative_offset = int.from_bytes(bytes_block[i : i + REFERENCE_SIZE], byteorder="little", signed=True) From 78bb49f47a01ed4aeb1db94fa0fa915685aac636 Mon Sep 17 00:00:00 2001 From: insolor <2442833+insolor@users.noreply.github.com> Date: Fri, 8 Nov 2024 09:21:21 +0300 Subject: [PATCH 09/11] Add docs to find_relative_cross_references_low --- .../cross_references/cross_references_relative.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/dfint64_patch/cross_references/cross_references_relative.py b/dfint64_patch/cross_references/cross_references_relative.py index fafe1d9..c2fc437 100644 --- 
a/dfint64_patch/cross_references/cross_references_relative.py +++ b/dfint64_patch/cross_references/cross_references_relative.py @@ -12,6 +12,15 @@ def find_relative_cross_references_low( bytes_block: bytes, base_address: Rva, addresses: Iterable[int] ) -> Iterator[tuple[int, int]]: + """ + Analyse a block of bytes and try to find relative cross-references to the given objects' addresses. + Optimized hot loop, don't add extra stuff to the loop (like conversion to Rva etc.) + + :param bytes_block: bytes block to analyse + :param base_address: base address of the given block + :param addresses: an iterable of destination addresses + :return: pairs of destinations and source addresses + """ for i in range(len(bytes_block) - REFERENCE_SIZE + 1): relative_offset = int.from_bytes(bytes_block[i : i + REFERENCE_SIZE], byteorder="little", signed=True) destination = base_address + i + REFERENCE_SIZE + relative_offset From ec5a3a5ca75458a04a7c553e78aacad673b671f7 Mon Sep 17 00:00:00 2001 From: insolor <2442833+insolor@users.noreply.github.com> Date: Fri, 8 Nov 2024 09:52:52 +0300 Subject: [PATCH 10/11] Rename --- dfint64_patch/cross_references/cross_references_relative.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dfint64_patch/cross_references/cross_references_relative.py b/dfint64_patch/cross_references/cross_references_relative.py index c2fc437..00132e3 100644 --- a/dfint64_patch/cross_references/cross_references_relative.py +++ b/dfint64_patch/cross_references/cross_references_relative.py @@ -9,7 +9,7 @@ REFERENCE_SIZE = 4 -def find_relative_cross_references_low( +def find_relative_cross_references_loop( bytes_block: bytes, base_address: Rva, addresses: Iterable[int] ) -> Iterator[tuple[int, int]]: """ @@ -48,7 +48,7 @@ def find_relative_cross_references( addresses = set(addresses) for destination, source in tqdm( - find_relative_cross_references_low(bytes_block, base_address, addresses), + find_relative_cross_references_loop(bytes_block, 
base_address, addresses), desc="find_relative_cross_references", ): result[destination].append(source) From 45e6ea2813e89f8e85ebf7b0f4c3b526f60f5152 Mon Sep 17 00:00:00 2001 From: insolor <2442833+insolor@users.noreply.github.com> Date: Fri, 8 Nov 2024 09:55:09 +0300 Subject: [PATCH 11/11] docs --- dfint64_patch/cross_references/cross_references_relative.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dfint64_patch/cross_references/cross_references_relative.py b/dfint64_patch/cross_references/cross_references_relative.py index 00132e3..0f5d1ba 100644 --- a/dfint64_patch/cross_references/cross_references_relative.py +++ b/dfint64_patch/cross_references/cross_references_relative.py @@ -17,7 +17,8 @@ def find_relative_cross_references_loop( Optimized hot loop, don't add extra stuff to the loop (like conversion to Rva etc.) :param bytes_block: bytes block to analyse :param base_address: base address of the given block - :param addresses: an iterable of destination addresses + :param addresses: an iterable of destination addresses (preferably this should be of some type with fast "in" check, + like set, dict, range or short tuple) :return: pairs of destinations and source addresses """