From d01593e571264e02f3bfc66184cf868bbc3cfca0 Mon Sep 17 00:00:00 2001
From: Taus
Date: Fri, 18 Oct 2024 14:59:58 +0000
Subject: [PATCH 1/3] Python: Add test for string encoding dataset check

Note that this test checks that the current setup creates dataset check
violations. A later commit will fix this (and flip the negation in the test).
---
 .../string-encoding/repo_dir/test.py          |  2 ++
 .../string-encoding/test.sh                   | 19 +++++++++++++++++++
 2 files changed, 21 insertions(+)
 create mode 100644 python/extractor/cli-integration-test/string-encoding/repo_dir/test.py
 create mode 100755 python/extractor/cli-integration-test/string-encoding/test.sh

diff --git a/python/extractor/cli-integration-test/string-encoding/repo_dir/test.py b/python/extractor/cli-integration-test/string-encoding/repo_dir/test.py
new file mode 100644
index 000000000000..8e7efcaf9260
--- /dev/null
+++ b/python/extractor/cli-integration-test/string-encoding/repo_dir/test.py
@@ -0,0 +1,2 @@
+"\uD800"
+"?"
diff --git a/python/extractor/cli-integration-test/string-encoding/test.sh b/python/extractor/cli-integration-test/string-encoding/test.sh
new file mode 100755
index 000000000000..a8cf5afb824b
--- /dev/null
+++ b/python/extractor/cli-integration-test/string-encoding/test.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+set -Eeuo pipefail # see https://vaneyckt.io/posts/safer_bash_scripts_with_set_euxo_pipefail/
+
+set -x
+
+CODEQL=${CODEQL:-codeql}
+
+SCRIPTDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+cd "$SCRIPTDIR"
+
+rm -rf db
+
+$CODEQL database create db --language python --source-root repo_dir/
+
+# Note the negation in front -- it witnesses the fact that currently the dataset check FAILS.
+! $CODEQL dataset check db/db-python
+
+echo "Test successfully completed."

From cc39ae57dc67a3bfc534346fd783ac01d011e49e Mon Sep 17 00:00:00 2001
From: Taus
Date: Mon, 21 Oct 2024 15:31:16 +0000
Subject: [PATCH 2/3] Python: Fix dataset check error for string encoding

Here's an example of one of these errors:

```
INVALID_KEY predicate py_cobjectnames(@py_cobject obj, string name)
The key set {obj} does not functionally determine all fields.
Here is a pair of tuples that agree on the key set but differ at index 1:
Tuple 1 in row 63874: (72088,"u''")
Tuple 2 in row 63875: (72088,"u''")
```

(Here, the substring `X` should really be the Unicode character U+FFFD, but
for some reason I'm not allowed to put that in this commit message.)

Inside the extractor, we assign IDs based on the string type (bytestring or
Unicode) and a hash of the UTF-8 encoded content of the string. In this case,
however, certain _different_ strings were receiving the same hash, due to
replacement characters in the encoding process. In particular, we were
converting unencodable characters to question marks in one place, and to
U+FFFD in another place. This caused a discrepancy that led to the dataset
check error.

To fix this, we put in a custom error handler that always puts the U+FFFD
character in place of unencodable characters. With this, the strings produced
in both places now agree, and hence there is no clash.
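To make the clash concrete, here is a minimal standalone sketch (illustrative only, not extractor code) of how the built-in 'replace' handler collapses the unencodable surrogate from test.py and a literal '?' onto the same UTF-8 bytes, and hence the same SHA-1 digest:

```
import hashlib

# With the built-in 'replace' handler, the lone surrogate U+D800 cannot be
# encoded and is substituted with '?', making it indistinguishable from a
# genuine '?' string literal.
surrogate_bytes = "\uD800".encode("utf8", errors="replace")  # b'?'
question_bytes = "?".encode("utf8", errors="replace")        # b'?'

assert surrogate_bytes == question_bytes
# Same bytes, same SHA-1 digest -- and hence the same hash-based object ID
# for two different source strings, which is what the INVALID_KEY error reports.
assert hashlib.sha1(surrogate_bytes).hexdigest() == hashlib.sha1(question_bytes).hexdigest()
```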
---
 .../extractor/semmle/python/passes/objects.py | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/python/extractor/semmle/python/passes/objects.py b/python/extractor/semmle/python/passes/objects.py
index 599539bc5419..335603c131d7 100644
--- a/python/extractor/semmle/python/passes/objects.py
+++ b/python/extractor/semmle/python/passes/objects.py
@@ -43,6 +43,23 @@
 
 LITERALS = (ast.Num, ast.Str)
 
+# A variant of the 'replace' error handler that replaces unencodable characters with U+FFFD
+# rather than '?'. Without this, a string like '\uD800' (which is not encodable) would get mapped
+# to '?', and potentially clash with the regular string '?' if it appeared elsewhere in the source
+# code. Used in 'get_label_for_object' below. Based on code from https://peps.python.org/pep-0293/
+def fffd_replace(exc):
+    if isinstance(exc, UnicodeEncodeError):
+        return ((exc.end-exc.start)*u"\ufffd", exc.end)
+    elif isinstance(exc, UnicodeDecodeError):
+        return (u"\ufffd", exc.end)
+    elif isinstance(exc, UnicodeTranslateError):
+        return ((exc.end-exc.start)*u"\ufffd", exc.end)
+    else:
+        raise TypeError("can't handle %s" % exc.__name__)
+
+import codecs
+codecs.register_error("fffdreplace", fffd_replace)
+
 class _CObject(object):
     '''Utility class to wrap arbitrary C objects.
     Treat all objects as unique. Rely on naming in the
@@ -239,7 +256,7 @@ def get_label_for_object(self, obj, default_label, obj_type):
             else:
                 prefix = u"C_bytes$"
             if t is str:
-                obj = obj.encode("utf8", errors='replace')
+                obj = obj.encode("utf8", errors='fffdreplace')
             return prefix + hashlib.sha1(obj).hexdigest()
         if t is bytes:
             return prefix + hashlib.sha1(obj).hexdigest()

From ae4a4bb881d95371e58d9f9eb4da31d13fdbd41d Mon Sep 17 00:00:00 2001
From: Taus
Date: Mon, 21 Oct 2024 15:32:23 +0000
Subject: [PATCH 3/3] Python: Flip test expectation

This test should now validate that we no longer have dataset check errors
even when there are unencodable characters.
---
 python/extractor/cli-integration-test/string-encoding/test.sh | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/python/extractor/cli-integration-test/string-encoding/test.sh b/python/extractor/cli-integration-test/string-encoding/test.sh
index a8cf5afb824b..3bf1a6b03014 100755
--- a/python/extractor/cli-integration-test/string-encoding/test.sh
+++ b/python/extractor/cli-integration-test/string-encoding/test.sh
@@ -13,7 +13,6 @@ rm -rf db
 
 $CODEQL database create db --language python --source-root repo_dir/
 
-# Note the negation in front -- it witnesses the fact that currently the dataset check FAILS.
-! $CODEQL dataset check db/db-python
+$CODEQL dataset check db/db-python
 
 echo "Test successfully completed."
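For reference, a rough standalone sketch of the behaviour the flipped test now relies on, using an illustrative handler name ("fffdreplace_demo") rather than the extractor's actual registration: unencodable characters are substituted with U+FFFD instead of '?', so the two strings in test.py encode, and therefore hash, differently.

```
import codecs
import hashlib

def fffd_replace_demo(exc):
    # Substitute one U+FFFD per unencodable character; a simplified (encode-only)
    # variant of the handler added to objects.py in the patch above.
    if isinstance(exc, UnicodeEncodeError):
        return ((exc.end - exc.start) * "\ufffd", exc.end)
    raise TypeError("can't handle %r" % exc)

codecs.register_error("fffdreplace_demo", fffd_replace_demo)

encoded_surrogate = "\uD800".encode("utf8", errors="fffdreplace_demo")  # b'\xef\xbf\xbd'
encoded_question = "?".encode("utf8", errors="fffdreplace_demo")        # b'?'

# Distinct bytes, therefore distinct hash-based IDs: no INVALID_KEY violation.
assert encoded_surrogate != encoded_question
assert hashlib.sha1(encoded_surrogate).hexdigest() != hashlib.sha1(encoded_question).hexdigest()
```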