From d01593e571264e02f3bfc66184cf868bbc3cfca0 Mon Sep 17 00:00:00 2001
From: Taus
Date: Fri, 18 Oct 2024 14:59:58 +0000
Subject: [PATCH 1/3] Python: Add test for string encoding dataset check

Note that this test checks that the current setup creates dataset check
violations. A later commit will fix this (and flip the negation in the test).
---
 .../string-encoding/repo_dir/test.py          |  2 ++
 .../string-encoding/test.sh                   | 19 +++++++++++++++++++
 2 files changed, 21 insertions(+)
 create mode 100644 python/extractor/cli-integration-test/string-encoding/repo_dir/test.py
 create mode 100755 python/extractor/cli-integration-test/string-encoding/test.sh

diff --git a/python/extractor/cli-integration-test/string-encoding/repo_dir/test.py b/python/extractor/cli-integration-test/string-encoding/repo_dir/test.py
new file mode 100644
index 000000000000..8e7efcaf9260
--- /dev/null
+++ b/python/extractor/cli-integration-test/string-encoding/repo_dir/test.py
@@ -0,0 +1,2 @@
+"\uD800"
+"?"
diff --git a/python/extractor/cli-integration-test/string-encoding/test.sh b/python/extractor/cli-integration-test/string-encoding/test.sh
new file mode 100755
index 000000000000..a8cf5afb824b
--- /dev/null
+++ b/python/extractor/cli-integration-test/string-encoding/test.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+set -Eeuo pipefail # see https://vaneyckt.io/posts/safer_bash_scripts_with_set_euxo_pipefail/
+
+set -x
+
+CODEQL=${CODEQL:-codeql}
+
+SCRIPTDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+cd "$SCRIPTDIR"
+
+rm -rf db
+
+$CODEQL database create db --language python --source-root repo_dir/
+
+# Note the negation in front -- it witnesses the fact that currently the dataset check FAILS.
+! $CODEQL dataset check db/db-python
+
+echo "Test successfully completed."

From cc39ae57dc67a3bfc534346fd783ac01d011e49e Mon Sep 17 00:00:00 2001
From: Taus
Date: Mon, 21 Oct 2024 15:31:16 +0000
Subject: [PATCH 2/3] Python: Fix dataset check error for string encoding

Here's an example of one of these errors:

```
INVALID_KEY predicate py_cobjectnames(@py_cobject obj, string name)
The key set {obj} does not functionally determine all fields.
Here is a pair of tuples that agree on the key set but differ at index 1:
Tuple 1 in row 63874: (72088,"u''")
Tuple 2 in row 63875: (72088,"u''")
```

(Here, the substring `X` should really be the Unicode character U+FFFD, but
for some reason I'm not allowed to put that in this commit message.)

Inside the extractor, we assign IDs based on the string type (bytestring or
Unicode) and a hash of the UTF-8 encoded content of the string. In this case,
however, certain _different_ strings were receiving the same hash, due to
replacement characters in the encoding process. In particular, we were
converting unencodable characters to question marks in one place, and to
U+FFFD in another place. This caused a discrepancy that led to the dataset
check error.

To fix this, we put in a custom error handler that always puts the U+FFFD
character in place of unencodable characters. With this, the strings produced
in both places now agree, and hence there is no clash.
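To make the clash concrete, here is a minimal standalone sketch (illustrative only, not extractor code) of how the built-in 'replace' handler collapses the unencodable surrogate from test.py and a literal '?' onto the same UTF-8 bytes, and hence the same SHA-1 digest:

```
import hashlib

# With the built-in 'replace' handler, the lone surrogate U+D800 cannot be
# encoded and is substituted with '?', making it indistinguishable from a
# genuine '?' string literal.
surrogate_bytes = "\uD800".encode("utf8", errors="replace")  # b'?'
question_bytes = "?".encode("utf8", errors="replace")        # b'?'

assert surrogate_bytes == question_bytes
# Same bytes, same SHA-1 digest -- and hence the same hash-based object ID
# for two different source strings, which is what the INVALID_KEY error reports.
assert hashlib.sha1(surrogate_bytes).hexdigest() == hashlib.sha1(question_bytes).hexdigest()
```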
---
 .../extractor/semmle/python/passes/objects.py | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/python/extractor/semmle/python/passes/objects.py b/python/extractor/semmle/python/passes/objects.py
index 599539bc5419..335603c131d7 100644
--- a/python/extractor/semmle/python/passes/objects.py
+++ b/python/extractor/semmle/python/passes/objects.py
@@ -43,6 +43,23 @@
 
 LITERALS = (ast.Num, ast.Str)
 
+# A variant of the 'replace' error handler that replaces unencodable characters with U+FFFD
+# rather than '?'. Without this, a string like '\uD800' (which is not encodable) would get mapped
+# to '?', and potentially clash with the regular string '?' if it appeared elsewhere in the source
+# code. Used in 'get_label_for_object' below. Based on code from https://peps.python.org/pep-0293/
+def fffd_replace(exc):
+    if isinstance(exc, UnicodeEncodeError):
+        return ((exc.end-exc.start)*u"\ufffd", exc.end)
+    elif isinstance(exc, UnicodeDecodeError):
+        return (u"\ufffd", exc.end)
+    elif isinstance(exc, UnicodeTranslateError):
+        return ((exc.end-exc.start)*u"\ufffd", exc.end)
+    else:
+        raise TypeError("can't handle %s" % exc.__name__)
+
+import codecs
+codecs.register_error("fffdreplace", fffd_replace)
+
 class _CObject(object):
     '''Utility class to wrap arbitrary C objects.
     Treat all objects as unique. Rely on naming in the
@@ -239,7 +256,7 @@ def get_label_for_object(self, obj, default_label, obj_type):
             else:
                 prefix = u"C_bytes$"
             if t is str:
-                obj = obj.encode("utf8", errors='replace')
+                obj = obj.encode("utf8", errors='fffdreplace')
             return prefix + hashlib.sha1(obj).hexdigest()
         if t is bytes:
             return prefix + hashlib.sha1(obj).hexdigest()

From ae4a4bb881d95371e58d9f9eb4da31d13fdbd41d Mon Sep 17 00:00:00 2001
From: Taus
Date: Mon, 21 Oct 2024 15:32:23 +0000
Subject: [PATCH 3/3] Python: Flip test expectation

This test should now validate that we no longer have dataset check errors
even when there are unencodable characters.
---
 python/extractor/cli-integration-test/string-encoding/test.sh | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/python/extractor/cli-integration-test/string-encoding/test.sh b/python/extractor/cli-integration-test/string-encoding/test.sh
index a8cf5afb824b..3bf1a6b03014 100755
--- a/python/extractor/cli-integration-test/string-encoding/test.sh
+++ b/python/extractor/cli-integration-test/string-encoding/test.sh
@@ -13,7 +13,6 @@ rm -rf db
 
 $CODEQL database create db --language python --source-root repo_dir/
 
-# Note the negation in front -- it witnesses the fact that currently the dataset check FAILS.
-! $CODEQL dataset check db/db-python
+$CODEQL dataset check db/db-python
 
 echo "Test successfully completed."
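For reference, a rough standalone sketch of the behaviour the flipped test now relies on, using an illustrative handler name ("fffdreplace_demo") rather than the extractor's actual registration: unencodable characters are substituted with U+FFFD instead of '?', so the two strings in test.py encode, and therefore hash, differently.

```
import codecs
import hashlib

def fffd_replace_demo(exc):
    # Substitute one U+FFFD per unencodable character; a simplified (encode-only)
    # variant of the handler added to objects.py in the patch above.
    if isinstance(exc, UnicodeEncodeError):
        return ((exc.end - exc.start) * "\ufffd", exc.end)
    raise TypeError("can't handle %r" % exc)

codecs.register_error("fffdreplace_demo", fffd_replace_demo)

encoded_surrogate = "\uD800".encode("utf8", errors="fffdreplace_demo")  # b'\xef\xbf\xbd'
encoded_question = "?".encode("utf8", errors="fffdreplace_demo")        # b'?'

# Distinct bytes, therefore distinct hash-based IDs: no INVALID_KEY violation.
assert encoded_surrogate != encoded_question
assert hashlib.sha1(encoded_surrogate).hexdigest() != hashlib.sha1(encoded_question).hexdigest()
```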