Merge pull request #17807 from github/tausbn/python-fix-string-encoding-dataset-check-failure

Python: Fix string encoding dataset check failure
github · Oct 23, 2024 · e1e3568 · e1e3568
2 parents 197642c + ae4a4bb
commit e1e3568
Show file tree

Hide file tree

Showing 3 changed files with 38 additions and 1 deletion.
diff --git a/python/extractor/cli-integration-test/string-encoding/repo_dir/test.py b/python/extractor/cli-integration-test/string-encoding/repo_dir/test.py
@@ -0,0 +1,2 @@
+"\uD800"
+"?"
diff --git a/python/extractor/cli-integration-test/string-encoding/test.sh b/python/extractor/cli-integration-test/string-encoding/test.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+# Integration test: build a database from repo_dir/ and run `codeql dataset check` on it.
+set -Eeuo pipefail # see https://vaneyckt.io/posts/safer_bash_scripts_with_set_euxo_pipefail/
+
+set -x # echo each command as it is executed
+
+CODEQL=${CODEQL:-codeql} # honour a non-empty CODEQL from the environment, else use `codeql`
+
+SCRIPTDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+cd "$SCRIPTDIR" # the relative paths below (db, repo_dir/) resolve next to this script
+
+rm -rf db # discard any database left over from a previous run
+
+$CODEQL database create db --language python --source-root repo_dir/
+# A non-zero exit status from either codeql command aborts the script (`set -e` above).
+$CODEQL dataset check db/db-python
+
+echo "Test successfully completed."
diff --git a/python/extractor/semmle/python/passes/objects.py b/python/extractor/semmle/python/passes/objects.py
@@ -43,6 +43,23 @@
 
 LITERALS = (ast.Num, ast.Str)
 
+# A variant of the 'replace' error handler that substitutes U+FFFD (REPLACEMENT CHARACTER) for
+# unencodable characters rather than '?'. Without this, a string like '\uD800' (which is not
+# encodable) would map to '?', and could clash with a regular '?' string elsewhere in the source.
+# Used in 'get_label_for_object' below. Based on code from https://peps.python.org/pep-0293/
+def fffd_replace(exc):
+     if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
+         # One real U+FFFD per offending code point; processing resumes at exc.end.
+         return ((exc.end-exc.start)*u"\ufffd", exc.end)
+     elif isinstance(exc, UnicodeDecodeError):
+         # The whole undecodable byte range collapses to a single U+FFFD.
+         return (u"\ufffd", exc.end)
+     else:
+         raise TypeError("can't handle %s" % type(exc).__name__)
+
+import codecs
+codecs.register_error("fffdreplace", fffd_replace)
+
 class _CObject(object):
     '''Utility class to wrap arbitrary C objects.
     Treat all objects as unique. Rely on naming in the
@@ -239,7 +256,7 @@ def get_label_for_object(self, obj, default_label, obj_type):
         else:
             prefix = u"C_bytes$"
         if t is str:
-            obj = obj.encode("utf8", errors='replace')
+            obj = obj.encode("utf8", errors='fffdreplace')
             return prefix + hashlib.sha1(obj).hexdigest()
         if t is bytes:
             return prefix + hashlib.sha1(obj).hexdigest()