Skip to content

Commit

Permalink
Merge pull request #17807 from github/tausbn/python-fix-string-encodi…
Browse files Browse the repository at this point in the history
…ng-dataset-check-failure

Python: Fix string encoding dataset check failure
  • Loading branch information
tausbn authored Oct 23, 2024
2 parents 197642c + ae4a4bb commit e1e3568
Show file tree
Hide file tree
Showing 3 changed files with 38 additions and 1 deletion.
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
"\uD800"
"?"
18 changes: 18 additions & 0 deletions python/extractor/cli-integration-test/string-encoding/test.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#!/bin/bash

# Integration test for string-encoding handling in the Python extractor:
# builds a CodeQL database from the sources under repo_dir/ and then runs
# `codeql dataset check` on the resulting dataset.

set -Eeuo pipefail # see https://vaneyckt.io/posts/safer_bash_scripts_with_set_euxo_pipefail/

# Trace every command as it is executed.
set -x

# Allow the CodeQL CLI to be overridden through the environment; fall back to
# `codeql` when CODEQL is unset or empty.
CODEQL=${CODEQL:-codeql}

# Resolve the directory containing this script and run from there, so the
# relative paths below (db, repo_dir/) do not depend on the caller's cwd.
SCRIPTDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
cd "$SCRIPTDIR"

# Remove any database left over from a previous run.
rm -rf db

# Extract the Python sources under repo_dir/ into a fresh database named db.
$CODEQL database create db --language python --source-root repo_dir/

# Check the extracted dataset. Because of `set -e`, a non-zero exit status
# here aborts the script before the success message is printed.
$CODEQL dataset check db/db-python

echo "Test successfully completed."
19 changes: 18 additions & 1 deletion python/extractor/semmle/python/passes/objects.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,23 @@

LITERALS = (ast.Num, ast.Str)

# A variant of the 'replace' error handler that replaces unencodable characters with U+FFFD
# rather than '?'. Without this, a string like '\uD800' (which is not encodable) would get mapped
# to '?', and potentially clash with the regular string '?' if it appeared elsewhere in the source
# code. Used in 'get_label_for_object' below. Based on code from https://peps.python.org/pep-0293/
def fffd_replace(exc):
    """Codec error handler that substitutes U+FFFD for unprocessable input.

    Meant to be registered with ``codecs.register_error`` and selected through
    the ``errors=`` argument of an encode/decode call. Unlike the built-in
    'replace' handler, encoding failures yield U+FFFD (REPLACEMENT CHARACTER)
    instead of '?', so an unencodable string cannot end up with the same
    encoded bytes as the ordinary string '?'.

    Args:
        exc: The ``UnicodeEncodeError``, ``UnicodeDecodeError`` or
            ``UnicodeTranslateError`` instance describing the failure.

    Returns:
        A ``(replacement, resume_position)`` tuple as required by the codec
        error-handler protocol. ``resume_position`` is always ``exc.end``.

    Raises:
        TypeError: If ``exc`` is not one of the three Unicode error types.
    """
    if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
        # One replacement character per offending input character.
        return ((exc.end - exc.start) * u"\ufffd", exc.end)
    if isinstance(exc, UnicodeDecodeError):
        # An undecodable byte run collapses to a single replacement character.
        return (u"\ufffd", exc.end)
    # Exception instances have no __name__ attribute; read it from the class so
    # that the intended TypeError is raised rather than an AttributeError.
    raise TypeError("can't handle %s" % type(exc).__name__)

import codecs
# Register the handler under the name "fffdreplace" so it can be selected with
# errors='fffdreplace' in encode/decode calls.
codecs.register_error("fffdreplace", fffd_replace)

class _CObject(object):
'''Utility class to wrap arbitrary C objects.
Treat all objects as unique. Rely on naming in the
Expand Down Expand Up @@ -239,7 +256,7 @@ def get_label_for_object(self, obj, default_label, obj_type):
else:
prefix = u"C_bytes$"
if t is str:
obj = obj.encode("utf8", errors='replace')
obj = obj.encode("utf8", errors='fffdreplace')
return prefix + hashlib.sha1(obj).hexdigest()
if t is bytes:
return prefix + hashlib.sha1(obj).hexdigest()
Expand Down

0 comments on commit e1e3568

Please sign in to comment.