diff --git a/codespell_lib/_codespell.py b/codespell_lib/_codespell.py index c7cc63bcfe..22af663ff5 100644 --- a/codespell_lib/_codespell.py +++ b/codespell_lib/_codespell.py @@ -227,12 +227,12 @@ def init_chardet(self) -> None: self.encdetector = UniversalDetector() - def open(self, filename: str) -> Tuple[List[str], str]: + def open(self, filename: str) -> Tuple[List[str], Optional[str]]: if self.use_chardet: return self.open_with_chardet(filename) return self.open_with_internal(filename) - def open_with_chardet(self, filename: str) -> Tuple[List[str], str]: + def open_with_chardet(self, filename: str) -> Tuple[List[str], Optional[str]]: self.encdetector.reset() with open(filename, "rb") as fb: for line in fb: @@ -241,26 +241,26 @@ def open_with_chardet(self, filename: str) -> Tuple[List[str], str]: break self.encdetector.close() encoding = self.encdetector.result["encoding"] - - try: - f = open(filename, encoding=encoding, newline="") - except UnicodeDecodeError: - print(f"ERROR: Could not detect encoding: {filename}", file=sys.stderr) - raise - except LookupError: + if not encoding: print( - f"ERROR: Don't know how to handle encoding {encoding}: {filename}", + f"WARNING: Chardet failed to detect encoding for file {filename}.", file=sys.stderr, ) + try: + with open(filename, encoding=encoding, newline="") as f: + lines = self.get_lines(f) + except LookupError: # Raised by open() if encoding is unknown + error_msg = f"ERROR: Chardet returned unknown encoding for: {filename}." + print(error_msg, file=sys.stderr) + raise + except UnicodeDecodeError: # Raised by self.get_lines() if decoding fails + error_msg = f"ERROR: Failed decoding file: {filename}" + print(error_msg, file=sys.stderr) raise - else: - lines = self.get_lines(f) - f.close() - return lines, f.encoding + return lines, encoding def open_with_internal(self, filename: str) -> Tuple[List[str], str]: - encoding = None first_try = True for encoding in ("utf-8", "iso-8859-1"): if first_try: @@ -869,7 +869,7 @@ def apply_uri_ignore_words( return check_matches -def parse_file( +def parse_file( # noqa: PLR0915 filename: str, colors: TermColors, summary: Optional[Summary], @@ -887,6 +887,7 @@ def parse_file( bad_count = 0 lines = None changed = False + encoding: Optional[str] if filename == "-": f = sys.stdin diff --git a/codespell_lib/tests/test_basic.py b/codespell_lib/tests/test_basic.py index 74e10404e1..eace30ab62 100644 --- a/codespell_lib/tests/test_basic.py +++ b/codespell_lib/tests/test_basic.py @@ -569,14 +569,51 @@ def test_encoding( assert "WARNING: Binary file" in stderr -def test_unknown_encoding_chardet( +def test_chardet_exceptions( tmp_path: Path, capsys: pytest.CaptureFixture[str], ) -> None: - """Test opening a file with unknown encoding using chardet""" + """Test encoding handling with chardet exceptions.""" fname = tmp_path / "tmp" - fname.touch() - assert cs.main("--hard-encoding-detection", fname) == 0 + fname.write_bytes("naïve\n".encode()) + with mock.patch( + "chardet.universaldetector.UniversalDetector" + ) as mock_detector_class: + # Configure the mock to simulate an incorrect encoding detection + mock_detector = mock.MagicMock() + mock_detector.result = {"encoding": None} + mock_detector.done = True + mock_detector_class.return_value = mock_detector + + # Simulate chardet not detecting any encoding + result = cs.main("-e", fname, std=True, count=False) + assert isinstance(result, tuple) + code, stdout, stderr = result + assert code == 0 + assert not stdout + assert "WARNING: Chardet failed to detect encoding" in stderr + assert str(fname) in stderr + + # Simulate chardet falsely detecting utf-8, instead of the correct iso-8859-1 + mock_detector.result = {"encoding": "utf-8"} # Simulate wrong encoding detected + mock_detector_class.return_value = mock_detector + fname.write_bytes(b"Speling error, non-ASCII: h\xe9t\xe9rog\xe9n\xe9it\xe9\n") + with pytest.raises(UnicodeDecodeError) as exc_info_unicode: + cs.main("-e", fname, std=True, count=False) + stderr = capsys.readouterr().err + assert "ERROR: Failed decoding file:" in stderr + assert str(fname) in stderr + assert "utf-8" in str(exc_info_unicode.value) + + # Simulate chardet detecting non-existent encoding + mock_detector.result = {"encoding": "UTF-doesnotexist"} + mock_detector_class.return_value = mock_detector + with pytest.raises(LookupError) as exc_info_lookup: + cs.main("-e", fname, std=True, count=False) + stderr = capsys.readouterr().err + assert "ERROR: Chardet returned unknown encoding" in stderr + assert str(fname) in stderr + assert "UTF-doesnotexist" in str(exc_info_lookup.value) def test_ignore(