diff --git a/src/natlinkcore/readwritefile.py b/src/natlinkcore/readwritefile.py index d34ed0b..0ff4415 100644 --- a/src/natlinkcore/readwritefile.py +++ b/src/natlinkcore/readwritefile.py @@ -38,8 +38,6 @@ class ReadWriteFile: `encodings` and `encoding` can be overridden at creation of an instance. `encodings` must then be a list of possible encodings - `encoding` is then - when `encoding` is a str, `encodings` is set to a list only containing this encoding the default `encodings` are: `['ascii', 'utf-8', 'cp1252', 'latin-1']` @@ -47,6 +45,9 @@ class ReadWriteFile: When the encoding is 'ascii' and at write time, non ascii characters are present, care is taken to encode the output to another encoding, most often (default) 'utf-8'. + + When you need a 'utf-16le' encoding (for 'nsapps.ini' of Dragon), + pass `encodings = ['utf-16le']` when creating the instance. """ def __init__(self, encodings=None): self.input_path = '' diff --git a/tests/mock_readwritefile/cp1252.txt b/tests/mock_readwritefile/cp1252.txt index 8d169a0..78617b2 100644 --- a/tests/mock_readwritefile/cp1252.txt +++ b/tests/mock_readwritefile/cp1252.txt @@ -1 +1 @@ -cp1252 café +cp1252 euro: € \ No newline at end of file diff --git a/tests/mock_readwritefile/latin1 accented.txt b/tests/mock_readwritefile/latin1.txt similarity index 100% rename from tests/mock_readwritefile/latin1 accented.txt rename to tests/mock_readwritefile/latin1.txt diff --git a/tests/test_readwritefile.py b/tests/test_readwritefile.py index ed4879b..74e69e6 100644 --- a/tests/test_readwritefile.py +++ b/tests/test_readwritefile.py @@ -52,7 +52,7 @@ def test_accented_characters_write_file(tmp_path): # newFile = join(testDir, 'output-accented.txt') testDir = tmp_path / testFolderName testDir.mkdir() - newFile = testDir/"outut-accented.txt" + newFile = testDir/"output-accented.txt" text = 'caf\xe9' rwfile = ReadWriteFile(encodings=['ascii']) # optional encoding # this is with default errors='xmlcharrefreplace': @@ -89,7 +89,7 @@ 
def test_other_encodings_write_file(tmp_path): testDir = tmp_path / testFolderName testDir.mkdir() - oldFile = mock_readwritefiledir/'latin1 accented.txt' + oldFile = mock_readwritefiledir/'latin1.txt' rwfile = ReadWriteFile(encodings=['latin1']) # optional encoding text = rwfile.readAnything(oldFile) @@ -133,11 +133,32 @@ def test_nsapps_utf16(tmp_path): assert encoding2 == 'utf-16le' def test_latin1_cp1252_write_file(tmp_path): + """have one latin-1 file and one that is specific cp1252 (euro sign) + + Currently both return cp1252, as it is hard to distinguish them and cp1252 is more general + """ testDir = tmp_path / testFolderName testDir.mkdir() - _newFile = testDir/ 'latin1.txt' - _newFile = testDir/'cp1252.txt' - assert False, "QH TODO" + mock_files_list = os.listdir(mock_readwritefiledir) + + assert 'latin1.txt' in mock_files_list + assert 'cp1252.txt' in mock_files_list + + rwfilelatin1 = ReadWriteFile() + rwfilecp1252 = ReadWriteFile() + latin1_path = mock_readwritefiledir/'latin1.txt' + cp1252_path = mock_readwritefiledir/'cp1252.txt' + + rwfilelatin1.readAnything(latin1_path) + + assert rwfilelatin1.bom == '' + assert rwfilelatin1.encoding == 'cp1252' + + rwfilecp1252.readAnything(cp1252_path) + assert rwfilecp1252.bom == '' + assert rwfilecp1252.encoding == 'cp1252' + + # TODO (QH) to be done, these encodings do not take all characters, # and need special attention. 
@@ -151,11 +172,15 @@ def test_read_write_file(tmp_path): assert len(mock_files_list) > 0 for F in mock_files_list: + encodings = None + if F.startswith('nsapps'): + encodings = ['utf-16le'] + continue # utf-16le is not caught by the standard function, but needs its own encoding if not F.startswith('output-'): Fout = 'output-' + F #read the file from the mock folder F_path = mock_readwritefiledir / F - rwfile = ReadWriteFile() + rwfile = ReadWriteFile(encodings=encodings) text = rwfile.readAnything(F_path) trunk, _ext = splitext(F) Fout = trunk + ".txt" @@ -172,13 +197,42 @@ def test_read_write_file(tmp_path): raise ValueError(f'old: "{F_path}", new: "{Fout_path}", differ at pos {i}: Old: "{o}", new: "{n}", partold (i:i+2): "{parto}", partnew: "{partn}"') def test_acoustics_ini(tmp_path): + """this is a utf-8 file with a bom mark. Try also writing back! + """ + testDir = tmp_path / testFolderName + testDir.mkdir() + + F='acoustic.ini' F_path = mock_readwritefiledir/F rwfile = ReadWriteFile() config_text = rwfile.readAnything(F_path) Config = configparser.ConfigParser() Config.read_string(config_text) - assert Config.get('Acoustics', '2 2') == '2_2' + assert Config.get('Acoustics', '2 2') == '2_2' + + newFile1 = 'output1' + F + newPath1 = testDir/newFile1 + rwfile.writeAnything(newPath1, config_text) + + assert filecmp.cmp(F_path, newPath1) + + rwfile2 = ReadWriteFile() + text2 = rwfile2.readAnything(newPath1) + bom2 = rwfile2.bom + encoding2 = rwfile2.encoding + + tRaw = rwfile.rawText + tRaw2 = rwfile2.rawText + + assert tRaw2 == tRaw + assert text2[0:5] == '[Base' + assert bom2 == [239, 187, 191] + assert encoding2 == 'utf-8' + + + + @pytest.mark.parametrize("F", ['originalnatlink.ini', 'natlinkconfigured.ini']) def test_config_ini(tmp_path,F):