From 1427c07b8d49904d00caf1766861c54f89c66d27 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 17 Jan 2024 13:02:17 +0100 Subject: [PATCH] utils.generate_range: raise a ValueError if non-numeric parts differ --- ocrd_utils/ocrd_utils/str.py | 2 ++ tests/cli/test_workspace.py | 1 + tests/model/test_ocrd_mets.py | 3 ++- tests/test_utils.py | 10 ++++++++-- 4 files changed, 13 insertions(+), 3 deletions(-) diff --git a/ocrd_utils/ocrd_utils/str.py b/ocrd_utils/ocrd_utils/str.py index f5b9242d35..cf5cd3778e 100644 --- a/ocrd_utils/ocrd_utils/str.py +++ b/ocrd_utils/ocrd_utils/str.py @@ -204,6 +204,8 @@ def generate_range(start, end): start_num, end_num = re.findall(r'\d+', start)[-1], re.findall(r'\d+', end)[-1] except IndexError: raise ValueError("Range '%s..%s': could not find numeric part" % (start, end)) + if start[:-len(start_num)] != end[:-len(end_num)]: + raise ValueError(f"Range '{start}..{end}' differ in their non-numeric part: '{start[:-len(start_num)]}' != '{end[:-len(end_num)]}'") if start_num == end_num: warn("Range '%s..%s': evaluates to the same number") for i in range(int(start_num), int(end_num) + 1): diff --git a/tests/cli/test_workspace.py b/tests/cli/test_workspace.py index e26a6c2fb0..b155811ac8 100644 --- a/tests/cli/test_workspace.py +++ b/tests/cli/test_workspace.py @@ -563,6 +563,7 @@ def _call(args): assert _call(['-f', 'json']) == '[[["PHYS_0001"], ["PHYS_0002"], ["PHYS_0003"], ["PHYS_0004"], ["PHYS_0005"], ["PHYS_0006"], ["PHYS_0008"], ["PHYS_0009"], ["PHYS_0010"], ["PHYS_0011"], ["PHYS_0012"], ["PHYS_0013"], ["PHYS_0014"], ["PHYS_0015"], ["PHYS_0016"], ["PHYS_0017"], ["PHYS_0018"], ["PHYS_0019"], ["PHYS_0020"], ["PHYS_0022"], ["PHYS_0023"], ["PHYS_0024"], ["PHYS_0025"], ["PHYS_0026"], ["PHYS_0027"], ["PHYS_0028"], ["PHYS_0029"]]]' assert _call(['-f', 'comma-separated', '-R', '5..5']) == 'PHYS_0005' assert _call(['-f', 'comma-separated', '-R', '6..8']) == 'PHYS_0006,PHYS_0008,PHYS_0009' + assert _call(['-f', 'comma-separated', '-r', '1..5']) == 'PHYS_0001,PHYS_0002,PHYS_0003,PHYS_0004,PHYS_0005' assert _call(['-f', 'comma-separated', '-r', 'PHYS_0006..PHYS_0009']) == 'PHYS_0006,PHYS_0008,PHYS_0009' assert _call(['-f', 'comma-separated', '-r', 'PHYS_0001..PHYS_0010', '-D', '3']) == 'PHYS_0001,PHYS_0002,PHYS_0003\nPHYS_0004,PHYS_0005,PHYS_0006\nPHYS_0008,PHYS_0009,PHYS_0010' assert _call(['-f', 'comma-separated', '-r', 'PHYS_0001..PHYS_0010', '-D', '3', '-C', '2']) == 'PHYS_0008,PHYS_0009,PHYS_0010' diff --git a/tests/model/test_ocrd_mets.py b/tests/model/test_ocrd_mets.py index 15636e67fe..631f61d0d5 100644 --- a/tests/model/test_ocrd_mets.py +++ b/tests/model/test_ocrd_mets.py @@ -91,7 +91,8 @@ def test_find_all_files(sbb_sample_01): assert len(sbb_sample_01.find_all_files(pageId='//PHYS_0001,//PHYS_0005')) == 18, '18 files in PHYS_001 and PHYS_0005 (two regexes)' assert len(sbb_sample_01.find_all_files(pageId='//PHYS_0005,PHYS_0001..PHYS_0002')) == 35, '35 files in //PHYS_0005,PHYS_0001..PHYS_0002' assert len(sbb_sample_01.find_all_files(pageId='//PHYS_0005,PHYS_0001..PHYS_0002')) == 35, '35 files in //PHYS_0005,PHYS_0001..PHYS_0002' - assert len(sbb_sample_01.find_all_files(pageId='0..100')) == 35, '35 files in @ORDER range 1..10' + assert len(sbb_sample_01.find_all_files(pageId='1..10')) == 35, '35 files in @ORDER range 1..10' + assert len(sbb_sample_01.find_all_files(pageId='1..PHYS_0002')) == 35, '35 files in @ORDER range 1..10' def test_find_all_files_local_only(sbb_sample_01): assert len(sbb_sample_01.find_all_files(pageId='PHYS_0001', diff --git a/tests/test_utils.py b/tests/test_utils.py index d2093c465d..2dac359c46 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -295,9 +295,15 @@ def test_make_file_id_744(): def test_generate_range(): assert generate_range('PHYS_0001', 'PHYS_0005') == ['PHYS_0001', 'PHYS_0002', 'PHYS_0003', 'PHYS_0004', 'PHYS_0005'] with raises(ValueError, match='could not find numeric part'): - generate_range('NONUMBER', 'ALSO_NONUMBER') + assert generate_range('NONUMBER', 'ALSO_NONUMBER') + with raises(ValueError, match='differ in their non-numeric part'): + generate_range('PHYS_0001_123', 'PHYS_0010_123') + with raises(ValueError, match='differ in their non-numeric part'): + assert generate_range('1', 'PHYS_0005') == 0 + with raises(ValueError, match='differ in their non-numeric part'): + assert generate_range('1', 'page 5') == 0 with warns(UserWarning, match='same number'): - generate_range('PHYS_0001_123', 'PHYS_0010_123') == 'PHYS_0001_123' + assert generate_range('PHYS_0001_123', 'PHYS_0001_123') == ['PHYS_0001_123'] def test_safe_filename(): assert safe_filename('Hello world,!') == 'Hello_world_'