diff --git a/bench/__init__.py b/bench/__init__.py index 7a77a86..139759b 100644 --- a/bench/__init__.py +++ b/bench/__init__.py @@ -1,2 +1,2 @@ # -*- coding: utf-8 -*- -from __future__ import absolute_import \ No newline at end of file +from __future__ import absolute_import diff --git a/bench/speed.py b/bench/speed.py index c795b58..886d950 100755 --- a/bench/speed.py +++ b/bench/speed.py @@ -7,37 +7,37 @@ import os import zipfile import struct -#import pstats -#import cProfile + +# import pstats +# import cProfile import dawg + def words100k(): - zip_name = os.path.join( - os.path.abspath(os.path.dirname(__file__)), - 'words100k.txt.zip' - ) + zip_name = os.path.join(os.path.abspath(os.path.dirname(__file__)), "words100k.txt.zip") zf = zipfile.ZipFile(zip_name) - txt = zf.open(zf.namelist()[0]).read().decode('utf8') + txt = zf.open(zf.namelist()[0]).read().decode("utf8") return txt.splitlines() + def random_words(num): - russian = 'абвгдеёжзиклмнопрстуфхцчъыьэюя' - alphabet = '%s%s' % (russian, string.ascii_letters) - return [ - "".join([random.choice(alphabet) for x in range(random.randint(1,15))]) - for y in range(num) - ] + russian = "абвгдеёжзиклмнопрстуфхцчъыьэюя" + alphabet = "%s%s" % (russian, string.ascii_letters) + return ["".join([random.choice(alphabet) for x in range(random.randint(1, 15))]) for y in range(num)] + def truncated_words(words): return [word[:3] for word in words] + def prefixes1k(words, prefix_len): words = [w for w in words if len(w) >= prefix_len] - every_nth = int(len(words)/1000) + every_nth = int(len(words) / 1000) _words = [w[:prefix_len] for w in words[::every_nth]] return _words[:1000] + def leet_words(words, replaces): for key, value in replaces.items(): words = [w.replace(key, value) for w in words] @@ -53,75 +53,104 @@ def leet_words(words, replaces): PREFIXES_15_1k = prefixes1k(WORDS100k, 15) LEET_REPLACES = { - 'o': '0', - 'O': '0', - 'u': '0', - 'l': '1', - 'i': '1', - 'e': '3', - 'E': '3', - 'A': '4', - 'a': '4', - 'h': '4', - 's': 'z', + "o": "0", + "O": "0", + "u": "0", + "l": "1", + "i": "1", + "e": "3", + "E": "3", + "A": "4", + "a": "4", + "h": "4", + "s": "z", } LEET_50k = leet_words(WORDS100k[:50000], LEET_REPLACES) + def format_result(key, value, text_width): key = key.ljust(text_width) print(" %s %s" % (key, value)) -def bench(name, timer, descr='M ops/sec', op_count=0.1, repeats=3, runs=5, - text_width=33): +def bench(name, timer, descr="M ops/sec", op_count=0.1, repeats=3, runs=5, text_width=33): try: times = [] for x in range(runs): times.append(timer.timeit(repeats)) def op_time(time): - return op_count*repeats / time + return op_count * repeats / time val = "%0.3f%s" % (op_time(min(times)), descr) format_result(name, val, text_width) except (AttributeError, TypeError) as e: format_result(name, "not supported", text_width) + def create_dawg(): words = words100k() return dawg.DAWG(words) + def create_bytes_dawg(): words = words100k() - values = [struct.pack(str(' v documentation". -#html_title = None +# html_title = None # A shorter title for the navigation bar. Default is the same as html_title. -#html_short_title = None +# html_short_title = None # The name of an image file (relative to this directory) to place at the top # of the sidebar. -#html_logo = None +# html_logo = None # The name of an image file (within the static path) to use as favicon of the # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 # pixels large. -#html_favicon = None +# html_favicon = None # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] +html_static_path = ["_static"] # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, # using the given strftime format. -#html_last_updated_fmt = '%b %d, %Y' +# html_last_updated_fmt = '%b %d, %Y' # If true, SmartyPants will be used to convert quotes and dashes to # typographically correct entities. -#html_use_smartypants = True +# html_use_smartypants = True # Custom sidebar templates, maps document names to template names. -#html_sidebars = {} +# html_sidebars = {} # Additional templates that should be rendered to pages, maps page names to # template names. -#html_additional_pages = {} +# html_additional_pages = {} # If false, no module index is generated. -#html_domain_indices = True +# html_domain_indices = True # If false, no index is generated. -#html_use_index = True +# html_use_index = True # If true, the index is split into individual pages for each letter. -#html_split_index = False +# html_split_index = False # If true, links to the reST sources are added to the pages. -#html_show_sourcelink = True +# html_show_sourcelink = True # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. -#html_show_sphinx = True +# html_show_sphinx = True # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. -#html_show_copyright = True +# html_show_copyright = True # If true, an OpenSearch description file will be output, and all pages will # contain a tag referring to it. The value of this option must be the # base URL from which the finished HTML is served. -#html_use_opensearch = '' +# html_use_opensearch = '' # This is the file name suffix for HTML files (e.g. ".xhtml"). -#html_file_suffix = None +# html_file_suffix = None # Output file base name for HTML help builder. -htmlhelp_basename = 'DAWGdoc' +htmlhelp_basename = "DAWGdoc" # -- Options for LaTeX output -------------------------------------------------- latex_elements = { -# The paper size ('letterpaper' or 'a4paper'). -#'papersize': 'letterpaper', - -# The font size ('10pt', '11pt' or '12pt'). -#'pointsize': '10pt', - -# Additional stuff for the LaTeX preamble. -#'preamble': '', + # The paper size ('letterpaper' or 'a4paper'). + #'papersize': 'letterpaper', + # The font size ('10pt', '11pt' or '12pt'). + #'pointsize': '10pt', + # Additional stuff for the LaTeX preamble. + #'preamble': '', } # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, author, documentclass [howto/manual]). latex_documents = [ - ('index', 'DAWG.tex', 'DAWG Documentation', - 'Mikhail Korobov', 'manual'), + ("index", "DAWG.tex", "DAWG Documentation", "Mikhail Korobov", "manual"), ] # The name of an image file (relative to this directory) to place at the top of # the title page. -#latex_logo = None +# latex_logo = None # For "manual" documents, if this is true, then toplevel headings are parts, # not chapters. -#latex_use_parts = False +# latex_use_parts = False # If true, show page references after internal links. -#latex_show_pagerefs = False +# latex_show_pagerefs = False # If true, show URL addresses after external links. -#latex_show_urls = False +# latex_show_urls = False # Documents to append as an appendix to all manuals. -#latex_appendices = [] +# latex_appendices = [] # If false, no module index is generated. -#latex_domain_indices = True +# latex_domain_indices = True # -- Options for manual page output -------------------------------------------- # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). -man_pages = [ - ('index', 'dawg', 'DAWG Documentation', - ['Mikhail Korobov'], 1) -] +man_pages = [("index", "dawg", "DAWG Documentation", ["Mikhail Korobov"], 1)] # If true, show URL addresses after external links. -#man_show_urls = False +# man_show_urls = False # -- Options for Texinfo output ------------------------------------------------ @@ -228,16 +222,22 @@ # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - ('index', 'DAWG', 'DAWG Documentation', - 'Mikhail Korobov', 'DAWG', 'One line description of project.', - 'Miscellaneous'), + ( + "index", + "DAWG", + "DAWG Documentation", + "Mikhail Korobov", + "DAWG", + "One line description of project.", + "Miscellaneous", + ), ] # Documents to append as an appendix to all manuals. -#texinfo_appendices = [] +# texinfo_appendices = [] # If false, no module index is generated. -#texinfo_domain_indices = True +# texinfo_domain_indices = True # How to display URL addresses: 'footnote', 'no', or 'inline'. -#texinfo_show_urls = 'footnote' +# texinfo_show_urls = 'footnote' diff --git a/pyproject.toml b/pyproject.toml index cbc600f..76ed89e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,3 +20,6 @@ exclude_lines = [ "@(abc\\.)?abstractmethod", ] include = ["src/*"] + +[tool.black] +line-length = 120 diff --git a/setup.py b/setup.py index 23acf60..dc8de71 100755 --- a/setup.py +++ b/setup.py @@ -17,8 +17,8 @@ extensions = [ Extension( "dawg", - sources=glob.glob('src/*.pyx') + glob.glob('lib/b64/*.c'), - include_dirs=['lib'], + sources=glob.glob("src/*.pyx") + glob.glob("lib/b64/*.c"), + include_dirs=["lib"], language="c++", define_macros=define_macros, ) @@ -35,28 +35,26 @@ name="DAWG2", version="0.9.1", description="Fast and memory efficient DAWG (DAFSA) for Python", - long_description=open('README.rst').read() + '\n\n' + open('CHANGES.rst').read(), - author='Mikhail Korobov', - author_email='kmike84@gmail.com', - url='https://github.com/pymorphy2-fork/DAWG/', - + long_description=open("README.rst").read() + "\n\n" + open("CHANGES.rst").read(), + author="Mikhail Korobov", + author_email="kmike84@gmail.com", + url="https://github.com/pymorphy2-fork/DAWG/", ext_modules=ext_modules, - classifiers=[ - 'Development Status :: 4 - Beta', - 'Intended Audience :: Developers', - 'Intended Audience :: Science/Research', - 'License :: OSI Approved :: MIT License', - 'Programming Language :: Cython', - 'Programming Language :: Python', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.8', - 'Programming Language :: Python :: 3.9', - 'Programming Language :: Python :: 3.10', - 'Programming Language :: Python :: 3.11', - 'Programming Language :: Python :: Implementation :: CPython', - 'Topic :: Software Development :: Libraries :: Python Modules', - 'Topic :: Scientific/Engineering :: Information Analysis', - 'Topic :: Text Processing :: Linguistic', + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: MIT License", + "Programming Language :: Cython", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: Implementation :: CPython", + "Topic :: Software Development :: Libraries :: Python Modules", + "Topic :: Scientific/Engineering :: Information Analysis", + "Topic :: Text Processing :: Linguistic", ], ) diff --git a/tests/__init__.py b/tests/__init__.py index 7a77a86..139759b 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -1,2 +1,2 @@ # -*- coding: utf-8 -*- -from __future__ import absolute_import \ No newline at end of file +from __future__ import absolute_import diff --git a/tests/test_dawg.py b/tests/test_dawg.py index 26f6627..3249925 100644 --- a/tests/test_dawg.py +++ b/tests/test_dawg.py @@ -7,25 +7,25 @@ import pytest import dawg + def test_contains(): - d = dawg.IntDAWG({'foo': 1, 'bar': 2, 'foobar': 3}) + d = dawg.IntDAWG({"foo": 1, "bar": 2, "foobar": 3}) - assert 'foo' in d - assert 'bar' in d - assert 'foobar' in d - assert 'fo' not in d - assert 'x' not in d + assert "foo" in d + assert "bar" in d + assert "foobar" in d + assert "fo" not in d + assert "x" not in d - assert b'foo' in d - assert b'x' not in d + assert b"foo" in d + assert b"x" not in d class TestDAWG(object): - def test_sorted_iterable(self): - sorted_data = ['bar', 'foo', 'foobar'] - contents = "\n".join(sorted_data).encode('utf8') + sorted_data = ["bar", "foo", "foobar"] + contents = "\n".join(sorted_data).encode("utf8") with tempfile.NamedTemporaryFile() as f: f.write(contents) f.seek(0) @@ -33,23 +33,23 @@ def test_sorted_iterable(self): words = (line.strip() for line in f) d = dawg.DAWG(words, input_is_sorted=True) - assert 'bar' in d - assert 'foo' in d + assert "bar" in d + assert "foo" in d def test_no_segfaults_on_invalid_file(self): d = dawg.DAWG() fd, path = tempfile.mkstemp() - with open(path, 'w') as f: - f.write('foo') + with open(path, "w") as f: + f.write("foo") with pytest.raises(IOError) as e: d.load(path) - assert 'Invalid' in e.args[0] + assert "Invalid" in e.args[0] - with open(path, 'rb') as f: + with open(path, "rb") as f: with pytest.raises(IOError) as e: d.read(f) - assert 'Invalid' in e.args[0] + assert "Invalid" in e.args[0] def test_no_segfaults_after_wrong_stream(self): d = dawg.DAWG() @@ -58,21 +58,21 @@ def test_no_segfaults_after_wrong_stream(self): with pytest.raises(IOError): d.load(wrong_path) - assert 'random-key' not in d # there is possible segfault + assert "random-key" not in d # there is possible segfault def test_build_errors(self): with pytest.raises(dawg.Error): - data = [b'foo\x00bar', b'bar'] + data = [b"foo\x00bar", b"bar"] dawg.DAWG(data) def test_contains_with_null_bytes(self): - d = dawg.DAWG(['foo']) - assert b'foo' in d - assert b'foo\x00bar' not in d + d = dawg.DAWG(["foo"]) + assert b"foo" in d + assert b"foo\x00bar" not in d def test_unicode_sorting(self): - key1 = '\U00010345\U0001033f\U00010337\U00010330\U0001033d' - key2 = '\uff72\uff9c\uff90\uff7b\uff9e\uff9c' + key1 = "\U00010345\U0001033f\U00010337\U00010330\U0001033d" + key2 = "\uff72\uff9c\uff90\uff7b\uff9e\uff9c" # This apparently depends on Python version: # assert key1 < key2 @@ -84,13 +84,12 @@ def test_unicode_sorting(self): dawg.DAWG([key1, key2]) - class TestIntDAWG(object): IntDAWG = dawg.IntDAWG def dawg(self): - payload = {'foo': 1, 'bar': 5, 'foobar': 3} + payload = {"foo": 1, "bar": 5, "foobar": 3} d = self.IntDAWG(payload) return payload, d @@ -100,8 +99,7 @@ def test_getitem(self): assert d[key] == payload[key] with pytest.raises(KeyError): - d['fo'] - + d["fo"] def test_dumps_loads(self): payload, d = self.dawg() @@ -138,15 +136,15 @@ def test_pickling(self): assert d[key] == value def test_int_value_ranges(self): - for val in [0, 5, 2**16-1, 2**31-1]: - d = self.IntDAWG({'f': val}) - assert d['f'] == val + for val in [0, 5, 2**16 - 1, 2**31 - 1]: + d = self.IntDAWG({"f": val}) + assert d["f"] == val with pytest.raises(ValueError): - self.IntDAWG({'f': -1}) + self.IntDAWG({"f": -1}) with pytest.raises(OverflowError): - self.IntDAWG({'f': 2**32-1}) + self.IntDAWG({"f": 2**32 - 1}) class TestIntCompletionDAWG(TestIntDAWG): @@ -154,7 +152,7 @@ class TestIntCompletionDAWG(TestIntDAWG): class TestCompletionDAWG(object): - keys = ['f', 'bar', 'foo', 'foobar'] + keys = ["f", "bar", "foo", "foobar"] def dawg(self): return dawg.CompletionDAWG(self.keys) @@ -197,18 +195,18 @@ def test_iterprefixes(self): def test_completion(self): d = self.dawg() - assert d.keys('z') == [] - assert d.keys('b') == ['bar'] - assert d.keys('foo') == ['foo', 'foobar'] + assert d.keys("z") == [] + assert d.keys("b") == ["bar"] + assert d.keys("foo") == ["foo", "foobar"] def test_has_keys_with_prefix(self): - assert self.empty_dawg().has_keys_with_prefix('') == False + assert self.empty_dawg().has_keys_with_prefix("") == False d = self.dawg() - assert d.has_keys_with_prefix('') == True - assert d.has_keys_with_prefix('b') == True - assert d.has_keys_with_prefix('fo') == True - assert d.has_keys_with_prefix('bo') == False + assert d.has_keys_with_prefix("") == True + assert d.has_keys_with_prefix("b") == True + assert d.has_keys_with_prefix("fo") == True + assert d.has_keys_with_prefix("bo") == False def test_completion_dawg_saveload(self): buf = BytesIO() @@ -221,15 +219,15 @@ def test_completion_dawg_saveload(self): for key in self.keys: assert key in d - assert d.keys('foo') == ['foo', 'foobar'] - assert d.keys('b') == ['bar'] - assert d.keys('z') == [] + assert d.keys("foo") == ["foo", "foobar"] + assert d.keys("b") == ["bar"] + assert d.keys("z") == [] def test_no_segfaults_on_invalid_file(self): d = self.dawg() fd, path = tempfile.mkstemp() - with open(path, 'w') as f: - f.write('foo') + with open(path, "w") as f: + f.write("foo") with pytest.raises(IOError) as e: d.load(path) @@ -241,7 +239,7 @@ def test_no_segfaults_on_empty_dawg(self): class TestIntCompletionDAWGComplete(TestCompletionDAWG): - keys = ['f', 'bar', 'foo', 'foobar'] + keys = ["f", "bar", "foo", "foobar"] def dawg(self): return dawg.IntCompletionDAWG((k, len(k)) for k in self.keys) @@ -267,4 +265,4 @@ def test_iteritems(self): def test_items_prefix(self): d = self.dawg() - assert d.items('fo') == [('foo', 3), ('foobar', 6)] + assert d.items("fo") == [("foo", 3), ("foobar", 6)] diff --git a/tests/test_payload_dawg.py b/tests/test_payload_dawg.py index 305ac3f..d243a22 100644 --- a/tests/test_payload_dawg.py +++ b/tests/test_payload_dawg.py @@ -4,13 +4,14 @@ import pytest import dawg + class TestBytesDAWG(object): DATA = ( - ('foo', b'data3'), - ('bar', b'data2'), - ('foo', b'data1'), - ('foobar', b'data4') + ("foo", b"data3"), + ("bar", b"data2"), + ("foo", b"data1"), + ("foobar", b"data4"), ) DATA_KEYS = list(zip(*DATA))[0] @@ -23,29 +24,28 @@ def test_contains(self): for key, val in self.DATA: assert key in d - assert 'food' not in d - assert 'x' not in d - assert 'fo' not in d - + assert "food" not in d + assert "x" not in d + assert "fo" not in d def test_getitem(self): d = self.dawg() - assert d['foo'] == [b'data1', b'data3'] - assert d['bar'] == [b'data2'] - assert d['foobar'] == [b'data4'] + assert d["foo"] == [b"data1", b"data3"] + assert d["bar"] == [b"data2"] + assert d["foobar"] == [b"data4"] with pytest.raises(KeyError): - d['f'] + d["f"] with pytest.raises(KeyError): - d['food'] + d["food"] with pytest.raises(KeyError): - d['foobarz'] + d["foobarz"] with pytest.raises(KeyError): - d['x'] + d["x"] def test_prefixes(self): d = self.dawg() @@ -58,13 +58,13 @@ def test_keys(self): assert d.keys() == sorted(self.DATA_KEYS) def test_keys_ordering(self): - data = [('foo', b'v1'), ('foobar', b'v2'), ('bar', b'v3')] + data = [("foo", b"v1"), ("foobar", b"v2"), ("bar", b"v3")] - d = dawg.BytesDAWG(data, payload_separator=b'\xff') - assert d.keys() == ['bar', 'foobar', 'foo'] + d = dawg.BytesDAWG(data, payload_separator=b"\xff") + assert d.keys() == ["bar", "foobar", "foo"] - d2 = dawg.BytesDAWG(data, payload_separator=b'\x01') - assert d2.keys() == ['bar', 'foo', 'foobar'] + d2 = dawg.BytesDAWG(data, payload_separator=b"\x01") + assert d2.keys() == ["bar", "foo", "foobar"] def test_iterkeys(self): d = self.dawg() @@ -81,17 +81,16 @@ def test_iteritems(self): def test_build_error(self): with pytest.raises(dawg.Error): - self.dawg(payload_separator=b'f') - + self.dawg(payload_separator=b"f") class TestRecordDAWG(object): STRUCTURED_DATA = ( - ('foo', (3, 2, 256)), - ('bar', (3, 1, 0)), - ('foo', (3, 2, 1)), - ('foobar', (6, 3, 0)) + ("foo", (3, 2, 256)), + ("bar", (3, 1, 0)), + ("foo", (3, 2, 1)), + ("foobar", (6, 3, 0)), ) def dawg(self): @@ -99,9 +98,9 @@ def dawg(self): def test_record_getitem(self): d = self.dawg() - assert d['foo'] == [(3, 2, 1), (3, 2, 256)] - assert d['bar'] == [(3, 1, 0)] - assert d['foobar'] == [(6, 3, 0)] + assert d["foo"] == [(3, 2, 1), (3, 2, 256)] + assert d["bar"] == [(3, 1, 0)] + assert d["foobar"] == [(6, 3, 0)] def test_record_items(self): d = self.dawg() @@ -109,7 +108,12 @@ def test_record_items(self): def test_record_keys(self): d = self.dawg() - assert d.keys() == ['bar', 'foo', 'foo', 'foobar',] + assert d.keys() == [ + "bar", + "foo", + "foo", + "foobar", + ] def test_record_iterkeys(self): d = self.dawg() @@ -121,9 +125,9 @@ def test_record_iteritems(self): def test_record_keys_prefix(self): d = self.dawg() - assert d.keys('fo') == ['foo', 'foo', 'foobar'] - assert d.keys('bar') == ['bar'] - assert d.keys('barz') == [] + assert d.keys("fo") == ["foo", "foo", "foobar"] + assert d.keys("bar") == ["bar"] + assert d.keys("barz") == [] def test_prefixes(self): d = self.dawg() diff --git a/tests/test_prediction.py b/tests/test_prediction.py index cea8551..b2c69c5 100644 --- a/tests/test_prediction.py +++ b/tests/test_prediction.py @@ -3,45 +3,40 @@ import pytest import dawg + class TestPrediction(object): - DATA = ['ЁЖИК', 'ЁЖИКЕ', 'ЁЖ', 'ДЕРЕВНЯ', 'ДЕРЁВНЯ', 'ЕМ', 'ОЗЕРА', 'ОЗЁРА', 'ОЗЕРО'] + DATA = [ + "ЁЖИК", + "ЁЖИКЕ", + "ЁЖ", + "ДЕРЕВНЯ", + "ДЕРЁВНЯ", + "ЕМ", + "ОЗЕРА", + "ОЗЁРА", + "ОЗЕРО", + ] LENGTH_DATA = list(zip(DATA, ((len(w),) for w in DATA))) - REPLACES = dawg.DAWG.compile_replaces({'Е': 'Ё'}) + REPLACES = dawg.DAWG.compile_replaces({"Е": "Ё"}) SUITE = [ - ('УЖ', []), - ('ЕМ', ['ЕМ']), - ('ЁМ', []), - ('ЁЖ', ['ЁЖ']), - ('ЕЖ', ['ЁЖ']), - ('ЁЖИК', ['ЁЖИК']), - ('ЕЖИКЕ', ['ЁЖИКЕ']), - ('ДЕРЕВНЯ', ['ДЕРЕВНЯ', 'ДЕРЁВНЯ']), - ('ДЕРЁВНЯ', ['ДЕРЁВНЯ']), - ('ОЗЕРА', ['ОЗЕРА', 'ОЗЁРА']), - ('ОЗЕРО', ['ОЗЕРО']), + ("УЖ", []), + ("ЕМ", ["ЕМ"]), + ("ЁМ", []), + ("ЁЖ", ["ЁЖ"]), + ("ЕЖ", ["ЁЖ"]), + ("ЁЖИК", ["ЁЖИК"]), + ("ЕЖИКЕ", ["ЁЖИКЕ"]), + ("ДЕРЕВНЯ", ["ДЕРЕВНЯ", "ДЕРЁВНЯ"]), + ("ДЕРЁВНЯ", ["ДЕРЁВНЯ"]), + ("ОЗЕРА", ["ОЗЕРА", "ОЗЁРА"]), + ("ОЗЕРО", ["ОЗЕРО"]), ] - SUITE_ITEMS = [ - ( - it[0], # key - [ - (w, [(len(w),)]) # item, value pair - for w in it[1] - ] - ) - for it in SUITE - ] - - SUITE_VALUES = [ - ( - it[0], # key - [[(len(w),)] for w in it[1]] - ) - for it in SUITE - ] + SUITE_ITEMS = [(it[0], [(w, [(len(w),)]) for w in it[1]]) for it in SUITE] # key # item, value pair + SUITE_VALUES = [(it[0], [[(len(w),)] for w in it[1]]) for it in SUITE] # key @pytest.mark.parametrize(("word", "prediction"), SUITE) def test_dawg_prediction(self, word, prediction): @@ -63,52 +58,37 @@ def test_record_dawg_items_values(self, word, prediction): d = dawg.RecordDAWG(str("=H"), self.LENGTH_DATA) assert d.similar_item_values(word, self.REPLACES) == prediction + class TestMultiValuedPrediction(object): DATA = "хлѣб ёлка ель лѣс лѣсное всё всѣ бѣлёная изобрѣтён лев лёв лѣв вѣнскій".split(" ") LENGTH_DATA = list(zip(DATA, ((len(w),) for w in DATA))) - REPLACES = dawg.DAWG.compile_replaces({'е': ['ё', 'ѣ'], 'и': 'і'}) + REPLACES = dawg.DAWG.compile_replaces({"е": ["ё", "ѣ"], "и": "і"}) SUITE = [ - ('осел', []), - ('ель', ['ель']), - ('ёль', []), - ('хлеб', ['хлѣб']), - ('елка', ['ёлка']), - ('лесное', ['лѣсное']), - ('лесноё', []), - ('лёсное', []), - ('изобретен', ['изобрѣтён']), - ('беленая', ['бѣлёная']), - ('белёная', ['бѣлёная']), - ('бѣленая', ['бѣлёная']), - ('бѣлёная', ['бѣлёная']), - ('белѣная', []), - ('бѣлѣная', []), - ('все', ['всё', 'всѣ']), - ('лев', ['лев', 'лёв', 'лѣв']), - ('венский', ['вѣнскій']), - ] - - SUITE_ITEMS = [ - ( - it[0], # key - [ - (w, [(len(w),)]) # item, value pair - for w in it[1] - ] - ) - for it in SUITE + ("осел", []), + ("ель", ["ель"]), + ("ёль", []), + ("хлеб", ["хлѣб"]), + ("елка", ["ёлка"]), + ("лесное", ["лѣсное"]), + ("лесноё", []), + ("лёсное", []), + ("изобретен", ["изобрѣтён"]), + ("беленая", ["бѣлёная"]), + ("белёная", ["бѣлёная"]), + ("бѣленая", ["бѣлёная"]), + ("бѣлёная", ["бѣлёная"]), + ("белѣная", []), + ("бѣлѣная", []), + ("все", ["всё", "всѣ"]), + ("лев", ["лев", "лёв", "лѣв"]), + ("венский", ["вѣнскій"]), ] - SUITE_VALUES = [ - ( - it[0], # key - [[(len(w),)] for w in it[1]] - ) - for it in SUITE - ] + SUITE_ITEMS = [(it[0], [(w, [(len(w),)]) for w in it[1]]) for it in SUITE] # key # item, value pair + SUITE_VALUES = [(it[0], [[(len(w),)] for w in it[1]]) for it in SUITE] # key @pytest.mark.parametrize(("word", "prediction"), SUITE) def test_dawg_prediction(self, word, prediction):