Skip to content

Commit

Permalink
Reformat code with black
Browse files Browse the repository at this point in the history
  • Loading branch information
insolor committed Sep 5, 2023
1 parent ffac51b commit d767d76
Show file tree
Hide file tree
Showing 9 changed files with 342 additions and 339 deletions.
2 changes: 1 addition & 1 deletion bench/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import absolute_import
210 changes: 115 additions & 95 deletions bench/speed.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,37 +7,37 @@
import os
import zipfile
import struct
#import pstats
#import cProfile

# import pstats
# import cProfile

import dawg


def words100k():
    """Load the 100k-word benchmark corpus bundled as words100k.txt.zip.

    Returns a list of unicode words, one per line of the zipped text file
    that sits next to this module.
    """
    zip_name = os.path.join(os.path.abspath(os.path.dirname(__file__)), "words100k.txt.zip")
    # Use context managers so the archive and member handles are closed
    # even if decoding fails (the original leaked both).
    with zipfile.ZipFile(zip_name) as zf:
        with zf.open(zf.namelist()[0]) as member:
            txt = member.read().decode("utf8")
    return txt.splitlines()


def random_words(num):
    """Return *num* random words of length 1..15.

    Characters are drawn from lowercase Russian letters plus ASCII letters,
    so the result mixes Cyrillic and Latin alphabets.
    """
    russian = "абвгдеёжзиклмнопрстуфхцчъыьэюя"
    alphabet = russian + string.ascii_letters
    return ["".join([random.choice(alphabet) for x in range(random.randint(1, 15))]) for y in range(num)]


def truncated_words(words):
    """Return each word cut down to at most its first three characters."""
    return [item[:3] for item in words]


def prefixes1k(words, prefix_len):
    """Return up to 1000 prefixes of length *prefix_len*, sampled evenly.

    Words shorter than *prefix_len* are discarded; the rest are sampled with
    a uniform stride so the result spans the whole input.
    """
    words = [w for w in words if len(w) >= prefix_len]
    # Guard against a slice step of 0 when fewer than 1000 words qualify
    # (the original crashed with ValueError in that case).
    every_nth = max(1, len(words) // 1000)
    _words = [w[:prefix_len] for w in words[::every_nth]]
    return _words[:1000]


def leet_words(words, replaces):
for key, value in replaces.items():
words = [w.replace(key, value) for w in words]
Expand All @@ -53,75 +53,104 @@ def leet_words(words, replaces):
PREFIXES_15_1k = prefixes1k(WORDS100k, 15)

# Character substitutions used to build a "leet-speak" variant of the corpus
# (benchmarks DAWG.similar_keys with compiled replaces).
LEET_REPLACES = {
    "o": "0",
    "O": "0",
    "u": "0",
    "l": "1",
    "i": "1",
    "e": "3",
    "E": "3",
    "A": "4",
    "a": "4",
    "h": "4",
    "s": "z",
}
LEET_50k = leet_words(WORDS100k[:50000], LEET_REPLACES)


def format_result(key, value, text_width):
    """Print one benchmark line with *key* left-justified to *text_width*."""
    print(" %s %s" % (key.ljust(text_width), value))


def bench(name, timer, descr="M ops/sec", op_count=0.1, repeats=3, runs=5, text_width=33):
    """Time *timer* and print the best throughput via format_result.

    Runs the timer *runs* times with *repeats* repetitions each and reports
    the rate computed from the fastest run. If the structure under test does
    not support the operation (AttributeError/TypeError), prints
    "not supported" instead.
    """
    try:
        times = []
        for x in range(runs):
            times.append(timer.timeit(repeats))

        def op_time(time):
            # ops/sec derived from the per-run repeat count.
            return op_count * repeats / time

        val = "%0.3f%s" % (op_time(min(times)), descr)
        format_result(name, val, text_width)
    except (AttributeError, TypeError):
        # Unused exception binding removed; we only need the type to classify.
        format_result(name, "not supported", text_width)


def create_dawg():
    """Build a plain DAWG over the 100k benchmark word list."""
    return dawg.DAWG(words100k())


def create_bytes_dawg():
    """Build a BytesDAWG mapping each word to its length packed as '<H'."""
    words = words100k()
    # str() wrapper around the format was a Python 2 compatibility relic.
    values = [struct.pack("<H", len(word)) for word in words]
    return dawg.BytesDAWG(zip(words, values))


def create_record_dawg():
    """Build a RecordDAWG ('<H' records) mapping each word to its length."""
    words = words100k()
    values = [[len(word)] for word in words]
    # str() wrapper around the format was a Python 2 compatibility relic.
    return dawg.RecordDAWG("<H", zip(words, values))


def create_int_dawg():
    """Build an IntDAWG mapping each benchmark word to its length."""
    pairs = ((word, len(word)) for word in words100k())
    return dawg.IntDAWG(pairs)


def create_leet_dawg():
    """Build a DAWG over the precomputed 50k leet-substituted words."""
    return dawg.DAWG(LEET_50k)


def benchmark():
print('\n====== Benchmarks (100k unique unicode words) =======\n')
print("\n====== Benchmarks (100k unique unicode words) =======\n")

tests = [
('__getitem__ (hits)', "for word in WORDS100k: data[word]", 'M ops/sec', 0.1, 3),
('get() (hits)', "for word in WORDS100k: data.get(word)", 'M ops/sec', 0.1, 3),
('get() (misses)', "for word in NON_WORDS_10k: data.get(word)", 'M ops/sec', 0.01, 5),
('__contains__ (hits)', "for word in WORDS100k: word in data", 'M ops/sec', 0.1, 3),
('__contains__ (misses)', "for word in NON_WORDS100k: word in data", 'M ops/sec', 0.1, 3),
('items()', 'list(data.items())', ' ops/sec', 1, 1),
('keys()', 'list(data.keys())', ' ops/sec', 1, 1),
# ('values()', 'list(data.values())', ' ops/sec', 1, 1),
(
"__getitem__ (hits)",
"for word in WORDS100k: data[word]",
"M ops/sec",
0.1,
3,
),
("get() (hits)", "for word in WORDS100k: data.get(word)", "M ops/sec", 0.1, 3),
(
"get() (misses)",
"for word in NON_WORDS_10k: data.get(word)",
"M ops/sec",
0.01,
5,
),
(
"__contains__ (hits)",
"for word in WORDS100k: word in data",
"M ops/sec",
0.1,
3,
),
(
"__contains__ (misses)",
"for word in NON_WORDS100k: word in data",
"M ops/sec",
0.1,
3,
),
("items()", "list(data.items())", " ops/sec", 1, 1),
("keys()", "list(data.keys())", " ops/sec", 1, 1),
# ('values()', 'list(data.values())', ' ops/sec', 1, 1),
]

common_setup = """
Expand All @@ -132,19 +161,19 @@ def benchmark():
NON_WORDS_10k = NON_WORDS100k[:10000]
NON_WORDS_1k = ['ыва', 'xyz', 'соы', 'Axx', 'avы']*200
"""
dict_setup = common_setup + 'data = dict((word, len(word)) for word in WORDS100k);'
dawg_setup = common_setup + 'data = create_dawg(); repl = data.compile_replaces(LEET_REPLACES);'
bytes_dawg_setup = common_setup + 'data = create_bytes_dawg();'
record_dawg_setup = common_setup + 'data = create_record_dawg();'
int_dawg_setup = common_setup + 'data = create_int_dawg();'
leet_dawg_setup = common_setup + 'data = create_leet_dawg(); repl = data.compile_replaces(LEET_REPLACES);'
dict_setup = common_setup + "data = dict((word, len(word)) for word in WORDS100k);"
dawg_setup = common_setup + "data = create_dawg(); repl = data.compile_replaces(LEET_REPLACES);"
bytes_dawg_setup = common_setup + "data = create_bytes_dawg();"
record_dawg_setup = common_setup + "data = create_record_dawg();"
int_dawg_setup = common_setup + "data = create_int_dawg();"
leet_dawg_setup = common_setup + "data = create_leet_dawg(); repl = data.compile_replaces(LEET_REPLACES);"

structures = [
('dict', dict_setup),
('DAWG', dawg_setup),
('BytesDAWG', bytes_dawg_setup),
('RecordDAWG', record_dawg_setup),
('IntDAWG', int_dawg_setup),
("dict", dict_setup),
("DAWG", dawg_setup),
("BytesDAWG", bytes_dawg_setup),
("RecordDAWG", record_dawg_setup),
("IntDAWG", int_dawg_setup),
]
for test_name, test, descr, op_count, repeats in tests:
for name, setup in structures:
Expand All @@ -161,79 +190,68 @@ def benchmark():
"for word in WORDS100k[:50000]: data.similar_keys(word, repl)",
setup=dawg_setup,
),
op_count=0.05
op_count=0.05,
)
bench(
"DAWG.similar_keys (l33t)",
timeit.Timer(
"for word in WORDS100k[:50000]: data.similar_keys(word, repl)",
setup=leet_dawg_setup,
),
op_count=0.05
op_count=0.05,
)

for struct_name, setup in structures[1:]:

# prefixes of a given key
_bench_data = [
('hits', 'WORDS100k'),
('mixed', 'MIXED_WORDS100k'),
('misses', 'NON_WORDS100k'),
("hits", "WORDS100k"),
("mixed", "MIXED_WORDS100k"),
("misses", "NON_WORDS100k"),
]

for meth in ['prefixes']:
for meth in ["prefixes"]:
for name, data in _bench_data:
bench(
'%s.%s (%s)' % (struct_name, meth, name),
timeit.Timer(
"for word in %s:\n"
" data.%s(word)" % (data, meth),
setup
),
"%s.%s (%s)" % (struct_name, meth, name),
timeit.Timer("for word in %s:\n" " data.%s(word)" % (data, meth), setup),
runs=3,
)

for meth in ['iterprefixes']:
for meth in ["iterprefixes"]:
for name, data in _bench_data:
bench(
'%s.%s (%s)' % (struct_name, meth, name),
"%s.%s (%s)" % (struct_name, meth, name),
timeit.Timer(
"for word in %s:\n"
" list(data.%s(word))" % (data, meth),
setup
"for word in %s:\n" " list(data.%s(word))" % (data, meth),
setup,
),
runs=3,
)

# keys with a given prefix
_bench_data = [
('xxx', 'avg_len(res)==415', 'PREFIXES_3_1k'),
('xxxxx', 'avg_len(res)==17', 'PREFIXES_5_1k'),
('xxxxxxxx', 'avg_len(res)==3', 'PREFIXES_8_1k'),
('xxxxx..xx', 'avg_len(res)==1.4', 'PREFIXES_15_1k'),
('xxx', 'NON_EXISTING', 'NON_WORDS_1k'),
("xxx", "avg_len(res)==415", "PREFIXES_3_1k"),
("xxxxx", "avg_len(res)==17", "PREFIXES_5_1k"),
("xxxxxxxx", "avg_len(res)==3", "PREFIXES_8_1k"),
("xxxxx..xx", "avg_len(res)==1.4", "PREFIXES_15_1k"),
("xxx", "NON_EXISTING", "NON_WORDS_1k"),
]
for xxx, avg, data in _bench_data:
for meth in ['keys', 'items']:
for meth in ["keys", "items"]:
bench(
'%s.%s(prefix="%s"), %s' % (struct_name, meth, xxx, avg),
timeit.Timer(
"for word in %s: data.%s(word)" % (data, meth),
setup
),
'K ops/sec',
timeit.Timer("for word in %s: data.%s(word)" % (data, meth), setup),
"K ops/sec",
op_count=1,
runs=3,
text_width=60,
)
for meth in ['iterkeys', 'iteritems']:
for meth in ["iterkeys", "iteritems"]:
bench(
'%s.%s(prefix="%s"), %s' % (struct_name, meth, xxx, avg),
timeit.Timer(
"for word in %s: list(data.%s(word))" % (data, meth),
setup
),
'K ops/sec',
timeit.Timer("for word in %s: list(data.%s(word))" % (data, meth), setup),
"K ops/sec",
op_count=1,
runs=3,
text_width=60,
Expand All @@ -247,10 +265,12 @@ def check_dawg(trie, words):
if value != len(words):
raise Exception()


def profiling():
import pstats
import cProfile
print('\n====== Profiling =======\n')

print("\n====== Profiling =======\n")
d = create_bytes_dawg()
WORDS = words100k()

Expand All @@ -260,19 +280,19 @@ def check_getitem(trie, words):

cProfile.runctx("check_getitem(d, WORDS)", globals(), locals(), "Profile.prof")

# def check_prefixes(trie, words):
# for word in words:
# trie.keys(word)
# cProfile.runctx("check_prefixes(d, NON_WORDS_1k)", globals(), locals(), "Profile.prof")
#
#cProfile.runctx("check_trie(d, WORDS)", globals(), locals(), "Profile.prof")
# def check_prefixes(trie, words):
# for word in words:
# trie.keys(word)
# cProfile.runctx("check_prefixes(d, NON_WORDS_1k)", globals(), locals(), "Profile.prof")
#
# cProfile.runctx("check_trie(d, WORDS)", globals(), locals(), "Profile.prof")

s = pstats.Stats("Profile.prof")
s.strip_dirs().sort_stats("time").print_stats(20)


if __name__ == "__main__":
    benchmark()
    # profiling()  # enable to run the cProfile-based profiling pass instead
    print("\n~~~~~~~~~~~~~~\n")
Loading

0 comments on commit d767d76

Please sign in to comment.