-
Notifications
You must be signed in to change notification settings - Fork 0
/
t3_4.py
82 lines (75 loc) · 2.97 KB
/
t3_4.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
#!/usr/bin/env python2
import collections
import functools
import multiprocessing
import os
import string
from glob import glob
from pprint import pprint
def map_func(ignored, minlen, filename):
    """Read a file and return a sequence of (word, 1) values.

    Every punctuation character is replaced by a space, each line is split
    on whitespace and every word is lower-cased. A word is kept only when it
    is purely alphabetic, is not in ``ignored`` and has at least ``minlen``
    characters.

    ignored: words which should be ignored (compared against the
             lower-cased word, so entries should be lower-case)
    minlen: minimal length for word
    filename: file for reading

    Returns a list with one (word, 1) tuple per accepted occurrence, in the
    order the words appear in the file.
    """
    # string.maketrans only exists on Python 2; Python 3 provides the
    # equivalent as str.maketrans. The `or` short-circuits, so
    # str.maketrans is never looked up on Python 2.
    maketrans = getattr(string, 'maketrans', None) or str.maketrans
    tr = maketrans(string.punctuation, ' ' * len(string.punctuation))
    # Build the lookup set once: membership is then O(1) per word instead
    # of a linear scan of the ignore list.
    skipped = frozenset(ignored)
    output = []
    with open(filename, 'rt') as f:
        for line in f:
            for word in line.translate(tr).split():
                word = word.lower()
                if (word.isalpha() and
                        word not in skipped and
                        len(word) >= minlen):
                    output.append((word, 1))
    return output
def map_func_factory(ignored, minlen):
    """Build a one-argument version of map_func.

    The returned callable takes only a filename; ``ignored`` and ``minlen``
    are bound in advance as the first two positional arguments of map_func.

    ignored: value bound to the ignored argument of map_func
    minlen: value bound to the minlen argument of map_func
    """
    # A functools.partial (rather than a lambda or closure) is used so the
    # result can be pickled and handed to multiprocessing workers.
    return functools.partial(map_func, ignored, minlen)
def reduce_func(item):
    """Collapse the partitioned data for one word into a (word, total) tuple.

    item: a (word, counts) pair, where counts is an iterable of numbers

    Returns the word together with the sum of its counts, i.e. the number
    of occurrences.
    """
    key, counts = item
    total = sum(counts)
    return key, total
def partition(mapped_values):
    """Organize the mapped values by their key.

    mapped_values: iterable of (key, value) pairs (map_func output)

    Returns an unsorted list of (key, values) tuples, where ``values`` is
    the list of every value seen for that key, in input order.
    """
    partitioned_data = collections.defaultdict(list)
    for key, value in mapped_values:
        partitioned_data[key].append(value)
    # On Python 3 dict.items() is only a view; wrap it so callers always get
    # the real sequence promised above (on Python 2 this is a cheap copy).
    return list(partitioned_data.items())
def words_counter(path, ext=("*",), ignored=(), minlen=2,
                  num_workers=None, chunksize=1):
    """Count the number of occurrences of each word in the specified files.

    Files are read in parallel by a multiprocessing pool (map step), the
    resulting (word, 1) pairs are grouped by word, and the groups are summed
    by the same pool (reduce step).

    path: directory with files for reading
    ext: extensions of files which should be read ("*" matches any)
    ignored: words which should be ignored
    minlen: minimal length for word
    num_workers: the number of workers to create in the pool
                 (None lets multiprocessing choose)
    chunksize: the portion of the input data to hand to each worker

    Returns a list of (word, count) tuples in no particular order.
    """
    # os.path.join makes the pattern correct whether or not ``path`` ends
    # with a separator. Directories whose names happen to match the pattern
    # are skipped, since they cannot be opened for reading.
    input_files = [
        name
        for e in ext
        for name in glob(os.path.join(path, '*.%s' % e))
        if os.path.isfile(name)
    ]
    new_map_func = map_func_factory(ignored, minlen)
    pool = multiprocessing.Pool(num_workers)
    try:
        map_responses = pool.map(new_map_func, input_files,
                                 chunksize=chunksize)
        # Flatten the per-file lists in one linear pass.
        mapped_values = [pair
                         for response in map_responses
                         for pair in response]
        partitioned_data = partition(mapped_values)
        reduced_values = pool.map(reduce_func, partitioned_data)
    finally:
        # Always release the worker processes, even when a worker raised.
        pool.close()
        pool.join()
    return reduced_values
# Usage example
if __name__ == '__main__':
    # Demo run: count the words of at least two letters in the .py files of
    # one directory, skipping a handful of common words, and pretty-print
    # the resulting (word, count) pairs.
    source_dir = '/home/dzmitry/pycourse-dmoroz/pyttleship/'
    stop_words = ["if", "else", "on", "at"]
    pprint(words_counter(source_dir, ext=["py"], ignored=stop_words,
                         minlen=2))