forked from Morail/wiki-network
-
Notifications
You must be signed in to change notification settings - Fork 10
/
countwords-groups-sender.py
98 lines (72 loc) · 2.28 KB
/
countwords-groups-sender.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
#!/usr/bin/env python2.6
#coding=utf-8
"""
currently does not work
"""
import nltk
import sys
from operator import itemgetter
# User roles; each one gets its own frequency distribution (see ``fd``).
classes = ('anonymous', 'bot', 'bureaucrat', 'sysop', 'normal user')

# Italian stopword list shipped with the NLTK corpora.
stopwords = nltk.corpus.stopwords.words('italian')

# NOTE(review): PunktWordTokenizer may not exist in recent NLTK releases --
# confirm which NLTK version this script targets.
tokenizer = nltk.PunktWordTokenizer()

## dictionary of frequency distributions: role name -> its own FreqDist
fd = dict((role, nltk.FreqDist()) for role in classes)

# Stopwords as dict keys (all values 0) so membership tests are hash lookups.
dstpw = dict.fromkeys(stopwords, 0)
def freq_dist(cls, msg):
    """Add the words of one message to the frequency distribution of ``cls``.

    cls -- key into the module-level ``fd`` dictionary.
    msg -- message text; it is lower-cased and passed through
           ``nltk.clean_html`` before being split by the module-level
           ``tokenizer``.

    Tokens of two characters or fewer, and tokens that are keys of
    ``dstpw``, are dropped; the remaining ones update ``fd[cls]``.
    """
    # No ``global`` statement needed: fd is mutated in place, never rebound.
    cleaned = nltk.clean_html(msg.lower())
    words = tokenizer.tokenize(cleaned)
    text = nltk.Text(
        word for word in words
        if word not in dstpw and len(word) > 2
    )
    fd[cls].update(text)
def iter_csv(filename, _hasHeader=False):
    """Yield every row of a CSV file as a dict.

    filename   -- path of the CSV file to read.
    _hasHeader -- when true, the first row is consumed as the list of field
                  names and each yielded dict maps field name -> value;
                  otherwise (or if the header row is empty) each dict maps
                  the zero-based column index -> value.

    The file name is printed before reading. If the file cannot be opened,
    the error is printed and the process exits with status 1. A file with
    no rows yields nothing. With a header, a data row that has more fields
    than the header raises IndexError.

    The file is closed as soon as iteration ends for any reason: exhaustion,
    an exception, or the consumer closing/abandoning the generator.
    """
    from csv import reader

    print('Reading from %s' % (filename,))
    try:
        # The csv module needs binary mode on Python 2 and text mode with
        # newline='' on Python 3.
        if sys.version_info[0] >= 3:
            cf = open(filename, 'r', newline='')
        else:
            cf = open(filename, 'rb')
    except IOError as e:
        print(e)
        sys.exit(1)

    try:
        lines = reader(cf)
        fieldNames = None
        if _hasHeader:
            # next() with a default: an empty file has no header row, and
            # letting StopIteration escape a generator body is an error on
            # modern interpreters.
            fieldNames = next(lines, None)
            if fieldNames is None:
                return
        for row in lines:
            d = {}
            for i, f in enumerate(row):
                if fieldNames:
                    d[fieldNames[i]] = f
                else:
                    d[i] = f
            yield d
    finally:
        # Runs on exhaustion, on error and on generator close alike.
        cf.close()
def iter_roletext(iterator):
    """Lazily map row dicts to ``(role, message)`` tuples.

    iterator -- iterable of mappings, each holding the keys
                "Owner's role" and "original message".

    Returns an iterator of 2-tuples
    ``(row["Owner's role"], row["original message"])``. A row missing
    either key raises KeyError when that row is reached.
    """
    # An earlier variant of this function keyed on "Writer's role" instead
    # of "Owner's role" -- presumably another column of the same sheet;
    # verify against the input data before switching.
    pick = itemgetter("Owner's role", "original message")
    # Generator expression rather than itertools.imap: equally lazy, and
    # not tied to a Python-2-only name.
    return (pick(row) for row in iterator)
def main():
    """Count words per user role and write one ranking file per role.

    Command line: ``src_file dest_dir``; exits through the option parser's
    error handler when either argument is missing.

    Rows of ``src_file`` whose "template: welcome 1=yes; 0=no" column equals
    "0" and whose role is non-empty are fed to ``freq_dist``. Afterwards,
    for every entry of ``classes``, the file ``<dest_dir>/<role>.dat`` (spaces
    in the role replaced by underscores) receives one "count word" line
    per sample, sorted by decreasing count.
    """
    from optparse import OptionParser

    parser = OptionParser(usage="usage: %prog src_file dest_dir")
    positional = parser.parse_args()[1]
    if len(positional) < 2:
        parser.error('Missing arguments')
    src = positional[0]    # source file name
    dest = positional[1]   # dest dir name

    # Keep only the messages that are not welcome templates.
    welcome_column = "template: welcome 1=yes; 0=no"
    plain_rows = (
        row for row in iter_csv(src, True)
        if row[welcome_column] == "0"
    )
    for role, message in iter_roletext(plain_rows):
        if not role:
            # Rows without a role cannot be assigned to any distribution.
            continue
        freq_dist(role, message)

    for cls in classes:
        target = "%s/%s.dat" % (dest, cls.replace(' ', '_'))
        with open(target, 'w') as out:
            ranking = sorted(fd[cls].items(), key=itemgetter(1),
                             reverse=True)
            for word, count in ranking:
                print >> out, count, word
# Run the command-line tool only when executed as a script, not on import.
if __name__ == "__main__":
    main()