-
Notifications
You must be signed in to change notification settings - Fork 1
/
tlgcatgraph.py
103 lines (92 loc) · 4.11 KB
/
tlgcatgraph.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
#!/usr/bin/python
# task list generator - interface to catgraph
import time
import requests
from gp import *
from utils import *
## look up the host for a graph via the hostmap HTTP service.
# sends a GET request to http://sylvester/hostmap/<graphname>.
# @param graphname The graph name, appended to the hostmap URL as-is.
# @return the response body text if the service answers with HTTP 200, None for any other status.
def FindCGHost(graphname):
    response = requests.get('http://sylvester/hostmap/%s' % graphname)
    return response.text if response.status_code == 200 else None
## interface to one catgraph graph, accessed through a gp client connection.
# the connection object is kept in self.gp; self.wikiname is the graph name with '_p' appended
# and is what gets passed to getCategoryID() for category lookups.
class CatGraphInterface:
    ## connect to the graph.
    # @param host Host name handed to client.ClientTransport.
    # @param port Port handed to client.ClientTransport.
    # @param graphname Name of the graph, e.g. 'dewiki'. Required, despite the None default.
    # @throws TypeError if graphname is missing.
    def __init__(self, host='ortelius.toolserver.org', port=6666, graphname=None):
        # check this before connecting: the wiki name is derived from the graph name, so a
        # missing graph name can never work and should not leave an open connection behind.
        if graphname is None:
            raise TypeError('graphname is required (e.g. "dewiki")')
        self.graphname = graphname
        self.wikiname = graphname + '_p'
        self.gp = client.Connection(client.ClientTransport(host, port), graphname)
        self.gp.connect()

    ## get everything the graph reports below a category.
    # the category is resolved with getCategoryID(), then capture_traverse_successors() is
    # called on the connection with the resulting ID and the given depth.
    # @param category The category name.
    # @param depth The depth passed on to capture_traverse_successors().
    # @return list holding the first field of each returned tuple (presumably page IDs - TODO confirm).
    #         empty list if the traversal returns nothing.
    # @throws InputValidationError if getCategoryID() returns None for the category.
    def getPagesInCategory(self, category, depth=2):
        catID = getCategoryID(self.wikiname, category)
        if catID is None:
            # category not found.
            raise InputValidationError(_('Category %s not found in database %s.') % (category, self.wikiname))
        successors = self.gp.capture_traverse_successors(catID, depth)
        # result can be None for empty categories
        if not successors:
            return []
        # convert list of tuples to simple list
        return [row[0] for row in successors]

    ## execute a search engine-style string
    # categories are separated by ';'. spaces inside a category name are replaced by underscores.
    # operators '+' (intersection) and '-' (difference) are supported, no operator means union.
    # e. g. "Biology; Art; +Apes; -Cats" searches for everything in Biology or Art and in Apes, not in Cats
    # search parameters are evaluated from left to right, i.e. results might differ depending on order.
    # on the first category, a '+' operator is treated like a plain union, while a '-' operator
    # is skipped entirely (the category is not even looked up), leaving the result empty so far.
    # the "depth" parameter is applied to each category.
    # @param string The search string.
    # @param depth The search depth.
    # @return list of the resulting entries, in no particular order (built from a set).
    # @throws InputValidationError on an empty category name, or if a category is not found.
    # --- this method isn't used in ALG any more, see tlgbackend.py/evalQueryString instead
    def executeSearchString(self, string, depth):
        # todo: something like "Category|3" to override search depth
        # todo: it would be cool to have this command in graphcore, possibly using threads for each category.
        result = set()
        for n, param in enumerate(string.split(';')):
            param = param.strip()
            if not param:
                raise InputValidationError(_('Empty category name specified.'))

            if param[0] in '+-':
                op = param[0]
                category = param[1:].strip().replace(' ', '_')
            else:
                op = '|'
                category = param.replace(' ', '_')

            first = (n == 0)
            if op == '-':
                # '-' on first category has no effect
                if not first:
                    result -= set(self.getPagesInCategory(category, depth))
                    dprint(2, ' - "%s"' % category)
            elif op == '+' and not first:
                result &= set(self.getPagesInCategory(category, depth))
                dprint(2, ' & "%s"' % category)
            else:
                # plain union. '+' on first category should do the expected thing, so it ends up here too
                result |= set(self.getPagesInCategory(category, depth))
                dprint(2, ' | "%s"' % category)
        return list(result)
if __name__ == '__main__':
    # manual timing check against the 'dewiki' graph: first time the raw successor traversals
    # for a few categories ("traw"), then time the same categories combined through
    # executeSearchString ("tcooked").
    # imported here because the module itself never imports sys explicitly.
    import sys

    cg = CatGraphInterface(graphname='dewiki')
    #~ catID= getCategoryID(cg.wikiname, '!Hauptkategorie')
    #~ print cg.gp.capture_traverse_successors(catID, 1)
    depth = 5

    t = time.time()
    for category in ['Biologie', 'Katzen', 'Foo', 'Astrobiologie']:
        catID = getCategoryID(cg.wikiname, category)
        # categories that cannot be resolved are skipped in the raw run
        if catID:
            cg.gp.capture_traverse_successors(catID, depth)
    traw = time.time() - t

    search = '+Biologie; -Katzen; -Astrobiologie; Foo'
    # print with a single parenthesized argument: same output as a print statement and as print()
    print("searching for '%s'..." % search)
    sys.stdout.flush()
    t = time.time()
    # do not call this 'set': that would shadow the builtin, which executeSearchString relies on
    pages = cg.executeSearchString(search, depth)
    # stop the clock before printing, so output time is not counted as search time
    tcooked = time.time() - t
    #print pages
    print("search found %d pages" % (len(pages)))
    print("traw: %s tcooked: %s" % (traw, tcooked))