-
Notifications
You must be signed in to change notification settings - Fork 5
/
fwc.py
executable file
·60 lines (39 loc) · 1.92 KB
/
fwc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
#!/usr/bin/python2.7
import argparse
import sys
from focused_web_crawler import FocusedWebCrawler
import logging
import code
import yaml
from constraint import Constraint
def main():
    """Crawl the sites listed in a YAML file and save what was discovered.

    Usage: ``fwc.py input.yaml output.yaml``

    The input file is loaded with PyYAML and iterated; every entry gets a
    fresh ``Constraint`` stored under the ``'constraint'`` key and is then
    put on the crawler's queue.  (Presumably each entry is a mapping with a
    ``'url'`` and an optional ``'key'`` -- TODO confirm against
    FocusedWebCrawler.)  After the crawler has been started and joined,
    ``fwc.collection`` is dumped to the output file in YAML block style.

    Exits through ``argparse`` (status 2) when the arguments are invalid or
    the input file is an empty YAML document.
    """
    ap = argparse.ArgumentParser(
        description='Discover web resources associated with a site.')
    ap.add_argument('input', metavar='input.yaml', type=str, nargs=1,
                    help='YAML file indicating the sites to crawl.')
    ap.add_argument('output', metavar='output.yaml', type=str, nargs=1,
                    help='YAML file with the web resources discovered.')
    args = ap.parse_args()

    # NOTE(security): yaml.load() without an explicit Loader can construct
    # arbitrary Python objects from tagged YAML.  If the input file may come
    # from an untrusted source, switch to yaml.safe_load() -- left unchanged
    # here so existing input files keep loading exactly as before.
    # The context manager closes the file handle, which was previously leaked.
    with open(args.input[0], "rt") as source:
        sites = yaml.load(source)

    # An empty YAML document loads as None; report that clearly instead of
    # failing later with "TypeError: 'NoneType' object is not iterable".
    if sites is None:
        ap.error('%s contains no sites to crawl.' % args.input[0])

    fwc = FocusedWebCrawler()
    for site in sites:
        # Each entry gets its own Constraint instance.
        site['constraint'] = Constraint()
        fwc.queue.put(site)

    fwc.start()
    fwc.join()

    with open(args.output[0], "wt") as s:
        yaml.dump(fwc.collection, s, default_flow_style=False)
# Run the crawler only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()