-
Notifications
You must be signed in to change notification settings - Fork 0
/
summaryGen.py
85 lines (61 loc) · 2.55 KB
/
summaryGen.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import os,sys,subprocess
import re
import urllib
import json
from collections import defaultdict
def returnTop4URLs(query):
"""Method which queries Bing with string 'query' and returns the total number of hits for that particular query """
urlStringPart1 = "http://api.bing.net/json.aspx?AppId="
AppId = "9AAF5A7F25051B931FC1E0B32D74A6DF42727EE3"
urlStringPart2 = "&Version=2.2&Market=en-US&Query=" + query + "&Sources=web&Web.Count=20&JsonType=raw"
URL = urlStringPart1 + AppId + urlStringPart2
response = urllib.urlopen(URL)
receive = json.load(response)
numResults = receive['SearchResponse']['Web']['Total']
urlList = list()
i = 0
#import code;code.interact(local = locals())
while((i < 20) and (i < numResults) and (len(urlList) < 4)):
url = receive['SearchResponse']['Web']['Results'][i]['Url']
if url.endswith('.pdf') or url.endswith('.ppt'):
i = i + 1
continue
urlList.append(url)
i = i + 1
return urlList
def summarizeDB(directoryTraversalList, dbURL):
"""Method which generates content summary of a database based on a small 'sample' of the pages or documents in the database"""
for directory in directoryTraversalList:
try:
f1 = open(directory + '.txt','r')
except Exception:
print '\nNo directory-file by the name of ' + directory + ' exists, program will exit\n'
sys.exit('System will terminate')
rootLines = f1.readlines()
masterDict = defaultdict(int)
for i,line in enumerate(rootLines):
keyTerms = re.split('\W+',line.rstrip())
keyWords = '+'.join(keyTerms[1:])
urlList = returnTop4URLs(dbURL + keyWords)
print 'Fetching webpages with URLs corresponding to keywords: ' + ' '.join(keyTerms[1:]) + '\n'
#import code; code.interact(local = locals())
for url in urlList:
print '\tFetching URL :\t' + url + '\n'
fetchProcess = subprocess.Popen(['lynx','-dump',url],stdout = subprocess.PIPE)
textData,err = fetchProcess.communicate()
refPoint = textData.find('\nReferences\n')
preRefData = textData[:refPoint].lower()
wordList = re.findall('[a-zA-Z]+',preRefData)
for word in wordList:
masterDict[word] += 1
#print sorted(masterDict.items())
f2 = open('sample-' + directory + '.txt','w')
for word,count in sorted(masterDict.items()):
f2.write(word + ' ' + str(count) + '\n')
f1.close()
f2.close()
if __name__=="__main__":
dbURL = raw_input("\nPlease enter the database URL\n")
dbURL = re.sub('http:\\\\www\.','',dbURL)
dbURL = 'site:' + dbURL + ' '
summarizeDB(['testSummary'],dbURL)