-
Notifications
You must be signed in to change notification settings - Fork 0
/
qproberGeneric.py
107 lines (71 loc) · 3.39 KB
/
qproberGeneric.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import re
import json
import urllib
import os,sys,subprocess
from collections import defaultdict
import summaryGen
def pingBing(query):
"""Method which queries Bing with string 'query' and returns the total number of hits for that particular query """
urlStringPart1 = "http://api.bing.net/json.aspx?AppId="
AppId = "9AAF5A7F25051B931FC1E0B32D74A6DF42727EE3"
urlStringPart2 = "&Version=2.2&Market=en-US&Query=" + query + "&Sources=web&JsonType=raw"
URL = urlStringPart1 + AppId + urlStringPart2
response = urllib.urlopen(URL)
receive = json.load(response)
return receive['SearchResponse']['Web']['Total']
def analyzeDB(directoryFile,dbURL,tc,ts,path):
try:
f1 = open(directoryFile)
except Exception:
return '->' + os.path.splitext(directoryFile)[0]
print repr(Exception)
print '\nProbing for Classification as ' + os.path.splitext(directoryFile)[0] + ' Database\n'
rootLines = f1.readlines()
f1.close()
subcategories = set()
for line in rootLines:
subcategories.add(line.split()[0])
subcategoriesList = list(subcategories)
coverage = [0]*len(subcategories)
specificity = [0.0]*len(subcategories)
for i,line in enumerate(rootLines):
keyTerms = re.split('\W+',line.rstrip())
keyWords = '+'.join(keyTerms[1:])
numOfHits= pingBing(dbURL + keyWords)
print '\nQuerying with ' + keyWords + '\n'
for i,subcat in enumerate(subcategoriesList):
if subcat in line:
coverage[i] += numOfHits
print '\n' + repr(subcategoriesList) + ' = ' + repr(coverage) + '\n'
totalCoverage = float(sum(coverage))
for i,subcat in enumerate(subcategoriesList):
specificity[i] = float(coverage[i])/totalCoverage
print '\n======================================\n'
print 'Database Stats\n'
print 'Coverage of ' + repr(subcategoriesList) + ' = ' + repr(coverage) + '\n'
print 'Specificity of '+ repr(subcategoriesList) + ' = ' + repr(specificity) + '\n'
print '\n======================================\n'
path = '->' + os.path.splitext(directoryFile)[0].lower() + path
if all(spec > ts for spec in specificity) and all(cover > tc for cover in coverage):
return analyzeDB('computers.txt',dbURL,tc,ts,path) + analyzeDB('sports.txt',dbURL,tc,ts,path) + analyzeDB('health.txt',dbURL,tc,ts,path)
elif (len([spec > ts for spec in specificity]) > 0) and (len([cover > tc for cover in coverage]) > 0):
for i,spec in enumerate(specificity):
if spec > ts and coverage[i] > tc:
return path + analyzeDB(subcategoriesList[i].lower() + '.txt',dbURL,tc,ts,'')
return path
if __name__== "__main__":
tc = -1 #Initial Coverage Threshold
ts = -1 #Initial Specificity
while tc < 1:
tc = raw_input("\nEnter the Coverage Threshold, tc > 1\n")
tc = float(tc)
while(ts < 0 or ts>1):
ts = raw_input("\nEnter the Specificity Threshold, 0 < ts < 1\n")
ts = float(ts)
dbURL = raw_input("\nPlease enter the database URL\n")
dbURL = re.sub('http:\\\\www\.','',dbURL)
dbURL = 'site:' + dbURL + ' '
analysis = analyzeDB('root.txt',dbURL,tc,ts,'')
print analysis
directoryList = analysis.split('->')[1:]
summaryGen.summarizeDB(directoryList,dbURL)