-
Notifications
You must be signed in to change notification settings - Fork 0
/
run_flick_query.py
255 lines (207 loc) · 12.5 KB
/
run_flick_query.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
#Image querying script written by Tamara Berg,
#and extended heavily by James Hays
#9/26/2007 added dynamic timeslices to query more efficiently.
#8/18/2008 added new fields and set maximum time slice.
#8/19/2008 this is a much simpler function which gets ALL geotagged photos of
# sufficient accuracy. No queries, no negative constraints.
# divides up the query results into multiple files
# 1/5/2009
# now uses date_taken instead of date_upload to get more diverse blocks of images
# 1/13/2009 - uses the original im2gps keywords, not as negative constraints though
import math
import socket
import string
import sys
import time
from calendar import calendar
from calendar import timegm
from datetime import datetime

from flickrapi2 import FlickrAPI
from flickr_api_key import *
def convertDate(date_string):
    """Convert a 'DD/MM/YYYY' date string to a UTC epoch timestamp.

    The date is interpreted as midnight UTC on that day (matching the
    format of the startDate/endDate defaults in run_flickr_query).

    Args:
        date_string: date in day/month/year order, e.g. "1/1/2010".

    Returns:
        Integer seconds since the Unix epoch.

    Raises:
        ValueError: if the string is not three '/'-separated integer fields
            forming a valid calendar date.
    """
    # Fields are day-first, per the "31/12/2011" style used by callers.
    day, month, year = (int(tok) for tok in date_string.split('/'))
    d = datetime(year, month, day, 0, 0)
    # timegm interprets the struct_time as UTC (time.mktime would apply the
    # local timezone). The original body called datetime.datetime(...) and
    # calendar.timegm(...), both of which crash under this module's
    # `from datetime import datetime` / `from calendar import calendar`
    # imports; use the names actually in scope instead.
    return timegm(d.utctimetuple())
def run_flickr_query(query_string, max_photos = 1000, startDate = "1/1/2010", endDate = "31/12/2011"):
    """Download metadata for Flickr photos matching query_string.

    Walks a hard-coded time range in adaptively sized slices: for each
    slice it binary-searches the slice's upper time bound until the query
    returns roughly `desired_photos` results, then pages through that
    slice (up to 16 pages) and writes one metadata record per photo to
    `out_file`.

    NOTE(review): startDate and endDate are accepted but never used —
    the range is hard-coded via mintime/endtime below; confirm whether
    they should feed convertDate().
    NOTE(review): out_file is not defined in this function or anywhere
    in the visible module — presumably a module-level file object opened
    by the caller; confirm before running.
    """
    socket.setdefaulttimeout(30)  #30 second time out on sockets before they throw
    #an exception. I've been having trouble with urllib.urlopen hanging in the
    #flickr API. This will show up as exceptions.IOError.
    #the time out needs to be pretty long, it seems, because the flickr servers can be slow
    #to respond to our big searches.
    ###########################################################################
    # Modify this section to reflect your data and specific search
    ###########################################################################
    # flickr auth information:
    # change these to your flickr api keys and secret
    # make a new FlickrAPI instance
    fapi = FlickrAPI(flickrAPIKey, flickrSecret)
    print '\n\nquery_string is ' + query_string
    total_images_queried = 0;
    # number of seconds to skip per query
    #timeskip = 62899200 #two years
    timeskip = 604800  #one week; also re-derived from the last slice width below
    #timeskip = 172800 #two days
    #timeskip = 86400 #one day
    #timeskip = 3600 #one hour
    #timeskip = 2257 #for resuming previous query
    #mintime = 1121832000 #from im2gps
    #mintime = 1167407788 # resume crash england
    #mintime = 1177828976 #resume crash japan
    #mintime = 1187753798 #resume crash greece
    #mintime = 1171416400 #resume crash WashingtonDC
    mintime = 1287878400  # 10/24/2010 — start of the queried time range
    maxtime = mintime+timeskip
    #endtime = 1192165200 #10/12/2007, at the end of im2gps queries
    endtime = 1351100325  # 10/24/2012 — end of the queried time range
    #this is the desired number of photos in each block
    desired_photos = 250
    print "hola"
    print datetime.fromtimestamp(mintime)
    print datetime.fromtimestamp(endtime)
    total_image_num = 0
    while (maxtime < endtime):
        #new approach - adjust maxtime until we get the desired number of images
        #within a block. We'll need to keep upper bounds and lower
        #lower bound is well defined (mintime), but upper bound is not. We can't
        #search all the way from endtime.
        lower_bound = mintime + 900  #lower bound OF the upper time limit. must be at least 15 minutes or zero results
        upper_bound = mintime + timeskip * 20  #upper bound of the upper time limit
        # Start the search close to the lower bound (95/5 weighting) since
        # most slices need to shrink rather than grow.
        maxtime = .95 * lower_bound + .05 * upper_bound
        print '\nBinary search on time range upper bound'
        print 'Lower bound is ' + str(datetime.fromtimestamp(lower_bound))
        print 'Upper bound is ' + str(datetime.fromtimestamp(upper_bound))
        if total_image_num > max_photos:
            # Global cap reached; stop walking the time range entirely.
            print 'Number of photos %d > %d limit.' % (total_image_num, max_photos)
            break
        keep_going = 6  #search stops after a fixed number of iterations
        while( keep_going > 0 and maxtime < endtime):
            try:
                # Probe query: page 1 only, just to read the result count
                # for the current [mintime, maxtime] slice.
                rsp = fapi.photos_search(api_key=flickrAPIKey,
                                         ispublic="1",
                                         media="photos",
                                         per_page="250",
                                         page="1",
                                         #has_geo = "1", #bbox="-180, -90, 180, 90",
                                         text=query_string,
                                         #accuracy="6", #6 is region level. most things seem 10 or better.
                                         min_upload_date=str(mintime),
                                         max_upload_date=str(maxtime))
                ##min_taken_date=str(datetime.fromtimestamp(mintime)),
                ##max_taken_date=str(datetime.fromtimestamp(maxtime)))
                #we want to catch these failures somehow and keep going.
                time.sleep(1)  # crude rate limiting between API calls
                fapi.testFailure(rsp)
                total_images = rsp.photos[0]['total'];
                # Fail fast here if the count isn't numeric, rather than
                # deep inside the paging loop below.
                null_test = int(total_images);  #want to make sure this won't crash later on for some reason
                null_test = float(total_images);
                print '\nnumimgs: ' + total_images
                print 'mintime: ' + str(mintime) + ' maxtime: ' + str(maxtime) + ' timeskip: ' + str(maxtime - mintime)
                # Binary-search step: shrink or grow the slice toward
                # desired_photos results.
                if( int(total_images) > desired_photos ):
                    print 'too many photos in block, reducing maxtime'
                    upper_bound = maxtime
                    maxtime = (lower_bound + maxtime) / 2  #midpoint between current value and lower bound.
                if( int(total_images) < desired_photos):
                    print 'too few photos in block, increasing maxtime'
                    lower_bound = maxtime
                    maxtime = (upper_bound + maxtime) / 2
                print 'Lower bound is ' + str(datetime.fromtimestamp(lower_bound))
                print 'Upper bound is ' + str(datetime.fromtimestamp(upper_bound))
                if( int(total_images) > 0):  #only if we're not in a degenerate case
                    keep_going = keep_going - 1
                else:
                    # Zero results: widen the window instead of burning an
                    # iteration, so empty stretches of time are skipped.
                    upper_bound = upper_bound + timeskip;
            except KeyboardInterrupt:
                print('Keyboard exception while querying for images, exiting\n')
                raise
            except:
                # NOTE(review): bare except — any API/socket error is
                # logged and the search step is retried with the same
                # bounds; keep_going is NOT decremented on failure.
                print sys.exc_info()[0]
                #print type(inst) # the exception instance
                #print inst.args # arguments stored in .args
                #print inst # __str__ allows args to printed directly
                print ('Exception encountered while querying for images\n')
        #end of while binary search
        print 'finished binary search'
        s = '\nmintime: ' + str(mintime) + ' maxtime: ' + str(maxtime)
        print s
        out_file.write(s + '\n')
        # NOTE(review): rsp is unbound here if every probe attempt above
        # raised before assignment (NameError); also may hold the previous
        # slice's response if this slice's queries all failed.
        i = getattr(rsp,'photos',None)
        if i:
            s = 'numimgs: ' + total_images
            print s
            out_file.write(s + '\n')
            current_image_num = 1;
            num = int(rsp.photos[0]['pages'])
            s = 'total pages: ' + str(num)
            print s
            out_file.write(s + '\n')
            #only visit 16 pages max, to try and avoid the dreaded duplicate bug
            #16 pages = 4000 images, should be duplicate safe. Most interesting pictures will be taken.
            num_visit_pages = min(16,num)
            s = 'visiting only ' + str(num_visit_pages) + ' pages ( up to ' + str(num_visit_pages * 250) + ' images)'
            print s
            out_file.write(s + '\n')
            total_images_queried = total_images_queried + min((num_visit_pages * 250), int(total_images))
            #print 'stopping before page ' + str(int(math.ceil(num/3) + 1)) + '\n'
            pagenum = 1;
            while( pagenum <= num_visit_pages ):
                #for pagenum in range(1, num_visit_pages + 1): #page one is searched twice
                print ' page number ' + str(pagenum)
                try:
                    # Full query for this page: same slice bounds as the
                    # probe, now sorted and with the metadata extras.
                    rsp = fapi.photos_search(api_key=flickrAPIKey,
                                             ispublic="1",
                                             media="photos",
                                             per_page="250",
                                             page=str(pagenum),
                                             sort="interestingness-desc",
                                             #has_geo = "1", #bbox="-180, -90, 180, 90",
                                             text=query_string,
                                             #accuracy="6", #6 is region level. most things seem 10 or better.
                                             extras = "tags, original_format, license, geo, date_taken, date_upload, o_dims, views",
                                             min_upload_date=str(mintime),
                                             max_upload_date=str(maxtime))
                    ##min_taken_date=str(datetime.fromtimestamp(mintime)),
                    ##max_taken_date=str(datetime.fromtimestamp(maxtime)))
                    time.sleep(1)
                    fapi.testFailure(rsp)
                except KeyboardInterrupt:
                    print('Keyboard exception while querying for images, exiting\n')
                    raise
                except:
                    print sys.exc_info()[0]
                    #print type(inst) # the exception instance
                    #print inst.args # arguments stored in .args
                    #print inst # __str__ allows args to printed directly
                    print ('Exception encountered while querying for images\n')
                else:
                    # and print them
                    # try/else: this branch (and the page increment) runs
                    # only when the page query raised no exception, so a
                    # failed page is retried rather than skipped.
                    k = getattr(rsp,'photos',None)
                    if k:
                        m = getattr(rsp.photos[0],'photo',None)
                        if m:
                            for b in rsp.photos[0].photo:
                                if b!=None:
                                    # One multi-line record per photo.
                                    # NOTE(review): fields like originalsecret /
                                    # o_height are absent for many photos;
                                    # a KeyError here would abort the page.
                                    out_file.write('photo: ' + b['id'] + ' ' + b['secret'] + ' ' + b['server'] + ' ' + b['farm'] + '\n')
                                    out_file.write('owner: ' + b['owner'] + '\n')
                                    out_file.write('title: ' + b['title'].encode("ascii","replace") + '\n')
                                    out_file.write('originalsecret: ' + b['originalsecret'] + '\n')
                                    out_file.write('originalformat: ' + b['originalformat'] + '\n')
                                    out_file.write('o_height: ' + b['o_height'] + '\n')
                                    out_file.write('o_width: ' + b['o_width'] + '\n')
                                    out_file.write('datetaken: ' + b['datetaken'].encode("ascii","replace") + '\n')
                                    out_file.write('dateupload: ' + b['dateupload'].encode("ascii","replace") + '\n')
                                    out_file.write('tags: ' + b['tags'].encode("ascii","replace") + '\n')
                                    out_file.write('license: ' + b['license'].encode("ascii","replace") + '\n')
                                    out_file.write('latitude: ' + b['latitude'].encode("ascii","replace") + '\n')
                                    out_file.write('longitude: ' + b['longitude'].encode("ascii","replace") + '\n')
                                    out_file.write('accuracy: ' + b['accuracy'].encode("ascii","replace") + '\n')
                                    out_file.write('views: ' + b['views'] + '\n')
                                    out_file.write('interestingness: ' + str(current_image_num) + ' out of ' + str(total_images) + '\n');
                                    out_file.write('\n')
                                    current_image_num = current_image_num + 1;
                                    total_image_num = total_image_num + 1;
                    pagenum = pagenum + 1;  #this is in the else exception block. It won't increment for a failure.
        #this block is indented such that it will only run if there are no exceptions
        #in the original query. That means if there are exceptions, mintime won't be incremented
        #and it will try again
        timeskip = maxtime - mintime  #used for initializing next binary search
        mintime = maxtime
    out_file.write('Total images queried: ' + str(total_images_queried) + '\n')
    # NOTE(review): missing parentheses — this references the bound method
    # but never calls it, so the file is not actually closed/flushed here;
    # should be out_file.close().
    out_file.close