Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Overlap and fetched at differences #2

Open
wants to merge 13 commits into
base: master
Choose a base branch
from
190 changes: 139 additions & 51 deletions feedVersionsOverlap.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import time
import sys
import requests
import json
Expand All @@ -7,6 +8,7 @@

## threshold for detecting a tail
THRESHOLD = 0.15
APIKEY = sys.argv[1]

# remove tail end
def cleanTails(updatedScheduledService, end):
Expand Down Expand Up @@ -49,10 +51,9 @@ def findStartAndEndDates(updatedScheduledService):

# convert an dictionary of strings to a dictionary of datetime objects
def convertToDateTime(scheduled_service):

updatedScheduledService = {}
for date in scheduled_service:
official_date = datetime.datetime.strptime(str(date), '%Y-%m-%d')
official_date = toDateTime(date)
updatedScheduledService[official_date] = scheduled_service[date]

return updatedScheduledService
Expand All @@ -70,10 +71,10 @@ def findAverageServiceHours(updatedScheduledService):
# find and interpret schedule by
# determining correct start and end dates
# converting to datetime objects
def interpretSchedule(element):
def interpretSchedule(feedVersion, element):
if 'data' in element and not 'error' in element['data']:
sha1 = element['feed_version_sha1']
identification = element['id']
print "interpret:", sha1

scheduled_service = element['data']['scheduled_service']

Expand All @@ -84,34 +85,87 @@ def interpretSchedule(element):
updatedStart = cleanTails(updatedScheduledService, False)
updatedEnd = cleanTails(updatedScheduledService, True)

if not feedVersion.get('fetched_at'):
print "\tno fetched_at, skipping"
return
fetchedAt = toDateTime(feedVersion['fetched_at'][:10])
if (fetchedAt - updatedStart).days > 30:
print "\tset updatedStart %s to fetchedAt %s"%(updatedStart, fetchedAt)
updatedStart = fetchedAt

print "\tfetchedAt:", fetchedAt
print "\toriginalStart:", start
print "\tupdatedStart:", updatedStart
print "\tstart shift:", (updatedStart - start).days
print "\toriginalEnd:", end
print "\tupdatedEnd:", updatedEnd
print "\tend shift:", (updatedEnd - end).days
print "\toriginal duration:", (end - start).days
print "\tupdated duration:", (updatedEnd - updatedStart).days
print ""

rowInfo = {
"ID": identification,
"currentSha1": sha1,
"originalStart": start,
"originalEnd": end,
"updatedStart": updatedStart,
"updatedEnd": updatedEnd,
"fetchedAt": fetchedAt
}

if (sha1, id, updatedStart, updatedEnd):
if (sha1, updatedStart, updatedEnd):
return rowInfo

def writeToCSV (filename, status):

headerRow = ['ID', 'currentSha1', 'nextSha1', 'originalStart', 'originalEnd', 'updatedStart', 'updatedEnd', 'overlapStart',
'overlapEnd', 'overlapDifference', 'gapStart', 'gapEnd', 'gapDifference', 'startDifference']
def toDateTime(value):
return datetime.datetime.strptime(str(value), '%Y-%m-%d')

def writeToCSV (filename, status, header=None):
header = header or status[0].keys()
with open(filename, 'w') as f:
writer = csv.DictWriter(f, fieldnames=headerRow)
writer = csv.DictWriter(f, fieldnames=header)
writer.writeheader()
for elem in status:
writer.writerow(elem)


# find overlaps and gaps in feed versions
def findOverlap2(interpretedSchedule):
interpretedSchedule = sorted(interpretedSchedule, key = lambda x: (x['fetchedAt']))
status = []
for current,next_ in zip(interpretedSchedule[:-1], interpretedSchedule[1:]):
print "compare: %s -> %s"%(current['currentSha1'], next_['currentSha1'])
print "\tcurrent:"
print current
print "\tnext:"
print next_

overlapDays = (current['updatedEnd'] - next_['updatedStart']).days
totalTime = (next_['updatedEnd'] - current['updatedStart']).days
overlapPercent = float(overlapDays) / float(totalTime)
print "\toverlapDays:", overlapDays
print "\ttotalTime:", totalTime
print "\toverlapPercent:", overlapPercent
print ""

overlapObject = {
"currentSha1": current['currentSha1'],
"nextSha1": next_['currentSha1'],
"originalStart": current['originalStart'].strftime('%Y-%m-%d'),
"originalEnd": current['originalEnd'].strftime('%Y-%m-%d'),
"updatedStart": current['updatedStart'].strftime('%Y-%m-%d'),
"updatedEnd": current['updatedEnd'].strftime('%Y-%m-%d'),
"overlap": overlapDays,
"startDifference": (next_['updatedStart'] - current['updatedStart']).days,
"fetchedDifference": (next_['fetchedAt'] - current['fetchedAt']).days,
"overlapPercent": overlapPercent,
}
status.append(overlapObject)

return status

def findOverlap (interpretedSchedule):

interpretedSchedule = sorted(interpretedSchedule, key = lambda x: (x['updatedStart'], x['updatedEnd']))
interpretedSchedule = sorted(interpretedSchedule, key = lambda x: (x['fetchedAt']))

currentIndex = 0
nextIndex = 1
Expand All @@ -137,7 +191,6 @@ def findOverlap (interpretedSchedule):
# status.append("Overlap: " + str(difference) + " " + str(start) + " and " + str(end))

overlapObject = {
"ID": current['ID'],
"currentSha1": current['currentSha1'],
"nextSha1": next['currentSha1'],
"originalStart": current['originalStart'].strftime('%Y-%m-%d'),
Expand All @@ -164,7 +217,6 @@ def findOverlap (interpretedSchedule):
# status.append("Gap: " + str(difference) + " " + str(start) + " and " + str(end))

gapObject = {
"ID": current['ID'],
"currentSha1": current['currentSha1'],
"nextSha1": next['currentSha1'],
"originalStart": current['originalStart'].strftime('%Y-%m-%d'),
Expand Down Expand Up @@ -206,63 +258,99 @@ def findOverlap (interpretedSchedule):
return status, overlapAverage, gapAverage, startDifferenceAverage

# get feedversion with scheduled stops, and find overlap and gap averages for each feed
def getFeedService (onestop_id):
def getFeedVersions (onestop_id):
print "===== %s ====="%(onestop_id)
params = (
('feed_onestop_id', onestop_id),
('type', 'FeedVersionInfoStatistics'),
('per_page', 'false')
('per_page', 'false'),
('apikey', APIKEY)
)

reqService = requests.get('https://transit.land/api/v1/feed_version_infos/', params=params)
serviceJS = json.loads(reqService.text)
time.sleep(0.125)
feedVersionInfos = requests.get('https://transit.land/api/v1/feed_version_infos/', params=params).json()['feed_version_infos']

data = []
for feedVersionInfo in feedVersionInfos:
sha1 = feedVersionInfo['feed_version_sha1']
print "sha1:", sha1
params = {'apikey': APIKEY}

time.sleep(0.125)
feedVersion = requests.get('https://transit.land/api/v1/feed_versions/%s'%sha1, params=params).json()
if feedVersion is None:
print "\tno feed_version"
continue
if feedVersionInfo is None:
print "\tno feed_version_info"
continue
if feedVersion.get('tags') and feedVersion['tags'].get('gtfs_data_exchange'):
print "\tgtfs_data_exchange feed; skipping"
continue
data.append([feedVersion, feedVersionInfo])

interpretedSchedule = []
return data

for element in serviceJS['feed_version_infos']:
schedule = interpretSchedule(element)
def processFeed(onestop_id):
# Get feed_versions and feed_version_infos
fvs = getFeedVersions(onestop_id)

# Interpret and adjust the schedules
interpretedSchedules = []
for feedVersion, feedVersionInfo in fvs:
schedule = interpretSchedule(feedVersion, feedVersionInfo)
if schedule:
interpretedSchedule.append(schedule)
interpretedSchedules.append(schedule)
else:
print "\terror processing schedule; skipping"
continue

status, overlapAverage, gapAverage, startDifferenceAverage = findOverlap(interpretedSchedule)
writeToCSV("%s.csv"%onestop_id, status)
with open('%s.json'%onestop_id, 'w') as f:
f.write(json.dumps(status, default=lambda x:str(x)))
# Calculate overlap statistics
overlaps = findOverlap2(interpretedSchedules)
overlapPercentAverage = sum(i['overlapPercent'] for i in overlaps) / float(len(overlaps))
fetchedAtDifferenceAverage = sum(i['fetchedDifference'] for i in overlaps) / float(len(overlaps))

averageOneStopInformation = {
return {
'onestop_id': onestop_id,
'overlapAverage': overlapAverage,
'gapAverage': gapAverage,
'startDifferenceAverage': startDifferenceAverage
'overlapPercentAverage': overlapPercentAverage,
'fetchedAtDifferenceAverage': fetchedAtDifferenceAverage
}

print overlapAverage
print gapAverage
print startDifferenceAverage
return averageOneStopInformation

# status, overlapAverage, gapAverage, startDifferenceAverage = findOverlap(interpretedSchedule)
#
# header = ['ID', 'currentSha1', 'nextSha1', 'originalStart', 'originalEnd', 'updatedStart', 'updatedEnd', 'overlapStart',
# 'overlapEnd', 'overlapDifference', 'gapStart', 'gapEnd', 'gapDifference', 'startDifference']
# writeToCSV("%s.csv"%onestop_id, status, header=header)
# with open('%s.json'%onestop_id, 'w') as f:
# f.write(json.dumps(status, default=lambda x:str(x)))
#
# averageOneStopInformation = {
# 'onestop_id': onestop_id,
# 'overlapAverage': overlapAverage,
# 'gapAverage': gapAverage,
# 'startDifferenceAverage': startDifferenceAverage
# }
#
# print overlapAverage
# print gapAverage
# print startDifferenceAverage
# return averageOneStopInformation


# call function with onestop_id as parameter
def main():
per_page = int(sys.argv[1])
feeds = requests.get('https://transit.land/api/v1/feeds', params={'per_page': per_page}).json()['feeds']
params = {
'per_page': 'false',
'bbox': '-123.321533,36.826875,-120.786438,38.629745',
'apikey': APIKEY
}
time.sleep(0.125)
feeds = requests.get('https://transit.land/api/v1/feeds', params=params).json()['feeds']
allFeedsInformation = []
for feed in feeds:
print feed['onestop_id']
allFeedsInformation.append(getFeedService(feed['onestop_id']))

allFeedsInformation.append(processFeed(feed['onestop_id']))
filename = 'allFeeds4.csv'
headerRow = ['onestop_id', 'overlapAverage', 'gapAverage', 'startDifferenceAverage']

with open(filename, 'w') as f:
writer = csv.DictWriter(f, fieldnames=headerRow)
writer.writeheader()
for elem in allFeedsInformation:
writer.writerow(elem)


writeToCSV(filename, allFeedsInformation)


if __name__ == "__main__":
main()
main()