Skip to content

Commit

Permalink
Download all core author papers in JSON
Browse files Browse the repository at this point in the history
  • Loading branch information
David Mora authored and David Mora committed Dec 4, 2017
1 parent 0cb23be commit 3fa6a17
Show file tree
Hide file tree
Showing 8 changed files with 142 additions and 10,156 deletions.
41 changes: 25 additions & 16 deletions data-aggregation/1-download-all-authors-papers.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import json
import csv
import time
import grequests
from pprint import pprint
from threading import Thread, RLock
Expand All @@ -15,12 +16,14 @@ def getAuthorJSON(response, *args, **kwargs):
mutex.acquire()
try:
authorJSONs.append(response.content)
print (json.loads(response.content)["name"])
print "DONE: " + response.url
finally:
mutex.release()

# Error callback passed to grequests.map(..., exception_handler=exception_handler).
# It receives the request object and the exception, and only prints diagnostics;
# nothing is returned or re-raised here.
# NOTE(review): this text looks like a rendered diff whose +/- markers were
# stripped. The two near-identical "request failed" prints below are presumably
# the removed and the added version of a single line -- confirm against the
# real file before relying on both being executed.
def exception_handler(request, exception):
print "ERROR: request failed: " + request.url
print "ERROR: request failed: " + request.url + "<<<<<<<"
# Also print the exception itself, formatted into the message.
print ("Request error: {0}".format(exception))


with open('core-authors-with-s2ids-FINAL.csv', 'r') as csvfile:
Expand All @@ -30,7 +33,7 @@ def exception_handler(request, exception):
s2Id = int(authorS2Pair[1])
authorToS2IdMap [name] = s2Id
authorToAutherURLMap[name] = "https://api.semanticscholar.org/v1/author/" + str(s2Id)
print name, s2Id
# print name, s2Id


unsentAuthorRequests = []
Expand All @@ -43,30 +46,36 @@ def exception_handler(request, exception):
# pprint(authorJSONs)

# PART 2: COMPILE ALL PAPERS, INDEXED BY PAPER NAME
paperJSONDict = {}
# Response hook attached to every paper request (hooks={'response': getPaperJSON}).
# All work happens between mutex.acquire() and mutex.release(); the release is
# in `finally`, so it runs even when the body raises.
# NOTE(review): +/- diff markers appear to have been stripped. The first two
# statements in the try body (append to authorJSONs, print "DONE: ...") look
# like the removed version and the following three like the added version --
# confirm against the real file.
def getPaperJSON(response, *args, **kwargs):
mutex.acquire()
try:
authorJSONs.append(response.content)
print "DONE: " + response.url
# Parse the response body as JSON and store the result in paperJSONDict under
# its "title" value; a later paper with the same title overwrites the earlier one.
paper = json.loads(response.content)
paperJSONDict[paper["title"]] = paper
print "PAPER " + " DONE: " + paper["title"]
finally:
mutex.release()

unsentPaperRequests = []
def getPaperURL(url):
    """Return the Semantic Scholar v1 API URL for a paper page URL.

    Everything after the first "paper/" in *url* is appended to the API
    prefix "https://api.semanticscholar.org/v1/paper/".

    Raises:
        ValueError: if *url* contains no "paper/" segment. The previous
            find()-based slice silently started at index 5 in that case
            (find() returns -1) and produced a malformed API URL.
    """
    # partition() splits on the first occurrence, the same position the
    # original find() call located.
    _, marker, paper_id = url.partition("paper/")
    if not marker:
        raise ValueError("no 'paper/' segment in URL: {0!r}".format(url))
    return "https://api.semanticscholar.org/v1/paper/" + paper_id



# For every author JSON string collected in authorJSONs: parse it, build one
# grequests.get per entry of author["papers"] (with getPaperJSON as response
# hook), and send the batch through grequests.map.
# NOTE(review): this span looks like a rendered diff with +/- markers and
# indentation stripped, so old and new request-building lines are interleaved
# and the loop nesting cannot be read off the text -- confirm against the
# real file.
for authorJSON in authorJSONs:
unsentPaperRequests = []
author = json.loads(authorJSON)
for paper in author["papers"]:
print paper["title"]
unsentPaperRequests.append(grequests.get(paper["url"], hooks={'response': getPaperJSON}))

grequests.map(unsentPaperRequests, exception_handler=exception_handler) # actually make requests

# Same request as above, but aimed at the URL rewritten by getPaperURL.
unsentPaperRequests.append(grequests.get(getPaperURL(paper["url"]), hooks={'response': getPaperJSON}))
grequests.map(unsentPaperRequests, exception_handler=exception_handler) # actually make requests
print author["name"] + " scheduled."
print "MAIN THREAD SLEEPING FOR 7 SEC before spawning new author requests..."
# Pause 7 seconds before the next author's batch; presumably to stay under an
# API rate limit -- confirm.
time.sleep(7)


# sampleAuthorURL = "https://api.semanticscholar.org/v1/author/1741101"
# samplePaperURL = "https://api.semanticscholar.org/v1/paper/0796f6cd7f0403a854d67d525e9b32af3b277331"
print "FINISHED grequests.map ..................."

# dict: author --> id
# for each author
# for each paper
# get paper JSON
# save to a giant JSON of all papers
# Persist every collected paper (paperJSONDict, keyed by title) as a single
# JSON document. The `with` block closes the file even if json.dumps() or
# write() raises, which the previous explicit open()/close() pair did not
# guarantee.
with open("papers-by-title.json", "w") as paperJSONFile:
    paperJSONFile.write(json.dumps(paperJSONDict))
# A parenthesised single-string print emits the same text under the Python 2
# print statement used by the rest of this script.
print("COMPLETE: json saved to disk")
91 changes: 91 additions & 0 deletions data-aggregation/add-author-or-link-array-to-existing-json.py

Large diffs are not rendered by default.

2 changes: 0 additions & 2 deletions data-aggregation/core-authors-with-s2ids-FINAL.csv

This file was deleted.

20 changes: 19 additions & 1 deletion data-aggregation/core-authors-with-s2ids.csv
Original file line number Diff line number Diff line change
@@ -1 +1,19 @@

Angela Lee Duckworth,5720574
Barbara L. Fredrickson,1892780
Carol S. Dweck,2067522
Corey Lee M. Keyes,4171705
Dacher J Keltner,3990536
Daniel J. Siegel,24981109
Daniel T. Gilbert,3070698
Emma M Seppala,6140201
Frederic Luskin,4426823
Jennifer L. Aaker,6156620
Jon Kabat-Zinn,6262729
Kelly M McGonigal,5668084
Kristin D. Neff,4858703
Martin E. P. Seligman,3084765
Mihaly Csikszentmihalyi,3141129
Richard J. Davidson,1716527
Robert A. Emmons,4580744
Sonja Lyubomirsky,6986158
Zindel V. Segal,9031716
137 changes: 0 additions & 137 deletions data-aggregation/license.txt

This file was deleted.

Empty file.
Loading

0 comments on commit 3fa6a17

Please sign in to comment.