# CfA_Bib_Keywords.py (116 lines, 97 loc, 4.1 KB)
# -*- coding: utf-8 -*-
import requests
import json
import nltk
from nltk.tokenize import LineTokenizer
import csv
import time
import codecs
import cStringIO
import requests.packages.urllib3
requests.packages.urllib3.disable_warnings()
"""
url = 'https://api.adsabs.harvard.edu/v1/search/query/?q=bibcode:'+urllib.quote(i)+'&fl=bibcode,pubdate,aff,author,year,pub,title,abstract,keyword'
print url
headers = {'Authorization': 'Bearer '+devkey}
content = requests.get(url, headers=headers)
results = content.json()
k = results['response']['docs'][0]
"""
#enter numerical value for your starting date (month and year)
startYear = 2013
startMonth = 1
#enter numerical value for your ending date (month and year)
endYear = 2013
endMonth = 4
#location and file name of your ADS devkey
#BUGFIX: close the key file, and strip the trailing newline -- a bare
#.read() leaves '\n' on the key, which would corrupt the
#'Authorization: Bearer <key>' header built below.
with open('dev_key.txt', 'r') as keyfile:
    devkey = keyfile.read().strip()
#UnicodeWriter from http://docs.python.org/2/library/csv.html#examples
class UnicodeWriter:
    """Write rows of unicode strings to stream *f* in the given encoding.

    Recipe from the Python 2 csv module docs (see comment above): csv in
    Python 2 cannot write unicode directly, so each row is serialised as
    UTF-8 into an in-memory queue, decoded back to unicode, then
    re-encoded with an incremental encoder (default "utf-8-sig", which
    emits a BOM once) before being written to the target stream.
    """
    def __init__(self, f, dialect=csv.excel, encoding="utf-8-sig", **kwds):
        # In-memory byte buffer that the underlying csv.writer targets.
        self.queue = cStringIO.StringIO()
        self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
        self.stream = f
        # Incremental encoder so the BOM (for utf-8-sig) is written only once.
        self.encoder = codecs.getincrementalencoder(encoding)()
    def writerow(self, row):
        """Write one row; every cell in *row* must be a unicode string."""
        self.writer.writerow([s.encode("utf-8") for s in row])
        data = self.queue.getvalue()
        data = data.decode("utf-8")
        data = self.encoder.encode(data)
        self.stream.write(data)
        # Empty the queue for the next row (cStringIO truncate also rewinds).
        self.queue.truncate(0)
    def writerows(self, rows):
        """Write each row in the iterable *rows* via writerow."""
        for row in rows:
            self.writerow(row)
#/end UnicodeWriter
realendMonth = endMonth+1
realendYear = endYear+1
daterange = ' - '+str(startMonth)+' '+str(startYear)+' to '+str(endMonth)+' '+str(endYear)
text = codecs.open('keywords'+daterange+'.txt','w')
for y in range(startYear,realendYear):
for m in range(startMonth,realendMonth): # first number is starting month, last number needs to be one more than final month
#url = 'http://labs.adsabs.harvard.edu/adsabs/api/search/?q=bibgroup:cfa,pubdate:'+str(y)+'-'+str(m)+'&rows=200&fl=keyword&fmt=json&dev_key='+str(devkey)
url = 'https://api.adsabs.harvard.edu/v1/search/query/?q=pubdate:'+str(y)+'-'+str(m)+'&rows=200&fl=bibcode,bibgroup:cfapubdate,aff,author,year,pub,title,abstract,keyword'
print url
headers = {'Authorization': 'Bearer '+devkey}
content = requests.get(url, headers=headers)
results = content.json()
k = results['response']['docs']
print results
for i in k:
try:
mywords=i['keyword']
myList = list(set(mywords)) #magic line that removes exact duplicates from an articles keywords
cleanList = u' '.join(myList)
unicode_list = (str(y)+'|'+str(m)+'|')+u'**'.join(myList)
clean_unicode = '\n'+unicode_list.replace('**','\n'+str(y)+'|'+str(m)+'|')
#print clean_unicode
text.write(clean_unicode.encode('utf-8'))
except KeyError:
pass
time.sleep(1)
text.close()
print 'finished getting keywords'
text = open('keywords'+daterange+'.txt','r').read()
freqlist = open('freqlist'+daterange+'.txt','wb')
lowertext = text.lower()
lines = LineTokenizer(blanklines='discard').tokenize(lowertext)
freq = nltk.FreqDist(lines)
#print freq
writer = csv.writer(freqlist, delimiter='|', lineterminator='\n', quotechar='"', quoting=csv.QUOTE_MINIMAL)
writer.writerows(freq.items())
freqlist.close()
print 'finished getting frequency distribution'
# Reformat the '|'-delimited frequency list into a 4-column Excel CSV.
fileout = codecs.open('frequency'+daterange+'.csv', 'wb')
csv_out = csv.writer(fileout, lineterminator='\n', delimiter=',')
# NOTE(review): the header row goes through UnicodeWriter (utf-8-sig, so
# Excel sees a BOM) while the data rows below go through the plain
# csv_out writer on the same stream -- presumably intentional, but
# confirm the mixed writers render correctly in the target tool.
wr = UnicodeWriter(fileout,lineterminator='\n', delimiter=',', dialect='excel',quoting=csv.QUOTE_ALL)
wr.writerow(["Year","Month","Keyword","Frequency"])
f = codecs.open('freqlist'+daterange+'.txt')
for line in f:
    # each line is 'year|month|keyword|count'; strip newlines and quotes
    vals = line.split('|')
    words = [v.replace('\n', '') for v in vals]
    words1 = [v.replace('"', '') for v in words]
    #print words1
    # assumes exactly 4 fields -- a keyword containing '|' would shift
    # columns or raise IndexError; TODO confirm upstream data never does
    csv_out.writerow((words1[0], words1[1], words1[2], words1[3]))
f.close()
fileout.close()
print 'finished writing csv file'