-
Notifications
You must be signed in to change notification settings - Fork 11
/
scrape_tweets.py
63 lines (49 loc) · 1.55 KB
/
scrape_tweets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
"""
Get live tweets and save to disk
"""
import os
import json
import datetime
from twitter import Api
# Directory where raw tweet batch files are written.
RAW_TWEET_DIR = 'raw_tweet'
# maybe create raw_tweet dir
if not os.path.exists(RAW_TWEET_DIR):
    os.makedirs(RAW_TWEET_DIR)
# retrieve credentials
# credentials.json must hold the four Twitter app keys used below:
# CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN, ACCESS_TOKEN_SECRET.
with open('credentials.json') as j:
    cred = json.load(j)
# Module-level python-twitter client, shared by scrape().
api = Api(cred['CONSUMER_KEY'], cred['CONSUMER_SECRET'],
          cred['ACCESS_TOKEN'], cred['ACCESS_TOKEN_SECRET'])
def datetime_filename(prefix='output_'):
    """
    Build an output filename: *prefix* followed by the current UTC
    timestamp (YYYYmmddHHMMSS) and the literal suffix 'utc.txt'.
    """
    stamp = datetime.datetime.utcnow().strftime('%Y%m%d%H%M%S')
    return '{}{}utc.txt'.format(prefix, stamp)
def scrape(tweets_per_file=100000):
    """
    Scrape live tweets from the Twitter sample stream and save English
    ones to disk, one tweet per line, UTF-8 encoded.

    GetStreamSample() gets ~1,000 English tweets per min, or 1.5
    million/day; for easier reference, output rotates into a fresh
    timestamped file every *tweets_per_file* tweets.

    Returns the total number of tweets written. Runs until the stream
    ends or the user interrupts with Ctrl-C (handled cleanly).
    """
    def _new_file():
        # Binary mode + explicit encode below keeps the on-disk bytes
        # identical under both Python 2 and Python 3.
        return open(datetime_filename(prefix=RAW_TWEET_DIR + '/en_tweet_'), 'wb')

    f = _new_file()
    tweet_count = 0
    try:
        for msg in api.GetStreamSample():
            # .get('lang'): stream control messages (e.g. deletes) may
            # carry 'text' without 'lang'; the original raised KeyError.
            if 'text' in msg and msg.get('lang') == u'en':
                # Flatten embedded newlines BEFORE encoding; the original
                # called .replace on the encoded bytes with a str pattern,
                # which is a TypeError on Python 3.
                text = msg['text'].replace(u'\n', u' ')
                f.write(text.encode('utf-8') + b'\n')
                tweet_count += 1
                if tweet_count % tweets_per_file == 0:  # start new batch
                    f.close()
                    f = _new_file()
    except KeyboardInterrupt:
        print('Twitter stream collection aborted')
    finally:
        f.close()
    return tweet_count
if __name__ == '__main__':
    # Collect until interrupted, then report the total. Parenthesized
    # print works under both Python 2 and 3; the bare print statement
    # was a SyntaxError on Python 3.
    tweet_count = scrape()
    print('A total of {} tweets collected'.format(tweet_count))