-
Notifications
You must be signed in to change notification settings - Fork 57
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
WIP: Add prototype TwitterBatchScraper #202
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,107 @@ | ||
{BaseScraper,BaseBearerToken} = require './base' | ||
{TwitterScraper} = require './twitter' | ||
{constants} = require '../constants' | ||
{v_codes} = constants | ||
{Lock} = require '../util' | ||
urlmod = require 'url' | ||
|
||
#================================================================================ | ||
|
||
# Collapse every run of tab / CR / LF / space in `x` into a single space and
# strip such runs from both ends of the string.
#
# x - the string to normalize.
#
# Returns the normalized string ('' when `x` is empty or all whitespace).
ws_normalize = (x) ->
  # The separator pattern is greedy, so the only empty pieces a split can
  # produce are a leading and/or trailing one; filtering them out trims the
  # result.
  words = (piece for piece in x.split(/[\t\r\n ]+/) when piece.length > 0)
  words.join ' '
|
||
#================================================================================ | ||
|
||
exports.TwitterBatchScraper = class TwitterBatchScraper extends TwitterScraper
  # Scraper that answers hunts from a shared tweet cache, which is refilled by
  # a single batched search request.
  #
  # opts.tweet_cache            - cache object; this class uses its `last_id`,
  #                               `fetched_at`, `lock`, `tweets` and `inform`.
  # opts.cache_refresh_interval - minimum cache age, in seconds, before `hunt2`
  #                               triggers a new batch fetch.
  #
  # The whole `opts` object is also forwarded to the TwitterScraper constructor.
  constructor: (opts) ->
    @_tweet_cache = opts.tweet_cache
    @cache_refresh_interval = opts.cache_refresh_interval
    super opts

  # Issue one recent-search request for proof tweets and hand every tweet with
  # a parseable `created_at` to the cache.
  #
  # cb - called as `cb null, rc`, where `rc` is a `v_codes` value. The error
  #      reported by `_get_body_api` is not forwarded; only `rc` is.
  _hunt_batch : (cb) ->
    # Make a query to find all keybase proofs since `last_id` (if present).
    # NOTE(review): confirm against the search API reference that
    # `author_screen_name` is an accepted `expansions` value and that tweet
    # objects carry a top-level `username`; the ingest loop below relies on it.
    query =
      query : "\"Verifying myself\" \"Keybase.io\""
      expansions: "author_screen_name"
      "user.fields": "url,username"
      "tweet.fields": "created_at"
      max_results: 60
    if since_id = @_tweet_cache.last_id
      # Do not fetch tweets that were already cached.
      query.since_id = since_id

    u = urlmod.format {
      host : "api.twitter.com"
      protocol : "https:"
      pathname : "/2/tweets/search/recent"
      query
    }

    await @_get_body_api { url : u }, defer err, rc, json
    @log "| search index #{u} -> #{rc}"
    if rc isnt v_codes.OK then #noop
    else if not json? or (json.length is 0) then rc = v_codes.EMPTY_JSON
    else if not json.data?
      if json.meta?.result_count is 0
        # No results.
        rc = v_codes.OK
      else
        # Unknown JSON structure.
        rc = v_codes.INVALID_JSON
    else
      for tweet in json.data
        {id, username, text} = tweet
        # Parse into a local and keep `tweet` around so the raw value is still
        # available for the log line when the date does not parse.
        created_at = new Date(tweet.created_at)
        unless isFinite(created_at)
          @log "got invalid date in tweet JSON id: #{id}, created_at: #{tweet.created_at}"
          continue
        @log "ingesting tweet: id: #{id}, username: #{username}, text: \"#{text}\""
        @_tweet_cache.inform { id, created_at, username, text }
      rc = v_codes.OK

    @log "| _hunt_batch returning: #{rc}"
    cb null, rc

  # Look up the cached tweet for `username` and check it for the proof text,
  # refreshing the cache first when it is older than `cache_refresh_interval`.
  #
  # username         - key used to look the tweet up in the cache.
  # name             - accepted but not used by this method.
  # proof_text_check - passed through to `find_sig_in_tweet`.
  # cb               - called as `cb err` when the batch refresh fails,
  #                    otherwise as `cb null, out`. `out.rc` is `v_codes.OK`
  #                    (with `remote_id`, `api_url` and `human_url` set) or
  #                    `v_codes.NOT_FOUND`.
  hunt2 : ({username, name, proof_text_check}, cb) ->
    # See if we should refresh cache.
    await @_tweet_cache.lock.acquire defer()
    err = null
    now = Math.floor(Date.now() / 1000)
    if now - @_tweet_cache.fetched_at > @cache_refresh_interval
      # The timestamp is written before the request, so a failed fetch is not
      # retried until the interval has elapsed again.
      @_tweet_cache.fetched_at = now
      await @_hunt_batch defer err, rc
      if not err and rc isnt v_codes.OK
        err = new Error("rc: #{rc}")
    @_tweet_cache.lock.release()
    if err
      @logl "error", "error when hunting batch: #{err.toString()}"
      return cb err

    out = {}
    rc = v_codes.NOT_FOUND
    current_tweet = @_tweet_cache.tweets.get(username)
    if current_tweet and (@find_sig_in_tweet { inside : current_tweet.text, proof_text_check }) is v_codes.OK
      rc = v_codes.OK
      remote_id = current_tweet.id
      api_url = human_url = @_id_to_url username, remote_id
      out = { remote_id, api_url, human_url }
    out.rc = rc
    cb err, out
|
||
#================================================================================ | ||
|
||
exports.TweetCache = class TweetCache
  # In-memory store holding one tweet per username, plus the bookkeeping
  # fields that cache users read and write.
  #
  #   tweets     - Map of username -> { id, created_at, text }
  #   last_id    - highest tweet id passed to `inform` so far, or null
  #   fetched_at - initialised to 0 here; maintained by the cache's users
  #   lock       - Lock instance created here for callers to serialize on
  constructor : () ->
    # NOTE(review): reviewer feedback on this field - keeping a single tweet
    # per user is a bad idea; the cache should keep multiple tweets per
    # Twitter user and pick the correct one for a given hunt.
    @tweets = new Map() # username -> tweet
    @last_id = null
    @fetched_at = 0
    @lock = new Lock()

  # Decide whether tweet id `candidate` is greater than `reference`.
  # Ids are compared as decimal digit strings (length first, then
  # lexicographically) so large ids are not rounded through a float.
  # NOTE(review): assumes ids are non-negative decimal integers without
  # leading zeros - confirm against the API's id format.
  _id_is_newer : (candidate, reference) ->
    return true unless reference?
    a = String candidate
    b = String reference
    if a.length isnt b.length then a.length > b.length else a > b

  # Record a tweet.
  #
  # `last_id` is advanced for every tweet that carries an id, including tweets
  # that are then discarded, so that it reflects everything seen so far.
  # The tweet itself is stored only when the user has no cached tweet yet or
  # the cached one has an older `created_at`.
  inform : ({id, created_at, username, text}) ->
    @last_id = id if id? and @_id_is_newer(id, @last_id)
    current = @tweets.get(username)
    if current and current.created_at >= created_at
      # We already have this tweet or more recent tweet for this user.
      return
    @tweets.set(username, { id, created_at, text })
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Might be problematic during the initial run if there were more than 60 new proofs in the last week.