From 15a94f5a323b34f3fd9843772cb9cd5d1a30cc5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Zochniak?= Date: Fri, 19 Jul 2024 13:23:58 +0200 Subject: [PATCH 1/2] Add prototype TwitterBatchScraper --- Makefile | 1 + lib/main.js | 2 +- lib/scrapers/twitter_batch.js | 229 ++++++++++++++++++++++++++++++++ src/main.iced | 1 + src/scrapers/twitter_batch.iced | 102 ++++++++++++++ 5 files changed, 334 insertions(+), 1 deletion(-) create mode 100644 lib/scrapers/twitter_batch.js create mode 100644 src/scrapers/twitter_batch.iced diff --git a/Makefile b/Makefile index 0dc5a43c..0a1d78e8 100644 --- a/Makefile +++ b/Makefile @@ -42,6 +42,7 @@ $(BUILD_STAMP): \ lib/scrapers/hackernews.js \ lib/scrapers/reddit.js \ lib/scrapers/twitter.js \ + lib/scrapers/twitter_batch.js \ lib/scrapers/generic_social.js \ lib/team.js \ lib/team_hidden.js \ diff --git a/lib/main.js b/lib/main.js index c735a2a6..bf829bf6 100644 --- a/lib/main.js +++ b/lib/main.js @@ -2,7 +2,7 @@ (function() { var k, m, mods, v, _i, _len; - mods = [require('./web_service'), require('./b64extract'), require('./util'), require('./alloc'), require('./alloc3'), require('./constants'), require('./base'), require('./track'), require('./auth'), require('./update_passphrase_hash'), require('./update_settings'), require('./device'), require('./revoke'), require('./cryptocurrency'), require('./per_user_key'), require('./wallet'), require('./subkey'), require('./sibkey'), require('./eldest'), require('./pgp_update'), require('./announcement'), require('./scrapers/twitter'), require('./scrapers/facebook'), require('./scrapers/base'), require('./scrapers/github'), require('./scrapers/reddit'), require('./scrapers/generic_web_site'), require('./scrapers/dns'), require('./scrapers/coinbase'), require('./scrapers/hackernews'), require('./scrapers/generic_social'), require('./errors'), require('./wot')]; + mods = [require('./web_service'), require('./b64extract'), require('./util'), require('./alloc'), require('./alloc3'), require('./constants'), require('./base'), require('./track'), require('./auth'), require('./update_passphrase_hash'), require('./update_settings'), require('./device'), require('./revoke'), require('./cryptocurrency'), require('./per_user_key'), require('./wallet'), require('./subkey'), require('./sibkey'), require('./eldest'), require('./pgp_update'), require('./announcement'), require('./scrapers/twitter'), require('./scrapers/twitter_batch'), require('./scrapers/facebook'), require('./scrapers/base'), require('./scrapers/github'), require('./scrapers/reddit'), require('./scrapers/generic_web_site'), require('./scrapers/dns'), require('./scrapers/coinbase'), require('./scrapers/hackernews'), require('./scrapers/generic_social'), require('./errors'), require('./wot')]; for (_i = 0, _len = mods.length; _i < _len; _i++) { m = mods[_i]; diff --git a/lib/scrapers/twitter_batch.js b/lib/scrapers/twitter_batch.js new file mode 100644 index 00000000..2e432d83 --- /dev/null +++ b/lib/scrapers/twitter_batch.js @@ -0,0 +1,229 @@ +// Generated by IcedCoffeeScript 108.0.11 +(function() { + var BaseBearerToken, BaseScraper, Lock, TweetCache, TwitterBatchScraper, TwitterScraper, constants, decode_sig, iced, make_ids, schema, sncmp, urlmod, v_codes, ws_normalize, __iced_k, __iced_k_noop, _ref, + __hasProp = {}.hasOwnProperty, + __extends = function(child, parent) { for (var key in parent) { if (__hasProp.call(parent, key)) child[key] = parent[key]; } function ctor() { this.constructor = child; } ctor.prototype = parent.prototype; 
child.prototype = new ctor(); child.__super__ = parent.prototype; return child; }; + + iced = require('iced-runtime'); + __iced_k = __iced_k_noop = function() {}; + + _ref = require('./base'), sncmp = _ref.sncmp, BaseScraper = _ref.BaseScraper, BaseBearerToken = _ref.BaseBearerToken; + + TwitterScraper = require('./twitter').TwitterScraper; + + make_ids = require('../base').make_ids; + + constants = require('../constants').constants; + + v_codes = constants.v_codes; + + decode_sig = require('kbpgp').ukm.decode_sig; + + Lock = require('../util').Lock; + + urlmod = require('url'); + + schema = require('../schema3'); + + ws_normalize = function(x) { + var v; + v = x.split(/[\t\r\n ]+/); + if (v.length && v[0].length === 0) { + v.shift(); + } + if (v.length && v.slice(-1)[0].length === 0) { + v.pop(); + } + return v.join(' '); + }; + + exports.TwitterBatchScraper = TwitterBatchScraper = (function(_super) { + __extends(TwitterBatchScraper, _super); + + function TwitterBatchScraper(opts) { + this._tweet_cache = opts.tweet_cache; + this.cache_refresh_interval = opts.cache_refresh_interval; + TwitterBatchScraper.__super__.constructor.call(this, opts); + } + + TwitterBatchScraper.prototype._hunt_batch = function(cb) { + var created_at, err, i, id, json, query, rc, since_id, text, u, username, ___iced_passed_deferral, __iced_deferrals, __iced_k; + __iced_k = __iced_k_noop; + ___iced_passed_deferral = iced.findDeferral(arguments); + query = { + query: "\"Verifying myself\" \"Keybase.io\"", + expansions: "author_screen_name", + "user.fields": "url,username", + "tweet.fields": "created_at", + max_results: 60 + }; + if (since_id = this._tweet_cache.last_id) { + query.since_id = since_id; + } + u = urlmod.format({ + host: "api.twitter.com", + protocol: "https:", + pathname: "/2/tweets/search/recent", + query: query + }); + (function(_this) { + return (function(__iced_k) { + __iced_deferrals = new iced.Deferrals(__iced_k, { + parent: ___iced_passed_deferral, + filename: "/Users/michal/SourceCode/keybase/go/src/github.com/keybase/server_test_progs/proofs/src/scrapers/twitter_batch.iced", + funcname: "TwitterBatchScraper._hunt_batch" + }); + _this._get_body_api({ + url: u + }, __iced_deferrals.defer({ + assign_fn: (function() { + return function() { + err = arguments[0]; + rc = arguments[1]; + return json = arguments[2]; + }; + })(), + lineno: 44 + })); + __iced_deferrals._fulfill(); + }); + })(this)((function(_this) { + return function() { + var _i, _len, _ref1, _ref2; + _this.log("| search index " + u + " -> " + rc); + if (rc !== v_codes.OK) { + + } else if ((typeof json === "undefined" || json === null) || (json.length === 0)) { + rc = v_codes.EMPTY_JSON; + } else if (json.data == null) { + rc = v_codes.INVALID_JSON; + } else { + console.log(json.data); + _ref1 = json.data; + for (i = _i = 0, _len = _ref1.length; _i < _len; i = ++_i) { + _ref2 = _ref1[i], id = _ref2.id, created_at = _ref2.created_at, username = _ref2.username, text = _ref2.text; + created_at = new Date(created_at); + if (!isFinite(created_at)) { + _this.log("got invalid date in tweet JSON id: " + id + ", created_at: " + tweet.created_at); + continue; + } + _this.log("ingesting tweet: id: " + id + ", username: " + username + ", text: \"" + text + "\""); + _this._tweet_cache.inform({ + id: id, + created_at: created_at, + username: username, + text: text + }); + } + } + return cb(null, v_codes.OK); + }; + })(this)); + }; + + TwitterBatchScraper.prototype.hunt2 = function(_arg, cb) { + var api_url, current_tweet, err, human_url, name, now, 
out, proof_text_check, rc, remote_id, username, ___iced_passed_deferral, __iced_deferrals, __iced_k; + __iced_k = __iced_k_noop; + ___iced_passed_deferral = iced.findDeferral(arguments); + username = _arg.username, name = _arg.name, proof_text_check = _arg.proof_text_check; + (function(_this) { + return (function(__iced_k) { + __iced_deferrals = new iced.Deferrals(__iced_k, { + parent: ___iced_passed_deferral, + filename: "/Users/michal/SourceCode/keybase/go/src/github.com/keybase/server_test_progs/proofs/src/scrapers/twitter_batch.iced", + funcname: "TwitterBatchScraper.hunt2" + }); + _this._tweet_cache.lock.acquire(__iced_deferrals.defer({ + lineno: 63 + })); + __iced_deferrals._fulfill(); + }); + })(this)((function(_this) { + return function() { + err = null; + now = Math.floor(Date.now() / 1000); + (function(__iced_k) { + if (now - _this._tweet_cache.fetched_at > _this.cache_refresh_interval) { + _this._tweet_cache.fetched_at = now; + (function(__iced_k) { + __iced_deferrals = new iced.Deferrals(__iced_k, { + parent: ___iced_passed_deferral, + filename: "/Users/michal/SourceCode/keybase/go/src/github.com/keybase/server_test_progs/proofs/src/scrapers/twitter_batch.iced", + funcname: "TwitterBatchScraper.hunt2" + }); + _this._hunt_batch(__iced_deferrals.defer({ + assign_fn: (function() { + return function() { + err = arguments[0]; + return rc = arguments[1]; + }; + })(), + lineno: 68 + })); + __iced_deferrals._fulfill(); + })(function() { + return __iced_k(!err && rc !== v_codes.OK ? err = new Error("rc: " + rc) : void 0); + }); + } else { + return __iced_k(); + } + })(function() { + _this._tweet_cache.lock.release(); + if (err) { + _this.logl("error", "error when hunting batch: " + (err.toString())); + return cb(err); + } + out = {}; + rc = v_codes.NOT_FOUND; + current_tweet = _this._tweet_cache.tweets.get(username); + if (current_tweet && (_this.find_sig_in_tweet({ + inside: current_tweet.text, + proof_text_check: proof_text_check + })) === v_codes.OK) { + rc = v_codes.OK; + remote_id = current_tweet.id; + api_url = human_url = _this._id_to_url(username, remote_id); + out = { + remote_id: remote_id, + api_url: api_url, + human_url: human_url + }; + } + out.rc = rc; + return cb(err, out); + }); + }; + })(this)); + }; + + return TwitterBatchScraper; + + })(TwitterScraper); + + exports.TweetCache = TweetCache = (function() { + function TweetCache() { + this.tweets = new Map(); + this.last_id = null; + this.fetched_at = 0; + this.lock = new Lock(); + } + + TweetCache.prototype.inform = function(_arg) { + var created_at, current, id, text, username; + id = _arg.id, created_at = _arg.created_at, username = _arg.username, text = _arg.text; + current = this.tweets.get(username); + if (current && current.created_at >= created_at) { + return; + } + return this.tweets.set(username, { + id: id, + created_at: created_at, + text: text + }); + }; + + return TweetCache; + + })(); + +}).call(this); diff --git a/src/main.iced b/src/main.iced index 35d2eff5..79690251 100644 --- a/src/main.iced +++ b/src/main.iced @@ -22,6 +22,7 @@ mods = [ require('./pgp_update') require('./announcement') require('./scrapers/twitter') + require('./scrapers/twitter_batch') require('./scrapers/facebook') require('./scrapers/base') require('./scrapers/github') diff --git a/src/scrapers/twitter_batch.iced b/src/scrapers/twitter_batch.iced new file mode 100644 index 00000000..f5fe9760 --- /dev/null +++ b/src/scrapers/twitter_batch.iced @@ -0,0 +1,102 @@ +{sncmp,BaseScraper,BaseBearerToken} = require './base' +{TwitterScraper} 
= require './twitter' +{make_ids} = require '../base' +{constants} = require '../constants' +{v_codes} = constants +{decode_sig} = require('kbpgp').ukm +{Lock} = require '../util' +urlmod = require 'url' +schema = require '../schema3' + +#================================================================================ + +ws_normalize = (x) -> + v = x.split(/[\t\r\n ]+/) + v.shift() if v.length and v[0].length is 0 + v.pop() if v.length and v[-1...][0].length is 0 + v.join ' ' + +#================================================================================ + +exports.TwitterBatchScraper = class TwitterBatchScraper extends TwitterScraper + constructor: (opts) -> + @_tweet_cache = opts.tweet_cache + @cache_refresh_interval = opts.cache_refresh_interval + super opts + + _hunt_batch : (cb) -> + query = + query : "\"Verifying myself\" \"Keybase.io\"" + expansions: "author_screen_name" + "user.fields": "url,username" + "tweet.fields": "created_at" + max_results: 60 + if since_id = @_tweet_cache.last_id + # Do not fetch tweets that were already cached. + query.since_id = since_id + + u = urlmod.format { + host : "api.twitter.com" + protocol : "https:" + pathname : "/2/tweets/search/recent" + query + } + + await @_get_body_api { url : u }, defer err, rc, json + @log "| search index #{u} -> #{rc}" + if rc isnt v_codes.OK then #noop + else if not json? or (json.length is 0) then rc = v_codes.EMPTY_JSON + else if not json.data? then rc = v_codes.INVALID_JSON + else + console.log json.data + for {id, created_at, username, text}, i in json.data + created_at = new Date(created_at) + unless isFinite(created_at) + @log "got invalid date in tweet JSON id: #{id}, created_at: #{tweet.created_at}" + continue + @log "ingesting tweet: id: #{id}, username: #{username}, text: \"#{text}\"" + @_tweet_cache.inform { id, created_at, username, text } + + cb null, v_codes.OK + + hunt2 : ({username, name, proof_text_check}, cb) -> + # See if we should refresh cache. + await @_tweet_cache.lock.acquire defer() + err = null + now = Math.floor(Date.now() / 1000) + if now - @_tweet_cache.fetched_at > @cache_refresh_interval + @_tweet_cache.fetched_at = now + await @_hunt_batch defer err, rc + if not err and rc isnt v_codes.OK + err = new Error("rc: #{rc}") + @_tweet_cache.lock.release() + if err + @logl "error", "error when hunting batch: #{err.toString()}" + return cb err + + out = {} + rc = v_codes.NOT_FOUND + current_tweet = @_tweet_cache.tweets.get(username) + if current_tweet and (@find_sig_in_tweet { inside : current_tweet.text, proof_text_check }) is v_codes.OK + rc = v_codes.OK + remote_id = current_tweet.id + api_url = human_url = @_id_to_url username, remote_id + out = { remote_id, api_url, human_url } + out.rc = rc + cb err, out + +#================================================================================ + +exports.TweetCache = class TweetCache + constructor : () -> + @tweets = new Map() # username -> tweet + @last_id = null + @fetched_at = 0 + @lock = new Lock() + + inform : ({id, created_at, username, text}) -> + current = @tweets.get(username) + if current and current.created_at >= created_at + # We already have this tweet or more recent tweet for this user. 
+ return + @tweets.set(username, { id, created_at, text }) From dd6b5264a94902f66c8863910b8ae7137e68bc2e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Zochniak?= Date: Tue, 23 Jul 2024 11:47:52 +0200 Subject: [PATCH 2/2] Tweaks --- lib/scrapers/twitter_batch.js | 35 ++++++++++++++++----------------- src/scrapers/twitter_batch.iced | 19 +++++++++++------- 2 files changed, 29 insertions(+), 25 deletions(-) diff --git a/lib/scrapers/twitter_batch.js b/lib/scrapers/twitter_batch.js index 2e432d83..b62e9b57 100644 --- a/lib/scrapers/twitter_batch.js +++ b/lib/scrapers/twitter_batch.js @@ -1,30 +1,24 @@ // Generated by IcedCoffeeScript 108.0.11 (function() { - var BaseBearerToken, BaseScraper, Lock, TweetCache, TwitterBatchScraper, TwitterScraper, constants, decode_sig, iced, make_ids, schema, sncmp, urlmod, v_codes, ws_normalize, __iced_k, __iced_k_noop, _ref, + var BaseBearerToken, BaseScraper, Lock, TweetCache, TwitterBatchScraper, TwitterScraper, constants, iced, urlmod, v_codes, ws_normalize, __iced_k, __iced_k_noop, _ref, __hasProp = {}.hasOwnProperty, __extends = function(child, parent) { for (var key in parent) { if (__hasProp.call(parent, key)) child[key] = parent[key]; } function ctor() { this.constructor = child; } ctor.prototype = parent.prototype; child.prototype = new ctor(); child.__super__ = parent.prototype; return child; }; iced = require('iced-runtime'); __iced_k = __iced_k_noop = function() {}; - _ref = require('./base'), sncmp = _ref.sncmp, BaseScraper = _ref.BaseScraper, BaseBearerToken = _ref.BaseBearerToken; + _ref = require('./base'), BaseScraper = _ref.BaseScraper, BaseBearerToken = _ref.BaseBearerToken; TwitterScraper = require('./twitter').TwitterScraper; - make_ids = require('../base').make_ids; - constants = require('../constants').constants; v_codes = constants.v_codes; - decode_sig = require('kbpgp').ukm.decode_sig; - Lock = require('../util').Lock; urlmod = require('url'); - schema = require('../schema3'); - ws_normalize = function(x) { var v; v = x.split(/[\t\r\n ]+/); @@ -83,25 +77,28 @@ return json = arguments[2]; }; })(), - lineno: 44 + lineno: 42 })); __iced_deferrals._fulfill(); }); })(this)((function(_this) { return function() { - var _i, _len, _ref1, _ref2; + var _i, _len, _ref1, _ref2, _ref3; _this.log("| search index " + u + " -> " + rc); if (rc !== v_codes.OK) { } else if ((typeof json === "undefined" || json === null) || (json.length === 0)) { rc = v_codes.EMPTY_JSON; } else if (json.data == null) { - rc = v_codes.INVALID_JSON; + if (((_ref1 = json.meta) != null ? 
_ref1.result_count : void 0) === 0) { + rc = v_codes.OK; + } else { + rc = v_codes.INVALID_JSON; + } } else { - console.log(json.data); - _ref1 = json.data; - for (i = _i = 0, _len = _ref1.length; _i < _len; i = ++_i) { - _ref2 = _ref1[i], id = _ref2.id, created_at = _ref2.created_at, username = _ref2.username, text = _ref2.text; + _ref2 = json.data; + for (i = _i = 0, _len = _ref2.length; _i < _len; i = ++_i) { + _ref3 = _ref2[i], id = _ref3.id, created_at = _ref3.created_at, username = _ref3.username, text = _ref3.text; created_at = new Date(created_at); if (!isFinite(created_at)) { _this.log("got invalid date in tweet JSON id: " + id + ", created_at: " + tweet.created_at); @@ -115,8 +112,10 @@ text: text }); } + rc = v_codes.OK; } - return cb(null, v_codes.OK); + _this.log("| _hunt_batch returning: " + rc); + return cb(null, rc); }; })(this)); }; @@ -134,7 +133,7 @@ funcname: "TwitterBatchScraper.hunt2" }); _this._tweet_cache.lock.acquire(__iced_deferrals.defer({ - lineno: 63 + lineno: 68 })); __iced_deferrals._fulfill(); }); @@ -158,7 +157,7 @@ return rc = arguments[1]; }; })(), - lineno: 68 + lineno: 73 })); __iced_deferrals._fulfill(); })(function() { diff --git a/src/scrapers/twitter_batch.iced b/src/scrapers/twitter_batch.iced index f5fe9760..d42f9bab 100644 --- a/src/scrapers/twitter_batch.iced +++ b/src/scrapers/twitter_batch.iced @@ -1,12 +1,9 @@ -{sncmp,BaseScraper,BaseBearerToken} = require './base' +{BaseScraper,BaseBearerToken} = require './base' {TwitterScraper} = require './twitter' -{make_ids} = require '../base' {constants} = require '../constants' {v_codes} = constants -{decode_sig} = require('kbpgp').ukm {Lock} = require '../util' urlmod = require 'url' -schema = require '../schema3' #================================================================================ @@ -25,6 +22,7 @@ exports.TwitterBatchScraper = class TwitterBatchScraper extends TwitterScraper super opts _hunt_batch : (cb) -> + # Make a query to find all keybase proofs since `last_id` (if present). query = query : "\"Verifying myself\" \"Keybase.io\"" expansions: "author_screen_name" @@ -46,9 +44,14 @@ exports.TwitterBatchScraper = class TwitterBatchScraper extends TwitterScraper @log "| search index #{u} -> #{rc}" if rc isnt v_codes.OK then #noop else if not json? or (json.length is 0) then rc = v_codes.EMPTY_JSON - else if not json.data? then rc = v_codes.INVALID_JSON + else if not json.data? + if json.meta?.result_count is 0 + # No results. + rc = v_codes.OK + else + # Unknown JSON structure. + rc = v_codes.INVALID_JSON else - console.log json.data for {id, created_at, username, text}, i in json.data created_at = new Date(created_at) unless isFinite(created_at) @@ -56,8 +59,10 @@ exports.TwitterBatchScraper = class TwitterBatchScraper extends TwitterScraper continue @log "ingesting tweet: id: #{id}, username: #{username}, text: \"#{text}\"" @_tweet_cache.inform { id, created_at, username, text } + rc = v_codes.OK - cb null, v_codes.OK + @log "| _hunt_batch returning: #{rc}" + cb null, rc hunt2 : ({username, name, proof_text_check}, cb) -> # See if we should refresh cache.
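
A minimal usage sketch of the two classes this series adds, written in the same IcedCoffeeScript style as src/scrapers/twitter_batch.iced. It assumes one process-wide TweetCache shared by every TwitterBatchScraper instance, which is what the tweet_cache constructor option and the cache's internal Lock suggest; the require path, the cache_refresh_interval value of 300 seconds, and any further constructor options inherited from TwitterScraper (bearer token, logging hooks) are illustrative assumptions, not taken from this patch.

    {TwitterBatchScraper, TweetCache} = require './scrapers/twitter_batch'

    # One shared cache: _hunt_batch refreshes it at most once per
    # cache_refresh_interval, and hunt2 then answers from the cached tweets.
    tweet_cache = new TweetCache()

    check_proof = ({username, name, proof_text_check}, cb) ->
      scraper = new TwitterBatchScraper {
        tweet_cache                     # shared across scraper instances (assumed usage)
        cache_refresh_interval : 300    # seconds; illustrative value, not from the patch
        # ...plus whatever opts TwitterScraper already expects (token, logger, ...)
      }
      await scraper.hunt2 { username, name, proof_text_check }, defer err, out
      # On a hit, out.rc is v_codes.OK and out carries {remote_id, api_url, human_url};
      # otherwise out.rc is v_codes.NOT_FOUND for a username with no cached proof tweet.
      cb err, out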