diff --git a/.gitignore b/.gitignore index 2aa3c05..88f936b 100644 --- a/.gitignore +++ b/.gitignore @@ -14,3 +14,5 @@ config.yaml .python-version .tox/ *.swp +.vscode +.tool-versions diff --git a/docker-compose.yml b/docker-compose.yml index 3f3ad15..ec4c06d 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -20,6 +20,8 @@ services: - .:/app - results:/results entrypoint: ["celery", "--app", "quarry.web.worker", "worker", ] + extra_hosts: + - "host.docker.internal:host-gateway" depends_on: - "db" - "redis" diff --git a/quarry/default_config.yaml b/quarry/default_config.yaml index e74d3ac..90cc3b0 100644 --- a/quarry/default_config.yaml +++ b/quarry/default_config.yaml @@ -15,12 +15,22 @@ task_acks_late: True # Tasks are idempotent! task_track_started: True worker_prefetch_multiplier: 1 # Tasks can run for a long time +# Run queries against the live Wikimedia replica databases. This requires a +# Toolforge account, so that you can retrieve your credentials from +# ~/replica.my.cnf (and log into Toolforge to establish the SOCKS5 proxy). These +# lines should remain commented out if you're not using a SOCKS5 proxy. +# REPLICA_SOCKS5_PROXY_HOST is the IP address of the host running the Docker container. On Windows or macOS this +# might be `host.docker.internal`. +# REPLICA_SOCKS5_PROXY_HOST: '172.17.0.1' +# REPLICA_SOCKS5_PROXY_PORT: 1080 + + # Run queries against a fake wiki database -REPLICA_DOMAIN: '' -REPLICA_HOST: 'mywiki' -REPLICA_DB: 'mywiki_p' -REPLICA_USER: 'repl' -REPLICA_PASSWORD: 'repl' +# Change these 3 lines if you're using the live replicas. 
+REPLICA_DOMAIN: '' # Change to `analytics.db.svc.wikimedia.cloud` for live replicas +REPLICA_USER: 'repl' # For live replicas, your replica.my.cnf username +REPLICA_PASSWORD: 'repl' # For live replicas, your replica.my.cnf password + REPLICA_PORT: 3306 OUTPUT_PATH_TEMPLATE: '/results/%s/%s/%s.sqlite' REDIS_HOST: 'redis' diff --git a/quarry/web/replica.py b/quarry/web/replica.py index e02116f..9c2450a 100644 --- a/quarry/web/replica.py +++ b/quarry/web/replica.py @@ -1,4 +1,5 @@ import pymysql +import socks class ReplicaConnectionException(Exception): @@ -18,6 +19,7 @@ def _db_name_mangler(self): if self.dbname == "meta" or self.dbname == "meta_p": self.database_name = "s7" + self.database_p = "meta_p" elif self.dbname == "centralauth" or self.dbname == "centralauth_p": self.database_name = "s7" @@ -55,15 +57,29 @@ def connection(self, db): if self.config["REPLICA_DOMAIN"] else self.database_name ) - self._replica = pymysql.connect( - host=repl_host, - db=self.database_p, - user=self.config["REPLICA_USER"], - passwd=self.config["REPLICA_PASSWORD"], - port=self.config["REPLICA_PORT"], - charset="utf8", - client_flag=pymysql.constants.CLIENT.MULTI_STATEMENTS, - ) + connect_opts = { + "db": self.database_p, + "user": self.config["REPLICA_USER"], + "passwd": self.config["REPLICA_PASSWORD"], + "charset": "utf8", + "client_flag": pymysql.constants.CLIENT.MULTI_STATEMENTS, + } + + if not self.config.get("REPLICA_SOCKS5_PROXY_HOST"): + self._replica = pymysql.connect( + host=repl_host, port=self.config["REPLICA_PORT"], **connect_opts + ) + else: + self._replica = pymysql.connect(defer_connect=True, **connect_opts) + + sock = socks.socksocket() + sock.set_proxy( + socks.SOCKS5, + addr=self.config["REPLICA_SOCKS5_PROXY_HOST"], + port=self.config["REPLICA_SOCKS5_PROXY_PORT"], + ) + sock.connect((repl_host, self.config["REPLICA_PORT"])) + self._replica.connect(sock=sock) @connection.deleter def connection(self): diff --git a/requirements.txt b/requirements.txt index 
3ebd09b..1ffa1f7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -31,3 +31,4 @@ importlib-metadata==4.6.3 zipp==3.5.0 typing-extensions==3.10.0.0 flask_caching==2.0.2 +PySocks==1.7.1