Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for using a SOCKS5 proxy to connect to live replica database #25

Merged
merged 2 commits into from
Oct 7, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,5 @@ config.yaml
.python-version
.tox/
*.swp
.vscode
.tool-versions
2 changes: 2 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ services:
- .:/app
- results:/results
entrypoint: ["celery", "--app", "quarry.web.worker", "worker", ]
extra_hosts:
- "host.docker.internal:host-gateway"
audiodude marked this conversation as resolved.
Show resolved Hide resolved
depends_on:
- "db"
- "redis"
Expand Down
20 changes: 15 additions & 5 deletions quarry/default_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,22 @@ task_acks_late: True # Tasks are idempotent!
task_track_started: True
worker_prefetch_multiplier: 1 # Tasks can run for a long time

# Run queries against the live Wikimedia replica databases. This requires a
# Toolforge account, so that you can retrieve your credentials from
# ~/replica.my.cnf (and log into Toolforge to establish the SOCKS5 proxy). These
# lines should remain commented out if you're not using a SOCKS5 proxy.
# REPLICA_SOCKS5_PROXY_HOST is the IP address of the host running the docker
# container. On Windows or macOS this might be `host.docker.internal`.
# REPLICA_SOCKS5_PROXY_HOST: '172.17.0.1'
# REPLICA_SOCKS5_PROXY_PORT: 1080


# Run queries against a fake wiki database
REPLICA_DOMAIN: ''
REPLICA_HOST: 'mywiki'
REPLICA_DB: 'mywiki_p'
REPLICA_USER: 'repl'
REPLICA_PASSWORD: 'repl'
# Change these 3 lines if you're using the live replicas.
REPLICA_DOMAIN: '' # Change to `analytics.db.svc.wikimedia.cloud` for live replicas
REPLICA_USER: 'repl' # For live replicas, your replica.my.cnf username
REPLICA_PASSWORD: 'repl' # For live replicas, your replica.my.cnf password

REPLICA_PORT: 3306
OUTPUT_PATH_TEMPLATE: '/results/%s/%s/%s.sqlite'
REDIS_HOST: 'redis'
Expand Down
34 changes: 25 additions & 9 deletions quarry/web/replica.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import pymysql
import socks


class ReplicaConnectionException(Exception):
Expand All @@ -18,6 +19,7 @@ def _db_name_mangler(self):

if self.dbname == "meta" or self.dbname == "meta_p":
self.database_name = "s7"

self.database_p = "meta_p"
elif self.dbname == "centralauth" or self.dbname == "centralauth_p":
self.database_name = "s7"
Expand Down Expand Up @@ -55,15 +57,29 @@ def connection(self, db):
if self.config["REPLICA_DOMAIN"]
else self.database_name
)
self._replica = pymysql.connect(
host=repl_host,
db=self.database_p,
user=self.config["REPLICA_USER"],
passwd=self.config["REPLICA_PASSWORD"],
port=self.config["REPLICA_PORT"],
charset="utf8",
client_flag=pymysql.constants.CLIENT.MULTI_STATEMENTS,
)
connect_opts = {
"db": self.database_p,
"user": self.config["REPLICA_USER"],
"passwd": self.config["REPLICA_PASSWORD"],
"charset": "utf8",
"client_flag": pymysql.constants.CLIENT.MULTI_STATEMENTS,
}

if not self.config.get("REPLICA_SOCKS5_PROXY_HOST"):
self._replica = pymysql.connect(
host=repl_host, port=self.config["REPLICA_PORT"], **connect_opts
)
else:
self._replica = pymysql.connect(defer_connect=True, **connect_opts)

sock = socks.socksocket()
sock.set_proxy(
socks.SOCKS5,
addr=self.config["REPLICA_SOCKS5_PROXY_HOST"],
port=self.config["REPLICA_SOCKS5_PROXY_PORT"],
)
sock.connect((repl_host, self.config["REPLICA_PORT"]))
self._replica.connect(sock=sock)

@connection.deleter
def connection(self):
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -31,3 +31,4 @@ importlib-metadata==4.6.3
zipp==3.5.0
typing-extensions==3.10.0.0
flask_caching==2.0.2
PySocks==1.7.1
Loading