From 8bbf33397b2ab29f413f5e63413170a8170642d5 Mon Sep 17 00:00:00 2001 From: Rafael Lima <52034564+rafael-lima-tw@users.noreply.github.com> Date: Fri, 23 Jul 2021 10:22:27 -0300 Subject: [PATCH 01/78] Update documentation with the new github tap functionality (#755) --- docs/connectors/taps/github.rst | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/docs/connectors/taps/github.rst b/docs/connectors/taps/github.rst index c4fa7b05e..d2b9162c7 100644 --- a/docs/connectors/taps/github.rst +++ b/docs/connectors/taps/github.rst @@ -33,6 +33,7 @@ Example YAML for ``tap-github``: name: "Github" # Name of the tap type: "tap-github" # !! THIS SHOULD NOT CHANGE !! owner: "somebody@foo.com" # Data owner to contact + sync_period: "*/90 * * * *" # Period in which the tap will run #send_alert: False # Optional: Disable all configured alerts on this tap @@ -41,8 +42,29 @@ Example YAML for ``tap-github``: # ------------------------------------------------------------------------------ db_conn: access_token: "" # Github access token with at least the repo scope - repository: "transferwise/pipelinewise" # Path to one or multiple repositories that you want to extract data from + organization: "gnome" # The organization you want to extract the data from + # Required when repos_include/repository isn't present + # OR + # Required when repos_exclude contains wildcard matchers + # OR + # Required when repos_include/repository contains wildcard matchers + repos_include: "gnome* polari" # Allow list strategy to extract selected repos data from organization. # Each repo path should be space delimited. + # Supports wildcard matching + # Values also valid: singer-io/tap-github another-org/tap-octopus + # Org prefix not allowed when organization is present + repos_exclude: "*tests* api-docs" # Deny list to extract all repos from organization except the ones listed. + # Each repo path should be space delimited. + # Supports wildcard matching + # Requires organization + # Org prefix not allowed in repos_exclude + repository: "gnome/gnome-software" # (DEPRECATED) Path to one or multiple repositories that you want to extract data from organization (has priority over repos_exclude)) + # Each repo path should be space delimited. + # Org prefix not allowed when organization is present + include_archived: false # Optional: true/false to include archived repos. Default false + include_disabled: false # Optional: true/false to include disabled repos. Default false + max_rate_limit_wait_seconds: 600 # Optional: Max time to wait if you hit the github api limit. 
Default to 600s + # ------------------------------------------------------------------------------ # Destination (Target) - Target properties From 9766a4d9fffdcf8db8675ad2b03d6f85440f03bc Mon Sep 17 00:00:00 2001 From: Rafael Lima <52034564+rafael-lima-tw@users.noreply.github.com> Date: Mon, 2 Aug 2021 10:08:38 -0300 Subject: [PATCH 02/78] Ee 000 update tap GitHub schema (#756) --- pipelinewise/cli/schemas/tap.json | 64 ++++++ pylintrc | 4 +- tests/units/cli/resources/tap-github.yml | 31 +++ tests/units/cli/test_cli_utils_tap_github.py | 212 +++++++++++++++++++ 4 files changed, 309 insertions(+), 2 deletions(-) create mode 100644 tests/units/cli/resources/tap-github.yml create mode 100644 tests/units/cli/test_cli_utils_tap_github.py diff --git a/pipelinewise/cli/schemas/tap.json b/pipelinewise/cli/schemas/tap.json index 6c665e350..575f52b28 100644 --- a/pipelinewise/cli/schemas/tap.json +++ b/pipelinewise/cli/schemas/tap.json @@ -232,6 +232,67 @@ } }, "required": ["type"] + }, + "is_tap_github": { + "required": ["type"], + "properties": { + "type": { + "enum": ["tap-github"] + } + } + }, + "tap_github": { + "anyOf": [ + { + "not": { + "$ref": "#/definitions/is_tap_github" + } + }, + { + "required": ["db_conn"], + "properties": { + "db_conn": { + "type": "object", + "required": ["access_token", "start_date"], + "properties": { + "access_token": { + "type": "string" + }, + "start_date": { + "type": "string", + "format": "date-time" + }, + "organization": { + "type": "string" + }, + "repos_include": { + "type": "string" + }, + "repos_exclude": { + "type": "string" + }, + "repository": { + "type": "string" + }, + "include_archived": { + "type": "boolean", + "default": false + }, + "include_disabled": { + "type": "boolean", + "default": false + }, + "max_rate_limit_wait_seconds": { + "type": "integer", + "default": 600, + "minimum": 600, + "maximum": 3600 + } + } + } + } + } + ] } }, "type": "object", @@ -304,6 +365,9 @@ "allOf": [ { "$ref": "#/definitions/tap_mongo_implies_ft_and_lb" + }, + { + "$ref": "#/definitions/tap_github" } ], "required": [ diff --git a/pylintrc b/pylintrc index 15e7bc7c8..67e5096d5 100644 --- a/pylintrc +++ b/pylintrc @@ -455,10 +455,10 @@ module-rgx=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$ module-name-hint=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$ # Regular expression matching correct method names -method-rgx=[a-z_][a-z0-9_]{2,30}$ +method-rgx=[a-z_][a-z0-9_]{2,80}$ # Naming hint for method names -method-name-hint=[a-z_][a-z0-9_]{2,30}$ +method-name-hint=[a-z_][a-z0-9_]{2,80}$ # Regular expression which should only match function or class names that do # not require a docstring. 
diff --git a/tests/units/cli/resources/tap-github.yml b/tests/units/cli/resources/tap-github.yml new file mode 100644 index 000000000..1a4f41c30 --- /dev/null +++ b/tests/units/cli/resources/tap-github.yml @@ -0,0 +1,31 @@ +id: "github" +name: "Github" +type: "tap-github" +owner: "somebody@foo.com" +sync_period: "*/90 * * * *" + +db_conn: + access_token: "" + start_date: "2021-07-14T00:00:00Z" + organization: "gnome" + repos_include: "gnome* polari" + repos_exclude: "*tests* api-docs" + repository: "gnome/gnome-software" + include_archived: false + include_disabled: false + max_rate_limit_wait_seconds: 600 + +target: "snowflake" +batch_size_rows: 20000 + +schemas: + - source_schema: "my_db" + target_schema: "repl_my_db" + target_schema_select_permissions: + - grp_stats + tables: + - table_name: "table_one" + replication_method: "INCREMENTAL" + replication_key: "last_update" + - table_name: "table_two" + replication_method: "LOG_BASED" diff --git a/tests/units/cli/test_cli_utils_tap_github.py b/tests/units/cli/test_cli_utils_tap_github.py new file mode 100644 index 000000000..6a85893bd --- /dev/null +++ b/tests/units/cli/test_cli_utils_tap_github.py @@ -0,0 +1,212 @@ +import os +import pytest + +from unittest import TestCase +import pipelinewise.cli as cli + +TAP_GITHUB_YAML = '{}/resources/tap-github.yml'.format(os.path.dirname(__file__)) + + +# pylint: disable=no-self-use,too-many-public-methods,fixme +class TestUtils(TestCase): + """ + Unit Tests for Tap Github PipelineWise CLI utility functions + """ + + def assert_json_is_invalid(self, schema, invalid_yaml): + """Simple assertion to check if validate function exits with error""" + with pytest.raises(SystemExit) as pytest_wrapped_e: + cli.utils.validate(invalid_yaml, schema) + + self.assertEqual(pytest_wrapped_e.type, SystemExit) + self.assertEqual(pytest_wrapped_e.value.code, 1) + + def test_should_pass_with_valid_json_schema(self): + """ + Test Should pass with valid json schema + """ + schema = cli.utils.load_schema('tap') + + actual_yaml = cli.utils.load_yaml(TAP_GITHUB_YAML) + self.assertIsNone(cli.utils.validate(actual_yaml, schema)) + + def test_should_pass_if_organization_and_repos_include_missing_but_repository_exists(self): + """ + Test should pass if organization and repos include missing but repository exists + """ + schema = cli.utils.load_schema('tap') + + actual_yaml = cli.utils.load_yaml(TAP_GITHUB_YAML) + del actual_yaml['db_conn']['organization'] + del actual_yaml['db_conn']['repos_include'] + + self.assertIsNone(cli.utils.validate(actual_yaml, schema)) + + def test_should_pass_if_organization_and_repository_missing_but_repos_include_exists(self): + """ + Test should pass if organization and repository missing but repos_include exists + """ + schema = cli.utils.load_schema('tap') + + actual_yaml = cli.utils.load_yaml(TAP_GITHUB_YAML) + del actual_yaml['db_conn']['organization'] + del actual_yaml['db_conn']['repository'] + + self.assertIsNone(cli.utils.validate(actual_yaml, schema)) + + # Todo: make schema pass this test scenario + # def test_should_fail_if_organization_and_repository_and_repos_include_missing(self): + # """ + # validation fails if organization, repository and repos include are all missing + # """ + # schema = cli.utils.load_schema('tap') + # + # actual_yaml = cli.utils.load_yaml(TAP_GITHUB_YAML) + # del actual_yaml['db_conn']['organization'] + # del actual_yaml['db_conn']['repository'] + # del actual_yaml['db_conn']['repos_include'] + # + # self.assert_json_is_invalid(schema, actual_yaml) + + def 
test_should_fail_when_access_token_is_missing(self): + """ + Test Should fail when access token is missing + """ + schema = cli.utils.load_schema('tap') + + actual_yaml = cli.utils.load_yaml(TAP_GITHUB_YAML) + del actual_yaml['db_conn']['access_token'] + + self.assert_json_is_invalid(schema, actual_yaml) + + def test_should_fail_when_start_date_is_missing(self): + """ + Test should fail when start date is missing + """ + schema = cli.utils.load_schema('tap') + + actual_yaml = cli.utils.load_yaml(TAP_GITHUB_YAML) + del actual_yaml['db_conn']['start_date'] + + self.assert_json_is_invalid(schema, actual_yaml) + + def test_should_fail_when_access_token_is_not_string(self): + """ + Test should fail when access token is not string + """ + schema = cli.utils.load_schema('tap') + + actual_yaml = cli.utils.load_yaml(TAP_GITHUB_YAML) + actual_yaml['db_conn']['access_token'] = 123456 + + self.assert_json_is_invalid(schema, actual_yaml) + + def test_should_fail_when_start_date_is_not_string(self): + """ + Test should fail when start date is not string + """ + schema = cli.utils.load_schema('tap') + + actual_yaml = cli.utils.load_yaml(TAP_GITHUB_YAML) + actual_yaml['db_conn']['start_date'] = 123456 + + self.assert_json_is_invalid(schema, actual_yaml) + + def test_should_fail_when_organization_is_not_string(self): + """ + Test should fail when organization is not string + """ + schema = cli.utils.load_schema('tap') + + actual_yaml = cli.utils.load_yaml(TAP_GITHUB_YAML) + actual_yaml['db_conn']['organization'] = [] + + self.assert_json_is_invalid(schema, actual_yaml) + + def test_should_fail_when_repos_include_is_not_string(self): + """ + Test should fail when repos include is not string + """ + schema = cli.utils.load_schema('tap') + + actual_yaml = cli.utils.load_yaml(TAP_GITHUB_YAML) + actual_yaml['db_conn']['repos_include'] = [] + + self.assert_json_is_invalid(schema, actual_yaml) + + def test_should_fail_when_repos_exclude_is_not_string(self): + """ + Test should fail when repos exclude is not string + """ + schema = cli.utils.load_schema('tap') + + actual_yaml = cli.utils.load_yaml(TAP_GITHUB_YAML) + actual_yaml['db_conn']['repos_include'] = {} + + self.assert_json_is_invalid(schema, actual_yaml) + + def test_should_fail_when_repository_is_not_string(self): + """ + Test should fail when repository is not string + """ + schema = cli.utils.load_schema('tap') + + actual_yaml = cli.utils.load_yaml(TAP_GITHUB_YAML) + actual_yaml['db_conn']['repository'] = {} + + self.assert_json_is_invalid(schema, actual_yaml) + + def test_should_fail_when_include_archived_is_not_boolean(self): + """ + Test should fail when include archived is not boolean + """ + schema = cli.utils.load_schema('tap') + + actual_yaml = cli.utils.load_yaml(TAP_GITHUB_YAML) + actual_yaml['db_conn']['include_archived'] = 'false' + + self.assert_json_is_invalid(schema, actual_yaml) + + def test_should_fail_when_include_disabled_is_not_boolean(self): + """ + Test should fail when include disabled is not boolean + """ + schema = cli.utils.load_schema('tap') + + actual_yaml = cli.utils.load_yaml(TAP_GITHUB_YAML) + actual_yaml['db_conn']['include_archived'] = 'false' + + self.assert_json_is_invalid(schema, actual_yaml) + + def test_should_fail_when_max_rate_limit_wait_seconds_is_not_integer(self): + """ + Test should fail when max rate limit wait seconds is not integer + """ + schema = cli.utils.load_schema('tap') + + actual_yaml = cli.utils.load_yaml(TAP_GITHUB_YAML) + actual_yaml['db_conn']['max_rate_limit_wait_seconds'] = '111' + + 
self.assert_json_is_invalid(schema, actual_yaml) + + def test_should_fail_when_max_rate_limit_wait_seconds_is_above_max(self): + """ + Test should fail when max rate limit wait seconds is above the max + """ + schema = cli.utils.load_schema('tap') + + actual_yaml = cli.utils.load_yaml(TAP_GITHUB_YAML) + actual_yaml['db_conn']['max_rate_limit_wait_seconds'] = 4000 + + self.assert_json_is_invalid(schema, actual_yaml) + + def test_should_fail_when_max_rate_limit_wait_seconds_is_below_minx(self): + """ + Test should fail when max rate limit wait seconds is below the min + """ + schema = cli.utils.load_schema('tap') + + actual_yaml = cli.utils.load_yaml(TAP_GITHUB_YAML) + actual_yaml['db_conn']['max_rate_limit_wait_seconds'] = 30 + + self.assert_json_is_invalid(schema, actual_yaml) From e895d27b771f0b233164588511afe9d15dceeb29 Mon Sep 17 00:00:00 2001 From: Samira El Aabidi <54845154+Samira-El@users.noreply.github.com> Date: Tue, 3 Aug 2021 11:13:30 +0300 Subject: [PATCH 03/78] do not log json object (#764) --- pipelinewise/cli/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipelinewise/cli/utils.py b/pipelinewise/cli/utils.py index 46c54b96a..d67484362 100644 --- a/pipelinewise/cli/utils.py +++ b/pipelinewise/cli/utils.py @@ -275,8 +275,8 @@ def validate(instance, schema): # Serialise vault encrypted objects to string schema_safe_inst = json.loads(json.dumps(instance, cls=AnsibleJSONEncoder)) jsonschema.validate(instance=schema_safe_inst, schema=schema) - except jsonschema.exceptions.ValidationError as exc: - LOGGER.critical('Invalid object %s', exc) + except jsonschema.exceptions.ValidationError: + LOGGER.critical('json object doesn\'t match schema %s', schema) sys.exit(1) From 164d13def246a631601cac5b5c5b4b03b1629bde Mon Sep 17 00:00:00 2001 From: Samira El Aabidi <54845154+Samira-El@users.noreply.github.com> Date: Tue, 3 Aug 2021 14:48:22 +0300 Subject: [PATCH 04/78] AP-1037 Optimise docker image and build a ci image (#765) --- .circleci/config.yml | 24 ++++++++++++++++++++++++ .dockerignore | 23 +++++++++++++++++++++-- Dockerfile | 20 ++++++++++---------- Dockerfile.ci | 13 +++++++++++++ install.sh | 10 ++++++++-- 5 files changed, 76 insertions(+), 14 deletions(-) create mode 100644 Dockerfile.ci diff --git a/.circleci/config.yml b/.circleci/config.yml index b536a9737..88a3d1a8f 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -153,6 +153,21 @@ jobs: DEPLOY_DOCKERFILE: "./Dockerfile" DEPLOY_IMAGE_ADDITIONAL_TAGS: "latest" + upload_ci_docker_image: + <<: *docker_k8s_deployer + steps: + - checkout + - setup_remote_docker + - *attach_workspace + - run: + name: Push and release the new ci docker image to artifactory + command: k8s-deployment docker-build jfrog-cli-docker-push + environment: + DEPLOY_SLACK_CHANNEL: "#analytics-platform-builds" + DEPLOY_IMAGE_NAME: "pipelinewise-ci" + DEPLOY_DOCKERFILE: "./Dockerfile.ci" + DEPLOY_IMAGE_ADDITIONAL_TAGS: "latest" + promote_docker_image: <<: *docker_k8s_deployer steps: @@ -183,6 +198,15 @@ workflows: only: - master + - upload_ci_docker_image: + context: kubernetes-staging + requires: + - e2e_tests + filters: + branches: + only: + - master + - promote_docker_image: context: promote-build requires: diff --git a/.dockerignore b/.dockerignore index 06acfc141..3276c3701 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,10 +1,29 @@ **/.git **/.virtualenvs +**/.github **/.circleci **/.pytest_cache **/__pycache__ *.egg-info *.egg/ -docs -dev-project *.rpm +**/venv +**/.venv +**/.coverage + +bin +dev-project 
+docs +scripts +tests +test-reports + +.coveragerc +.pre-commit-config.yaml +.gitignore +.style.yapf +.yapfignore +CONTRIBUTING.md +CHANGELOG.md +pylintrc +pytest.ini diff --git a/Dockerfile b/Dockerfile index 49c126cd5..9ad1de1f1 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,21 +1,21 @@ FROM python:3.7-slim-buster -RUN apt-get -qq update && apt-get -qqy install \ +ARG connectors=all +COPY . /app + +RUN apt-get -qq update \ + && apt-get -qqy --no-install-recommends install \ apt-utils \ alien \ libaio1 \ mongo-tools \ mbuffer \ wget \ - && pip install --upgrade pip - -ARG connectors=all -COPY . /app - -# Install Oracle Instant Client for tap-oracle if its in the connectors list -RUN bash -c "if grep -q \"tap-oracle\" <<< \"$connectors\"; then wget https://download.oracle.com/otn_software/linux/instantclient/193000/oracle-instantclient19.3-basiclite-19.3.0.0.0-1.x86_64.rpm -O /app/oracle-instantclient.rpm && alien -i /app/oracle-instantclient.rpm --scripts && rm -rf /app/oracle-instantclient.rpm ; fi" - -RUN cd /app \ + && rm -rf /var/lib/apt/lists/* \ + && pip install -U --no-cache-dir pip \ + # Install Oracle Instant Client for tap-oracle if its in the connectors list + && bash -c "if grep -q \"tap-oracle\" <<< \"$connectors\"; then wget https://download.oracle.com/otn_software/linux/instantclient/193000/oracle-instantclient19.3-basiclite-19.3.0.0.0-1.x86_64.rpm -O /app/oracle-instantclient.rpm && alien -i /app/oracle-instantclient.rpm --scripts && rm -rf /app/oracle-instantclient.rpm ; fi" \ + && cd /app \ && ./install.sh --connectors=$connectors --acceptlicenses --nousage --notestextras \ && ln -s /root/.pipelinewise /app/.pipelinewise diff --git a/Dockerfile.ci b/Dockerfile.ci new file mode 100644 index 000000000..d6f620011 --- /dev/null +++ b/Dockerfile.ci @@ -0,0 +1,13 @@ +FROM python:3.7-slim-buster + +COPY . /app + +RUN apt-get -qq update \ + && apt-get -qqy --no-install-recommends install apt-utils alien libaio1 wget \ + && rm -rf /var/lib/apt/lists/* \ + && pip install -U --no-cache-dir pip \ + && cd /app \ + && ./install.sh --connectors=none --acceptlicenses --nousage --notestextras \ + && ln -s /root/.pipelinewise /app/.pipelinewise + +ENTRYPOINT ["/app/entrypoint.sh"] diff --git a/install.sh b/install.sh index 1bb5ca109..36b25d2a4 100755 --- a/install.sh +++ b/install.sh @@ -123,7 +123,7 @@ print_installed_connectors() { # Parse command line arguments for arg in "$@"; do case $arg in - # Auto accept license agreemnets. Useful if PipelineWise installed by an automated script + # Auto accept license agreements. 
Useful if PipelineWise installed by an automated script --acceptlicenses) ACCEPT_LICENSES="YES" ;; @@ -200,6 +200,9 @@ if [[ -z $CONNECTORS ]]; then install_connector $i done +# don't install any connectors if --connectors=none passed +elif [[ $CONNECTORS == "none" ]]; then + echo "No connectors will be installed" # Install every available connectors if --connectors=all passed elif [[ $CONNECTORS == "all" ]]; then @@ -227,7 +230,10 @@ echo "-------------------------------------------------------------------------- echo "PipelineWise installed successfully in $((end_time-start_time)) seconds" echo "--------------------------------------------------------------------------" -print_installed_connectors +if [[ $CONNECTORS != "none" ]]; then + print_installed_connectors +fi + if [[ $NO_USAGE != "YES" ]]; then echo echo "To start CLI:" From c4f858768c2a6b013f7d6ff07c74e8d573933e2f Mon Sep 17 00:00:00 2001 From: Diego Furtado Date: Wed, 4 Aug 2021 09:08:01 +0100 Subject: [PATCH 05/78] EE-1104 - 0.35.0v (#760) --- CHANGELOG.md | 13 +++++++++++++ README.md | 2 +- setup.py | 2 +- 3 files changed, 15 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 55efc2e5b..1dd168ecd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,16 @@ +0.35.0 (2021-08-04) +------------------- +- Support `"none"` as a value for `--connectors` in `install.sh` script to install a stripped down Pipelinewise without any connectors. +- Optimize Dockerfile +- Do not log invalid json objects if they fail validation against json schema. +- Replace `github-tap` with fork `pipelinewise-tap-github` version `1.0.0` +- Add schema validation for github tap +- Increase batch_size_rows from 1M to 5M +- Increase split_file_chunk_size_mb from 2500 to 5000 +- Add latest tag to docker image +- Bump `pipelinewise-tap-s3-csv` from `1.2.1` to `1.2.2` +- Update pymongo requirement from `<3.12,>=3.10` to `>=3.10,<3.13` + 0.34.1 (2021-07-15) ------------------- - Bump `pipelinewise-target-snowflake` from `1.13.0` to `1.13.1` diff --git a/README.md b/README.md index 6394a421e..682f40b18 100644 --- a/README.md +++ b/README.md @@ -55,7 +55,7 @@ consumes data from taps and do something with it, like load it into a file, API | Tap | **[Google Analytics](https://github.com/transferwise/pipelinewise-tap-google-analytics)** | Extra | [![PyPI version](https://badge.fury.io/py/pipelinewise-tap-google-analytics.svg)](https://badge.fury.io/py/tap-adwords) | Extracts data from Google Analytics | | Tap | **[Oracle](https://github.com/transferwise/pipelinewise-tap-oracle)** | Extra | [![PyPI version](https://badge.fury.io/py/pipelinewise-tap-oracle.svg)](https://badge.fury.io/py/pipelinewise-tap-oracle) | Extracts data from Oracle databases. 
Supporting Log-Based, Key-Based Incremental and Full Table replications | | Tap | **[Zuora](https://github.com/transferwise/pipelinewise-tap-zuora)** | Extra | [![PyPI version](https://badge.fury.io/py/pipelinewise-tap-zuora.svg)](https://badge.fury.io/py/pipelinewise-tap-zuora) | Extracts data from Zuora database using AQAA and REST extraction API with Key-Based incremental replications | -| Tap | **[GitHub](https://github.com/singer-io/tap-github)** | | [![PyPI version](https://badge.fury.io/py/tap-github.svg)](https://badge.fury.io/py/tap-github) | Extracts data from GitHub API using Personal Access Token and Key-Based incremental replications | +| Tap | **[GitHub](https://github.com/transferwise/pipelinewise-tap-github)** | | [![PyPI version](https://badge.fury.io/py/pipelinewise-tap-github.svg)](https://badge.fury.io/py/pipelinewise-tap-github) | Extracts data from GitHub API using Personal Access Token and Key-Based incremental replications | | Tap | **[Shopify](https://github.com/singer-io/tap-shopify)** | Extra | [![PyPI version](https://badge.fury.io/py/tap-shopify.svg)](https://badge.fury.io/py/tap-shopify) | Extracts data from Shopify API using Personal App API Password and date based incremental replications | | Tap | **[Slack](https://github.com/transferwise/pipelinewise-tap-slack)** | | [![PyPI version](https://badge.fury.io/py/pipelinewise-tap-slack.svg)](https://badge.fury.io/py/pipelinewise-tap-slack) | Extracts data from a Slack API using Bot User Token and Key-Based incremental replications | | Tap | **[Mixpanel](https://github.com/transferwise/pipelinewise-tap-mixpanel)** | | [![PyPI version](https://badge.fury.io/py/pipelinewise-tap-mixpanel.svg)](https://badge.fury.io/py/pipelinewise-tap-mixpanel) | Extracts data from the Mixpanel API. 
| diff --git a/setup.py b/setup.py index 4803cd0ab..8babe7c90 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ LONG_DESCRIPTION = f.read() setup(name='pipelinewise', - version='0.34.1', + version='0.35.0', description='PipelineWise', long_description=LONG_DESCRIPTION, long_description_content_type='text/markdown', From 604f6d466c3354d5754bbf9efbc10cb3bc947596 Mon Sep 17 00:00:00 2001 From: Peter Kosztolanyi Date: Mon, 9 Aug 2021 15:52:13 +0100 Subject: [PATCH 06/78] Bump tap-github from 1.0.0 to 1.0.1 (#767) --- singer-connectors/tap-github/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/singer-connectors/tap-github/requirements.txt b/singer-connectors/tap-github/requirements.txt index ed1090806..f919e5a32 100644 --- a/singer-connectors/tap-github/requirements.txt +++ b/singer-connectors/tap-github/requirements.txt @@ -1 +1 @@ -pipelinewise-tap-github==1.0.0 +pipelinewise-tap-github==1.0.1 From 938a609d3e1adf86d43eedffb8cad9acac5e7149 Mon Sep 17 00:00:00 2001 From: Peter Kosztolanyi Date: Fri, 13 Aug 2021 12:12:05 +0200 Subject: [PATCH 07/78] v0.35.1 (#769) --- CHANGELOG.md | 7 +++++++ docs/conf.py | 2 +- setup.py | 2 +- singer-connectors/tap-jira/requirements.txt | 2 +- singer-connectors/tap-kafka/requirements.txt | 2 +- singer-connectors/target-s3-csv/requirements.txt | 2 +- 6 files changed, 12 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1dd168ecd..180888133 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,10 @@ +0.35.1 (2021-08-13) +------------------- +- Bump `pipelinewise-tap-github` from `1.0.0` to `1.0.1` +- Bump `pipelinewise-tap-kafka` from `4.0.0` to `4.0.1` +- Bump `tap-jira` from `2.0.0` to `2.0.1` +- Bump `pipelinewise-target-s3-csv` from `1.4.0` to `1.5.0` + 0.35.0 (2021-08-04) ------------------- - Support `"none"` as a value for `--connectors` in `install.sh` script to install a stripped down Pipelinewise without any connectors. diff --git a/docs/conf.py b/docs/conf.py index 3cb714e3e..f3ffc0f7c 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -25,7 +25,7 @@ def setup(app): project = 'PipelineWise' copyright = f'{datetime.datetime.now().year}, Wise Ltd.' 
author = 'Wise' -version = '0.34.0' +version = '0.35.1' # -- General configuration --------------------------------------------------- diff --git a/setup.py b/setup.py index 8babe7c90..4c168471b 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ LONG_DESCRIPTION = f.read() setup(name='pipelinewise', - version='0.35.0', + version='0.35.1', description='PipelineWise', long_description=LONG_DESCRIPTION, long_description_content_type='text/markdown', diff --git a/singer-connectors/tap-jira/requirements.txt b/singer-connectors/tap-jira/requirements.txt index ec8ace747..38e8dda4a 100644 --- a/singer-connectors/tap-jira/requirements.txt +++ b/singer-connectors/tap-jira/requirements.txt @@ -1 +1 @@ -tap-jira==2.0.0 +tap-jira==2.0.1 diff --git a/singer-connectors/tap-kafka/requirements.txt b/singer-connectors/tap-kafka/requirements.txt index 493da5f1d..2d30a0cf5 100644 --- a/singer-connectors/tap-kafka/requirements.txt +++ b/singer-connectors/tap-kafka/requirements.txt @@ -1 +1 @@ -pipelinewise-tap-kafka==4.0.0 +pipelinewise-tap-kafka==4.0.1 diff --git a/singer-connectors/target-s3-csv/requirements.txt b/singer-connectors/target-s3-csv/requirements.txt index 8812bfac6..72ee8686f 100644 --- a/singer-connectors/target-s3-csv/requirements.txt +++ b/singer-connectors/target-s3-csv/requirements.txt @@ -1 +1 @@ -pipelinewise-target-s3-csv==1.4.0 +pipelinewise-target-s3-csv==1.5.0 From e5ce034408fce7a98163f9fccc5118ca26d20dab Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 16 Aug 2021 11:43:43 +0200 Subject: [PATCH 08/78] Bump jinja2 from 2.11.3 to 3.0.1 (#719) Bumps [jinja2](https://github.com/pallets/jinja) from 2.11.3 to 3.0.1. - [Release notes](https://github.com/pallets/jinja/releases) - [Changelog](https://github.com/pallets/jinja/blob/main/CHANGES.rst) - [Commits](https://github.com/pallets/jinja/compare/2.11.3...3.0.1) --- updated-dependencies: - dependency-name: jinja2 dependency-type: direct:production update-type: version-update:semver-major ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Peter Kosztolanyi --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 4c168471b..ce86f6552 100644 --- a/setup.py +++ b/setup.py @@ -21,7 +21,7 @@ 'tabulate==0.8.9', 'PyYAML==5.4.1', 'ansible==4.2.0', - 'Jinja2==2.11.3', + 'Jinja2==3.0.1', 'joblib==1.0.0', 'PyMySQL==0.7.11', 'psycopg2-binary==2.8.6', From 19ad08c9f1a004ef784c9aee82dae4b6b2f601f6 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 16 Aug 2021 12:44:48 +0200 Subject: [PATCH 09/78] Bump pre-commit from 2.13.0 to 2.14.0 (#776) --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index ce86f6552..9d80c3a52 100644 --- a/setup.py +++ b/setup.py @@ -31,7 +31,7 @@ 'singer-encodings==0.0.*', 'messytables==0.15.*', 'python-pidfile==3.0.0', - 'pre-commit==2.13.0', + 'pre-commit==2.14.0', 'pymongo>=3.10,<3.13', 'tzlocal>=2.0,<2.2', 'slackclient>=2.7,<2.10', From b5829d8f2a7ef076b4aaa8df149d69b5fdb3f209 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 16 Aug 2021 15:06:59 +0200 Subject: [PATCH 10/78] Bump python-dotenv from 0.18.0 to 0.19.0 (#774) --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 9d80c3a52..5120880aa 100644 --- a/setup.py +++ b/setup.py @@ -42,7 +42,7 @@ 'pytest==6.2.4', 'pytest-dependency==0.4.0', 'pytest-cov==2.12.1', - 'python-dotenv==0.18.0', + 'python-dotenv==0.19.0', 'mock==4.0.3', 'pylint==2.8.3', 'unify==0.5' From 8a582f12f06cd6d2afa501c7eff5ed9e39f47c8b Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 17 Aug 2021 10:20:39 +0200 Subject: [PATCH 11/78] Bump ansible from 4.2.0 to 4.4.0 (#773) --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 5120880aa..623143b2e 100644 --- a/setup.py +++ b/setup.py @@ -20,7 +20,7 @@ 'argparse==1.4.0', 'tabulate==0.8.9', 'PyYAML==5.4.1', - 'ansible==4.2.0', + 'ansible==4.4.0', 'Jinja2==3.0.1', 'joblib==1.0.0', 'PyMySQL==0.7.11', From 6aff16312cd669bb5e98234e7110da919507c999 Mon Sep 17 00:00:00 2001 From: Peter Kosztolanyi Date: Tue, 17 Aug 2021 15:02:17 +0200 Subject: [PATCH 12/78] Bump pipelinewise-tap-github from 1.0.1 to 1.0.2 (#777) --- singer-connectors/tap-github/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/singer-connectors/tap-github/requirements.txt b/singer-connectors/tap-github/requirements.txt index f919e5a32..1830c2385 100644 --- a/singer-connectors/tap-github/requirements.txt +++ b/singer-connectors/tap-github/requirements.txt @@ -1 +1 @@ -pipelinewise-tap-github==1.0.1 +pipelinewise-tap-github==1.0.2 From 1e90571c6d2221675d0acaf246b8294bafa0baf1 Mon Sep 17 00:00:00 2001 From: Peter Kosztolanyi Date: Tue, 17 Aug 2021 15:57:21 +0200 Subject: [PATCH 13/78] v0.35.2 (#778) --- CHANGELOG.md | 5 +++++ docs/conf.py | 2 +- setup.py | 2 +- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 180888133..63f60dc7f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,8 @@ +0.35.2 (2021-08-17) +------------------- +- Bump `pipelinewise-tap-github` from `1.0.1` to `1.0.2` +- Update a few vulnerable or outdated dependencies to latest + 0.35.1 (2021-08-13) ------------------- - Bump 
`pipelinewise-tap-github` from `1.0.0` to `1.0.1` diff --git a/docs/conf.py b/docs/conf.py index f3ffc0f7c..a97e64c18 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -25,7 +25,7 @@ def setup(app): project = 'PipelineWise' copyright = f'{datetime.datetime.now().year}, Wise Ltd.' author = 'Wise' -version = '0.35.1' +version = '0.35.2' # -- General configuration --------------------------------------------------- diff --git a/setup.py b/setup.py index 623143b2e..9928e7ee9 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ LONG_DESCRIPTION = f.read() setup(name='pipelinewise', - version='0.35.1', + version='0.35.2', description='PipelineWise', long_description=LONG_DESCRIPTION, long_description_content_type='text/markdown', From 37e9c1796681210126e674c4e465ce2599a5cefb Mon Sep 17 00:00:00 2001 From: Peter Kosztolanyi Date: Wed, 25 Aug 2021 14:12:14 +0200 Subject: [PATCH 14/78] Bumpy pylint from 2.8.3 to 2.10.2 (#784) --- pipelinewise/cli/alert_sender.py | 2 +- pipelinewise/cli/commands.py | 2 +- pipelinewise/cli/config.py | 20 ++++++++++--------- pipelinewise/cli/pipelinewise.py | 10 ++++++---- pipelinewise/cli/utils.py | 18 ++++++++++------- pipelinewise/fastsync/commons/split_gzip.py | 2 +- .../fastsync/commons/target_snowflake.py | 2 +- .../fastsync/commons/transform_utils.py | 5 +++++ pipelinewise/fastsync/commons/utils.py | 4 ++-- setup.py | 2 +- tests/end_to_end/helpers/assertions.py | 6 +++--- tests/end_to_end/helpers/env.py | 4 ++-- tests/end_to_end/test_target_snowflake.py | 3 +-- tests/units/cli/test_alert_sender.py | 3 +-- tests/units/cli/test_cli.py | 2 +- tests/units/cli/test_cli_utils.py | 2 +- tests/units/cli/test_cli_utils_tap_github.py | 3 ++- tests/units/cli/test_commands.py | 2 +- tests/units/cli/test_config.py | 2 +- .../commons/test_fastsync_tap_mysql.py | 2 +- 20 files changed, 54 insertions(+), 42 deletions(-) diff --git a/pipelinewise/cli/alert_sender.py b/pipelinewise/cli/alert_sender.py index b51c4208d..88a4ce7e9 100644 --- a/pipelinewise/cli/alert_sender.py +++ b/pipelinewise/cli/alert_sender.py @@ -39,7 +39,7 @@ class AlertSender: def __init__(self, alert_handlers: Dict = None) -> None: # Initialise alert_handlers as empty dictionary if None provided if not alert_handlers: - self.alert_handlers = dict() + self.alert_handlers = {} else: self.alert_handlers = alert_handlers diff --git a/pipelinewise/cli/commands.py b/pipelinewise/cli/commands.py index 8fa0bf162..e824987cb 100644 --- a/pipelinewise/cli/commands.py +++ b/pipelinewise/cli/commands.py @@ -338,7 +338,7 @@ def run_command(command: str, log_file: str = None, line_callback: callable = No # Start command with Popen(shlex.split(piped_command), stdout=PIPE, stderr=STDOUT) as proc: - with open(log_file_running, 'a+') as logfile: + with open(log_file_running, 'a+', encoding='utf-8') as logfile: stdout = '' while True: line = proc.stdout.readline() diff --git a/pipelinewise/cli/config.py b/pipelinewise/cli/config.py index 7ba093fb2..bccdad371 100644 --- a/pipelinewise/cli/config.py +++ b/pipelinewise/cli/config.py @@ -22,8 +22,8 @@ def __init__(self, config_dir): self.logger = logging.getLogger(__name__) self.config_dir = config_dir self.config_path = os.path.join(self.config_dir, 'config.json') - self.global_config = dict() - self.targets = dict() + self.global_config = {} + self.targets = {} @classmethod # pylint: disable=too-many-locals @@ -54,6 +54,7 @@ def from_yamls(cls, config_dir, yaml_dir='.', vault_secret=None): utils.validate(instance=global_config, schema=global_config_schema) config.global_config = 
global_config or {} + # pylint: disable=E1136,E1137 # False positive when loading vault encrypted YAML # Load every target yaml into targets dictionary for yaml_file in target_yamls: config.logger.info('LOADING TARGET: %s', yaml_file) @@ -102,8 +103,8 @@ def from_yamls(cls, config_dir, yaml_dir='.', vault_secret=None): taps[tap_id] = tap_data # Link taps to targets - for target_key in targets: - targets[target_key]['taps'] = [tap for tap in taps.values() if tap['target'] == target_key] + for target_key, target in targets.items(): + target['taps'] = [tap for tap in taps.values() if tap['target'] == target_key] # Final structure is ready config.targets = targets @@ -173,9 +174,10 @@ def save_main_config_json(self): targets = [] # Generate dictionary for config.json - for key in self.targets: + for target_tuple in self.targets.items(): + target = target_tuple[1] taps = [] - for tap in self.targets[key].get('taps'): + for tap in target.get('taps'): taps.append({ 'id': tap.get('id'), 'name': tap.get('name'), @@ -187,10 +189,10 @@ def save_main_config_json(self): }) targets.append({ - 'id': self.targets[key].get('id'), - 'name': self.targets[key].get('name'), + 'id': target.get('id'), + 'name': target.get('name'), 'status': 'ready', - 'type': self.targets[key].get('type'), + 'type': target.get('type'), 'taps': taps }) main_config = {**self.global_config, **{'targets': targets}} diff --git a/pipelinewise/cli/pipelinewise.py b/pipelinewise/cli/pipelinewise.py index 5db67826f..db1abd7e4 100644 --- a/pipelinewise/cli/pipelinewise.py +++ b/pipelinewise/cli/pipelinewise.py @@ -842,7 +842,7 @@ def update_state_file(line: str) -> str: nonlocal start, state if start is None or time() - start >= 2: - with open(tap.state, 'w') as state_file: + with open(tap.state, 'w', encoding='utf-8') as state_file: state_file.write(line) # Update start time to be the current time. @@ -870,7 +870,7 @@ def update_state_file_with_extra_log(line: str) -> str: # update the state file one last time to make sure it always has the last state message. 
if state is not None: - with open(tap.state, 'w') as statefile: + with open(tap.state, 'w', encoding='utf-8') as statefile: statefile.write(state) def run_tap_fastsync(self, tap: TapParams, target: TargetParams, transform: TransformParams): @@ -1074,7 +1074,7 @@ def stop_tap(self): """ pidfile_path = self.tap['files']['pidfile'] try: - with open(pidfile_path) as pidf: + with open(pidfile_path, encoding='utf-8') as pidf: pid = int(pidf.read()) parent = psutil.Process(pid) @@ -1218,6 +1218,7 @@ def validate(self): vault_secret = self.args.secret target_ids = set() + # pylint: disable=E1136,E1137 # False positive when loading vault encrypted YAML # Validate target json schemas and that no duplicate IDs exist for yaml_file in target_yamls: self.logger.info('Started validating %s', yaml_file) @@ -1232,6 +1233,7 @@ def validate(self): self.logger.info('Finished validating %s', yaml_file) tap_ids = set() + # pylint: disable=E1136,E1137 # False positive when loading vault encrypted YAML # Validate tap json schemas, check that every tap has valid 'target' and that no duplicate IDs exist for yaml_file in tap_yamls: self.logger.info('Started validating %s', yaml_file) @@ -1393,7 +1395,7 @@ def _print_tap_run_summary(self, status, start_time, end_time): # Append the summary to the right log file if log_file_to_write_summary: - with open(log_file_to_write_summary, 'a') as logfile: + with open(log_file_to_write_summary, 'a', encoding='utf-8') as logfile: logfile.write(summary) # pylint: disable=unused-variable diff --git a/pipelinewise/cli/utils.py b/pipelinewise/cli/utils.py index d67484362..1d3dc4ff0 100644 --- a/pipelinewise/cli/utils.py +++ b/pipelinewise/cli/utils.py @@ -25,7 +25,7 @@ from ansible.parsing.dataloader import DataLoader from ansible.parsing.vault import (VaultLib, get_file_vault_secret, is_encrypted_file) from ansible.parsing.yaml.loader import AnsibleLoader -from ansible.parsing.yaml.objects import AnsibleVaultEncryptedUnicode +from ansible.parsing.yaml.objects import AnsibleMapping, AnsibleVaultEncryptedUnicode from . 
import tap_properties @@ -74,7 +74,7 @@ def is_json_file(path): """ try: if os.path.isfile(path): - with open(path) as jsonfile: + with open(path, encoding='utf-8') as jsonfile: if json.load(jsonfile): return True return False @@ -89,7 +89,7 @@ def load_json(path): try: LOGGER.debug('Parsing file at %s', path) if os.path.isfile(path): - with open(path) as jsonfile: + with open(path, encoding='utf-8') as jsonfile: return json.load(jsonfile) else: LOGGER.debug('No file at %s', path) @@ -115,7 +115,7 @@ def save_json(data, path): """ try: LOGGER.debug('Saving JSON %s', path) - with open(path, 'w') as jsonfile: + with open(path, 'w', encoding='utf-8') as jsonfile: return json.dump(data, jsonfile, cls=AnsibleJSONEncoder, indent=4, sort_keys=True) except Exception as exc: raise Exception(f'Cannot save JSON {path} {exc}') from exc @@ -138,7 +138,7 @@ def is_yaml_file(path): """ try: if os.path.isfile(path): - with open(path) as yamlfile: + with open(path, encoding='utf-8') as yamlfile: if yaml.safe_load(yamlfile): return True return False @@ -182,7 +182,7 @@ def load_yaml(yaml_file, vault_secret=None): data = None if os.path.isfile(yaml_file): - with open(yaml_file, 'r') as stream: + with open(yaml_file, 'r', encoding='utf-8') as stream: # Render environment variables using jinja templates contents = stream.read() template = Template(contents) @@ -204,6 +204,9 @@ def load_yaml(yaml_file, vault_secret=None): else: LOGGER.debug('No file at %s', yaml_file) + if isinstance(data, AnsibleMapping): + data = dict(data) + return data @@ -295,6 +298,7 @@ def delete_keys_from_dict(dic, keys): return dic if isinstance(dic, list): return [v for v in (delete_keys_from_dict(v, keys) for v in dic) if v] + # pylint: disable=C0325 # False positive on tuples return {k: v for k, v in ((k, delete_keys_from_dict(v, keys)) for k, v in dic.items()) if k not in keys} @@ -502,7 +506,7 @@ def find_errors_in_log_file(file, max_errors=10, error_pattern=None): errors = [] if file and os.path.isfile(file): - with open(file) as file_object: + with open(file, encoding='utf-8') as file_object: for line in file_object: if len(re.findall(error_pattern, line)) > 0: errors.append(line) diff --git a/pipelinewise/fastsync/commons/split_gzip.py b/pipelinewise/fastsync/commons/split_gzip.py index 9905d5eb9..8d7838915 100644 --- a/pipelinewise/fastsync/commons/split_gzip.py +++ b/pipelinewise/fastsync/commons/split_gzip.py @@ -109,7 +109,7 @@ def _activate_chunk_file(self): if self.compress: self.chunk_file = gzip.open(self.chunk_filename, self.mode) else: - self.chunk_file = builtins.open(self.chunk_filename, self.mode) + self.chunk_file = builtins.open(self.chunk_filename, self.mode, encoding='utf-8') @staticmethod def _bytes_to_megabytes(size: int) -> float: diff --git a/pipelinewise/fastsync/commons/target_snowflake.py b/pipelinewise/fastsync/commons/target_snowflake.py index f5ee65d3c..685490b46 100644 --- a/pipelinewise/fastsync/commons/target_snowflake.py +++ b/pipelinewise/fastsync/commons/target_snowflake.py @@ -115,7 +115,7 @@ def upload_to_s3(self, file, tmp_dir=None): ) # Upload to s3 - extra_args = {'ACL': s3_acl} if s3_acl else dict() + extra_args = {'ACL': s3_acl} if s3_acl else {} # Send key and iv in the metadata, that will be required to decrypt and upload the encrypted file extra_args['Metadata'] = { diff --git a/pipelinewise/fastsync/commons/transform_utils.py b/pipelinewise/fastsync/commons/transform_utils.py index 164af4b35..6dc9a300c 100644 --- a/pipelinewise/fastsync/commons/transform_utils.py +++ 
b/pipelinewise/fastsync/commons/transform_utils.py @@ -119,6 +119,7 @@ def get_trans_in_sql_flavor( return trans_map @classmethod + # pylint: disable=W0238 # False positive when it is used by another classmethod def __conditions_to_sql( cls, transform_conditions: List[Dict], @@ -187,6 +188,7 @@ def __conditions_to_sql( return ' AND '.join(conditions) @classmethod + # pylint: disable=W0238 # False positive when it is used by another classmethod def __safe_column(cls, col: str, sql_flavor: SQLFlavor): # Make the field id safe in case it's a reserved word if sql_flavor == SQLFlavor.SNOWFLAKE: @@ -204,6 +206,7 @@ def __safe_column(cls, col: str, sql_flavor: SQLFlavor): return column @classmethod + # pylint: disable=W0238 # False positive when it is used by another classmethod def __hash_to_sql(cls, column: str, sql_flavor: SQLFlavor) -> str: """ convert HASH transformation into the right sql string @@ -231,6 +234,7 @@ def __hash_to_sql(cls, column: str, sql_flavor: SQLFlavor) -> str: return trans @classmethod + # pylint: disable=W0238 # False positive when it is used by another classmethod def __hash_skip_first_to_sql(cls, transform_type: TransformationType, column: str, sql_flavor: SQLFlavor) -> str: """ convert HASH-SKIP-FIRST-n transformation into the right sql string @@ -261,6 +265,7 @@ def __hash_skip_first_to_sql(cls, transform_type: TransformationType, column: st return trans @classmethod + # pylint: disable=W0238 # False positive when it is used by another classmethod def __mask_date_to_sql(cls, column: str, sql_flavor: SQLFlavor) -> str: """ convert MASK-DATE transformation into the right sql string diff --git a/pipelinewise/fastsync/commons/utils.py b/pipelinewise/fastsync/commons/utils.py index 20729c66a..3beb3efdb 100644 --- a/pipelinewise/fastsync/commons/utils.py +++ b/pipelinewise/fastsync/commons/utils.py @@ -35,13 +35,13 @@ def get_cpu_cores(): def load_json(path): - with open(path) as fil: + with open(path, encoding='utf-8') as fil: return json.load(fil) def save_dict_to_json(path, data): LOGGER.info('Saving new state file to %s', path) - with open(path, 'w') as fil: + with open(path, 'w', encoding='utf-8') as fil: fil.write(json.dumps(data)) diff --git a/setup.py b/setup.py index 9928e7ee9..bb0090e14 100644 --- a/setup.py +++ b/setup.py @@ -44,7 +44,7 @@ 'pytest-cov==2.12.1', 'python-dotenv==0.19.0', 'mock==4.0.3', - 'pylint==2.8.3', + 'pylint==2.10.2', 'unify==0.5' ] }, diff --git a/tests/end_to_end/helpers/assertions.py b/tests/end_to_end/helpers/assertions.py index 8cf316b8b..810129cde 100644 --- a/tests/end_to_end/helpers/assertions.py +++ b/tests/end_to_end/helpers/assertions.py @@ -57,7 +57,7 @@ def assert_command_success(return_code, stdout, stderr, log_path=None): failed_log_path = f'{log_path}.failed' # Load failed log file if exists if os.path.isfile(failed_log_path): - with open(failed_log_path, 'r') as file: + with open(failed_log_path, 'r', encoding='utf-8') as file: failed_log = file.read() print(f'STDOUT: {stdout}\nSTDERR: {stderr}\nFAILED LOG: {failed_log}') @@ -81,14 +81,14 @@ def assert_state_file_valid(target_name, tap_name, log_path=None): if log_path: success_log_path = f'{log_path}.success' state_in_log = None - with open(success_log_path, 'r') as log_f: + with open(success_log_path, 'r', encoding='utf-8') as log_f: state_log_pattern = re.search(r'\nINFO STATE emitted from target: (.+\n)', '\n'.join(log_f.readlines())) if state_log_pattern: state_in_log = state_log_pattern.groups()[-1] # If the emitted state message exists in the log then compare it to 
the actual state file if state_in_log: - with open(state_file, 'r') as state_f: + with open(state_file, 'r', encoding='utf-8') as state_f: assert state_in_log == ''.join(state_f.readlines()) diff --git a/tests/end_to_end/helpers/env.py b/tests/end_to_end/helpers/env.py index fddd9c26a..a4a88e818 100644 --- a/tests/end_to_end/helpers/env.py +++ b/tests/end_to_end/helpers/env.py @@ -277,7 +277,7 @@ def _init_test_project_dir(self, project_dir): templates = glob.glob(f'{project_dir}/*.yml.template') for template_path in templates: # Replace env vars in template - with open(template_path, 'r') as f_template: + with open(template_path, 'r', encoding='utf-8') as f_template: yaml = f_template.read() # Detect if every env var configured for the template @@ -293,7 +293,7 @@ def _init_test_project_dir(self, project_dir): yaml = yaml.replace(f'${{{var}}}', self._all_env_vars_to_dict().get(var)) # Write the template replaced YAML file - with open(yaml_path, 'w+') as f_render: + with open(yaml_path, 'w+', encoding='utf-8') as f_render: f_render.write(yaml) # Delete if exists but not configured diff --git a/tests/end_to_end/test_target_snowflake.py b/tests/end_to_end/test_target_snowflake.py index bdf8143cb..0d0015832 100644 --- a/tests/end_to_end/test_target_snowflake.py +++ b/tests/end_to_end/test_target_snowflake.py @@ -262,13 +262,12 @@ def test_replicate_pg_to_sf_with_archive_load_files(self): 'public.country': 1, # FULL_TABLE : fastsync only 'public2.wearehere': 1 # FULL_TABLE : fastsync only } - for schema_table in expected_archive_files_count: + for schema_table, expected_archive_files in expected_archive_files_count.items(): schema, table = schema_table.split('.') files_in_s3_archive = s3_client.list_objects( Bucket=s3_bucket, Prefix=('{}/postgres_to_sf_archive_load_files/{}'.format(archive_s3_prefix, table))).get('Contents') - expected_archive_files = expected_archive_files_count[schema_table] if files_in_s3_archive is None or len(files_in_s3_archive) != expected_archive_files: raise Exception('files_in_archive for {} is {}. 
Expected archive files count: {}'.format( table, diff --git a/tests/units/cli/test_alert_sender.py b/tests/units/cli/test_alert_sender.py index 9ca7ad852..344cdfcd0 100644 --- a/tests/units/cli/test_alert_sender.py +++ b/tests/units/cli/test_alert_sender.py @@ -3,8 +3,7 @@ from unittest.mock import patch from slack.errors import SlackApiError -import pipelinewise.cli.alert_handlers.errors as errors - +from pipelinewise.cli.alert_handlers import errors from pipelinewise.cli.alert_sender import AlertHandler, AlertSender from pipelinewise.cli.alert_handlers.slack_alert_handler import SlackAlertHandler from pipelinewise.cli.alert_handlers.victorops_alert_handler import VictoropsAlertHandler diff --git a/tests/units/cli/test_cli.py b/tests/units/cli/test_cli.py index 31dc805a6..e426edf9c 100644 --- a/tests/units/cli/test_cli.py +++ b/tests/units/cli/test_cli.py @@ -7,10 +7,10 @@ import pidfile from pathlib import Path -import pipelinewise.cli as cli import pytest from unittest.mock import patch from tests.units.cli.cli_args import CliArgs +from pipelinewise import cli from pipelinewise.cli.pipelinewise import PipelineWise RESOURCES_DIR = '{}/resources'.format(os.path.dirname(__file__)) diff --git a/tests/units/cli/test_cli_utils.py b/tests/units/cli/test_cli_utils.py index 6d885699a..20039c619 100644 --- a/tests/units/cli/test_cli_utils.py +++ b/tests/units/cli/test_cli_utils.py @@ -1,7 +1,7 @@ import os import re -import pipelinewise.cli as cli +from pipelinewise import cli import pytest VIRTUALENVS_DIR = './virtualenvs-dummy' diff --git a/tests/units/cli/test_cli_utils_tap_github.py b/tests/units/cli/test_cli_utils_tap_github.py index 6a85893bd..5f6db4d5a 100644 --- a/tests/units/cli/test_cli_utils_tap_github.py +++ b/tests/units/cli/test_cli_utils_tap_github.py @@ -2,12 +2,13 @@ import pytest from unittest import TestCase -import pipelinewise.cli as cli +from pipelinewise import cli TAP_GITHUB_YAML = '{}/resources/tap-github.yml'.format(os.path.dirname(__file__)) # pylint: disable=no-self-use,too-many-public-methods,fixme +# pylint: disable=E1136,E1137 # False positive when loading vault encrypted YAML class TestUtils(TestCase): """ Unit Tests for Tap Github PipelineWise CLI utility functions diff --git a/tests/units/cli/test_commands.py b/tests/units/cli/test_commands.py index 0746c1118..7d70669ca 100644 --- a/tests/units/cli/test_commands.py +++ b/tests/units/cli/test_commands.py @@ -1,8 +1,8 @@ import os import sys import pytest -import pipelinewise.cli.commands as commands +from pipelinewise.cli import commands from pipelinewise.cli.errors import StreamBufferTooLargeException diff --git a/tests/units/cli/test_config.py b/tests/units/cli/test_config.py index b3517b0b9..70b786c66 100644 --- a/tests/units/cli/test_config.py +++ b/tests/units/cli/test_config.py @@ -1,7 +1,7 @@ import os import shutil -import pipelinewise.cli as cli +from pipelinewise import cli import pytest PIPELINEWISE_TEST_HOME = '/tmp/.pipelinewise' diff --git a/tests/units/fastsync/commons/test_fastsync_tap_mysql.py b/tests/units/fastsync/commons/test_fastsync_tap_mysql.py index 75377d4b0..eafe4f616 100644 --- a/tests/units/fastsync/commons/test_fastsync_tap_mysql.py +++ b/tests/units/fastsync/commons/test_fastsync_tap_mysql.py @@ -2,7 +2,7 @@ from unittest.mock import patch import pymysql -import pipelinewise.fastsync.commons.tap_mysql as tap_mysql +from pipelinewise.fastsync.commons import tap_mysql from pipelinewise.fastsync.commons.tap_mysql import FastSyncTapMySql From ca92f8d40b716a6b8bc06c581bb3e9a65e19a443 Mon 
Sep 17 00:00:00 2001 From: Samira El Aabidi <54845154+Samira-El@users.noreply.github.com> Date: Mon, 30 Aug 2021 10:28:40 +0300 Subject: [PATCH 15/78] AP-1045 Issue-578 Fastync Mongodb-Reshift pair doesn't exit - Fix Fastync pairing logic (#787) --- docs/concept/fastsync.rst | 9 ++- pipelinewise/cli/config.py | 5 +- pipelinewise/cli/constants.py | 17 +++++ pipelinewise/cli/pipelinewise.py | 109 ++++++++++++++++++------------- pipelinewise/fastsync/README.md | 19 ++---- tests/units/cli/test_cli.py | 80 +++++++++++++++-------- tests/units/cli/test_config.py | 48 +++++++++----- 7 files changed, 183 insertions(+), 104 deletions(-) create mode 100644 pipelinewise/cli/constants.py diff --git a/docs/concept/fastsync.rst b/docs/concept/fastsync.rst index 179e2bf24..d5acd5be5 100644 --- a/docs/concept/fastsync.rst +++ b/docs/concept/fastsync.rst @@ -45,4 +45,11 @@ Fast Sync exists only between the following tap and target components: +----------------------------+----------------------------------+ | :ref:`tap-mongodb` | **->** :ref:`target-postgres` | +----------------------------+----------------------------------+ - +| :ref:`tap-mysql` | **->** :ref:`target-bigquery` | ++----------------------------+----------------------------------+ +| :ref:`tap-postgres` | **->** :ref:`target-bigquery` | ++----------------------------+----------------------------------+ +| :ref:`tap-s3-csv` | **->** :ref:`target-bigquery` | ++----------------------------+----------------------------------+ +| :ref:`tap-mongodb` | **->** :ref:`target-bigquery` | ++----------------------------+----------------------------------+ diff --git a/pipelinewise/cli/config.py b/pipelinewise/cli/config.py index bccdad371..bd1f3ea0e 100644 --- a/pipelinewise/cli/config.py +++ b/pipelinewise/cli/config.py @@ -6,6 +6,8 @@ import sys import json +from typing import Dict + from pipelinewise.utils import safe_column_name from . 
import utils @@ -130,7 +132,7 @@ def get_tap_dir(self, target_id, tap_id): return os.path.join(self.config_dir, target_id, tap_id) @staticmethod - def get_connector_files(connector_dir): + def get_connector_files(connector_dir: str) -> Dict: """ Returns the absolute paths of a tap/target configuration files """ @@ -141,6 +143,7 @@ def get_connector_files(connector_dir): 'state': os.path.join(connector_dir, 'state.json'), 'transformation': os.path.join(connector_dir, 'transformation.json'), 'selection': os.path.join(connector_dir, 'selection.json'), + 'pidfile': os.path.join(connector_dir, 'pipelinewise.pid') } def save(self): diff --git a/pipelinewise/cli/constants.py b/pipelinewise/cli/constants.py new file mode 100644 index 000000000..b676c81ec --- /dev/null +++ b/pipelinewise/cli/constants.py @@ -0,0 +1,17 @@ +import enum + + +class ConnectorType(enum.Enum): + """ + Enums for various Singer connector type names + Todo: add more + """ + TAP_MYSQL = 'tap-mysql' + TAP_POSTGRES = 'tap-postgres' + TAP_MONGODB = 'tap-mongodb' + TAP_S3_CSV = 'tap-s3-csv' + + TARGET_BIGQUERY = 'target-bigquery' + TARGET_POSTGRES = 'target-postgres' + TARGET_SNOWFLAKE = 'target-snowflake' + TARGET_REDSHIFT = 'target-redshift' diff --git a/pipelinewise/cli/pipelinewise.py b/pipelinewise/cli/pipelinewise.py index db1abd7e4..6ffdcbbfb 100644 --- a/pipelinewise/cli/pipelinewise.py +++ b/pipelinewise/cli/pipelinewise.py @@ -13,17 +13,44 @@ from datetime import datetime from time import time -from typing import Dict, Optional, List +from typing import Dict, Optional, List, Any from joblib import Parallel, delayed, parallel_backend from tabulate import tabulate from . import utils +from .constants import ConnectorType from . import commands from .commands import TapParams, TargetParams, TransformParams from .config import Config from .alert_sender import AlertSender from .alert_handlers.base_alert_handler import BaseAlertHandler +FASTSYNC_PAIRS = { + ConnectorType.TAP_MYSQL: { + ConnectorType.TARGET_SNOWFLAKE, + ConnectorType.TARGET_REDSHIFT, + ConnectorType.TARGET_POSTGRES, + ConnectorType.TARGET_BIGQUERY + }, + ConnectorType.TAP_POSTGRES: { + ConnectorType.TARGET_SNOWFLAKE, + ConnectorType.TARGET_REDSHIFT, + ConnectorType.TARGET_POSTGRES, + ConnectorType.TARGET_BIGQUERY + }, + ConnectorType.TAP_S3_CSV: { + ConnectorType.TARGET_SNOWFLAKE, + ConnectorType.TARGET_REDSHIFT, + ConnectorType.TARGET_POSTGRES, + ConnectorType.TARGET_BIGQUERY + }, + ConnectorType.TAP_MONGODB: { + ConnectorType.TARGET_SNOWFLAKE, + ConnectorType.TARGET_POSTGRES, + ConnectorType.TARGET_BIGQUERY + }, +} + # pylint: disable=too-many-lines,too-many-instance-attributes,too-many-public-methods class PipelineWise: @@ -115,7 +142,12 @@ def create_consumable_target_config(self, target_config, tap_inheritable_config) raise Exception(f'Cannot merge JSON files {dict_a} {dict_b} - {exc}') from exc # pylint: disable=too-many-statements,too-many-branches,too-many-nested-blocks,too-many-locals,too-many-arguments - def create_filtered_tap_properties(self, target_type, tap_type, tap_properties, tap_state, filters, + def create_filtered_tap_properties(self, + target_type: ConnectorType, + tap_type: ConnectorType, + tap_properties: str, + tap_state: str, + filters: Dict[str, Any], create_fallback=False): """ Create a filtered version of tap properties file based on specific filter conditions. 
@@ -132,11 +164,10 @@ def create_filtered_tap_properties(self, target_type, tap_type, tap_properties, """ # Get filter conditions with default values from input dictionary # Nothing selected by default - f_selected = filters.get('selected', None) - f_target_type = filters.get('target_type', None) - f_tap_type = filters.get('tap_type', None) + f_selected: bool = filters.get('selected', False) + f_tap_target_pairs: Dict = filters.get('tap_target_pairs', {}) f_replication_method = filters.get('replication_method', None) - f_initial_sync_required = filters.get('initial_sync_required', None) + f_initial_sync_required: bool = filters.get('initial_sync_required', False) # Lists of tables that meet and don't meet the filter criteria filtered_tap_stream_ids = [] @@ -200,8 +231,7 @@ def create_filtered_tap_properties(self, target_type, tap_type, tap_properties, # pylint: disable=too-many-boolean-expressions if ( (f_selected is None or selected == f_selected) and - (f_target_type is None or target_type in f_target_type) and - (f_tap_type is None or tap_type in f_tap_type) and + (f_tap_target_pairs is None or target_type in f_tap_target_pairs.get(tap_type, set())) and (f_replication_method is None or replication_method in f_replication_method) and (f_initial_sync_required is None or initial_sync_required == f_initial_sync_required) ): @@ -254,10 +284,10 @@ def create_filtered_tap_properties(self, target_type, tap_type, tap_properties, suffix='.json')[1] utils.save_json(fallback_properties, temp_fallback_properties_path) - return temp_properties_path, \ - filtered_tap_stream_ids, \ - temp_fallback_properties_path, \ - fallback_filtered_stream_ids + return (temp_properties_path, + filtered_tap_stream_ids, + temp_fallback_properties_path, + fallback_filtered_stream_ids) # Fallback not required: Save only the filtered properties JSON temp_properties_path = utils.create_temp_file(dir=self.get_temp_dir(), @@ -318,21 +348,6 @@ def get_connector_python_bin(self, connector_type): """ return os.path.join(self.venv_dir, connector_type, 'bin', 'python') - @classmethod - def get_connector_files(cls, connector_dir): - """ - Get connector file paths - """ - return { - 'config': os.path.join(connector_dir, 'config.json'), - 'inheritable_config': os.path.join(connector_dir, 'inheritable_config.json'), - 'properties': os.path.join(connector_dir, 'properties.json'), - 'state': os.path.join(connector_dir, 'state.json'), - 'transformation': os.path.join(connector_dir, 'transformation.json'), - 'selection': os.path.join(connector_dir, 'selection.json'), - 'pidfile': os.path.join(connector_dir, 'pipelinewise.pid') - } - def get_targets(self): """ Get every target @@ -353,14 +368,14 @@ def get_target(self, target_id: str) -> Dict: self.logger.debug('Getting %s target', target_id) targets = self.get_targets() - target = next((item for item in targets if item['id'] == target_id), False) + target = next((item for item in targets if item['id'] == target_id), None) if not target: raise Exception(f'Cannot find {target_id} target') target_dir = self.get_target_dir(target_id) if os.path.isdir(target_dir): - target['files'] = self.get_connector_files(target_dir) + target['files'] = Config.get_connector_files(target_dir) else: raise Exception(f'Cannot find target at {target_dir}') @@ -385,21 +400,21 @@ def get_taps(self, target_id): return taps - def get_tap(self, target_id, tap_id): + def get_tap(self, target_id: str, tap_id: str) -> Dict: """ Get tap by id from a specific target """ self.logger.debug('Getting %s tap from target %s', 
tap_id, target_id) taps = self.get_taps(target_id) - tap = next((item for item in taps if item['id'] == tap_id), False) + tap = next((item for item in taps if item['id'] == tap_id), None) if not tap: raise Exception(f'Cannot find {tap_id} tap in {target_id} target') tap_dir = self.get_tap_dir(target_id, tap_id) if os.path.isdir(tap_dir): - tap['files'] = self.get_connector_files(tap_dir) + tap['files'] = Config.get_connector_files(tap_dir) else: raise Exception(f'Cannot find tap at {tap_dir}') @@ -424,7 +439,7 @@ def merge_schemas(self, old_schema, new_schema): for new_stream_idx, new_stream in enumerate(new_streams): new_tap_stream_id = new_stream['tap_stream_id'] - old_stream = next((item for item in old_streams if item['tap_stream_id'] == new_tap_stream_id), False) + old_stream = next((item for item in old_streams if item['tap_stream_id'] == new_tap_stream_id), None) # Is this a new stream? if not old_stream: @@ -562,7 +577,7 @@ def make_default_selection(self, schema, selection_file): streams = schema['streams'] for stream_idx, stream in enumerate(streams): tap_stream_id = stream.get('tap_stream_id') - tap_stream_sel = False + tap_stream_sel = None for sel in selection: if 'tap_stream_id' in sel and tap_stream_id.lower() == sel['tap_stream_id'].lower(): tap_stream_sel = sel @@ -739,7 +754,7 @@ def detect_tap_status(self, target_id, tap_id): self.logger.debug('Detecting %s tap status in %s target', tap_id, target_id) tap_dir = self.get_tap_dir(target_id, tap_id) log_dir = self.get_tap_log_dir(target_id, tap_id) - connector_files = self.get_connector_files(tap_dir) + connector_files = Config.get_connector_files(tap_dir) status = { 'currentStatus': 'unknown', 'lastStatus': 'unknown', @@ -973,13 +988,13 @@ def run_tap(self): tap_properties_singer, singer_stream_ids ) = self.create_filtered_tap_properties( - target_type, - tap_type, + ConnectorType(target_type), + ConnectorType(tap_type), tap_properties, - tap_state, { + tap_state, + { 'selected': True, - 'target_type': ['target-snowflake', 'target-redshift', 'target-postgres', 'target-bigquery'], - 'tap_type': ['tap-mysql', 'tap-postgres', 'tap-s3-csv', 'tap-mongodb'], + 'tap_target_pairs': FASTSYNC_PAIRS, 'initial_sync_required': True }, create_fallback=True) @@ -1346,13 +1361,13 @@ def _is_initial_sync_required(self, replication_method: str, stream_bookmark: Di :return: Boolean, True if needs initial sync, False otherwise """ return replication_method == self.FULL_TABLE \ - or (replication_method == self.INCREMENTAL and - 'replication_key_value' not in stream_bookmark and - 'modified_since' not in stream_bookmark) \ - or (replication_method == self.LOG_BASED and - 'lsn' not in stream_bookmark and - 'log_pos' not in stream_bookmark and - 'token' not in stream_bookmark) + or (replication_method == self.INCREMENTAL and + 'replication_key_value' not in stream_bookmark and + 'modified_since' not in stream_bookmark) \ + or (replication_method == self.LOG_BASED and + 'lsn' not in stream_bookmark and + 'log_pos' not in stream_bookmark and + 'token' not in stream_bookmark) # pylint: disable=unused-argument def _exit_gracefully(self, sig, frame, exit_code=1): diff --git a/pipelinewise/fastsync/README.md b/pipelinewise/fastsync/README.md index 7c7d650d9..28c44eeeb 100644 --- a/pipelinewise/fastsync/README.md +++ b/pipelinewise/fastsync/README.md @@ -13,17 +13,10 @@ components and uses it automatically whenever it’s possible. ## Supported tap-target routes -* MySQL to Snowflake. 
-* MySQL to Redshift
-* MySQL to Postgres
-* Postgres to Snowflake
-* Postgres to Redshift
-* Postgres to Postgres
-
-* S3 CSV to Snowflake
-* S3 CSV to Redshift
-* S3 CSV to Postgres
-
-* MongoDB to Snowflake
-* MongoDB to Postgres
+| Source | Destination |
+|---|---|
+| MySQL/MariaDB | * BigQuery <br/> * Snowflake <br/> * Postgres <br/> * Redshift |
+| Postgres | * BigQuery <br/> * Snowflake <br/> * Postgres <br/> * Redshift |
+| S3 CSV | * BigQuery <br/> * Snowflake <br/> * Postgres <br/> * Redshift |
+| MongoDB | * BigQuery <br/> * Snowflake <br/> * Postgres
| diff --git a/tests/units/cli/test_cli.py b/tests/units/cli/test_cli.py index e426edf9c..db02a4994 100644 --- a/tests/units/cli/test_cli.py +++ b/tests/units/cli/test_cli.py @@ -5,12 +5,15 @@ import signal import psutil import pidfile -from pathlib import Path - import pytest + +from pathlib import Path from unittest.mock import patch + from tests.units.cli.cli_args import CliArgs from pipelinewise import cli +from pipelinewise.cli.constants import ConnectorType +from pipelinewise.cli.config import Config from pipelinewise.cli.pipelinewise import PipelineWise RESOURCES_DIR = '{}/resources'.format(os.path.dirname(__file__)) @@ -21,6 +24,9 @@ PROFILING_DIR = './profiling' +# Can't inherit from unittest.TestCase because it breaks pytest fixture +# https://github.com/pytest-dev/pytest/issues/2504#issuecomment-308828149 + # pylint: disable=no-self-use,too-many-public-methods,attribute-defined-outside-init,fixme class TestCli: """ @@ -64,22 +70,6 @@ def test_connector_bin(self): self.pipelinewise.get_connector_bin('dummy-type') == \ '{}/dummy-type/bin/dummy-type'.format(VIRTUALENVS_DIR) - def test_connector_files(self): - """Every singer connector must have a list of JSON files at certain locations""" - # TODO: get_connector_files is duplicated in config.py and pipelinewise.py - # Refactor to use only one - assert \ - self.pipelinewise.get_connector_files('/var/singer-connector') == \ - { - 'config': '/var/singer-connector/config.json', - 'inheritable_config': '/var/singer-connector/inheritable_config.json', - 'properties': '/var/singer-connector/properties.json', - 'state': '/var/singer-connector/state.json', - 'transformation': '/var/singer-connector/transformation.json', - 'selection': '/var/singer-connector/selection.json', - 'pidfile': '/var/singer-connector/pipelinewise.pid' - } - def test_not_existing_config_dir(self): """Test with not existing config dir""" # Create a new pipelinewise object pointing to a not existing config directory @@ -103,8 +93,8 @@ def test_get_target(self): exp_target_two = next((item for item in targets if item['id'] == 'target_two'), False) # Append the connector file paths to the expected targets - exp_target_one['files'] = self.pipelinewise.get_connector_files('{}/target_one'.format(CONFIG_DIR)) - exp_target_two['files'] = self.pipelinewise.get_connector_files('{}/target_two'.format(CONFIG_DIR)) + exp_target_one['files'] = Config.get_connector_files('{}/target_one'.format(CONFIG_DIR)) + exp_target_two['files'] = Config.get_connector_files('{}/target_two'.format(CONFIG_DIR)) # Getting target by ID should match to original JSON and should contains the connector files list assert self.pipelinewise.get_target('target_one') == exp_target_one @@ -140,7 +130,7 @@ def test_get_tap(self): # Append the tap status, files and target keys to the tap exp_tap_one = target_one['taps'][0] exp_tap_one['status'] = self.pipelinewise.detect_tap_status('target_one', exp_tap_one['id']) - exp_tap_one['files'] = self.pipelinewise.get_connector_files('{}/target_one/tap_one'.format(CONFIG_DIR)) + exp_tap_one['files'] = Config.get_connector_files('{}/target_one/tap_one'.format(CONFIG_DIR)) exp_tap_one['target'] = self.pipelinewise.get_target('target_one') # Getting tap by ID should match to original JSON and should contain status, connector files and target props @@ -175,16 +165,18 @@ def test_create_filtered_tap_props(self): tap_properties_singer, singer_stream_ids ) = self.pipelinewise.create_filtered_tap_properties( - target_type='target-snowflake', - tap_type='tap-mysql', + 
target_type=ConnectorType('target-snowflake'), + tap_type=ConnectorType('tap-mysql'), tap_properties='{}/resources/sample_json_config/target_one/tap_one/properties.json'.format( os.path.dirname(__file__)), tap_state='{}/resources/sample_json_config/target_one/tap_one/state.json'.format( os.path.dirname(__file__)), filters={ 'selected': True, - 'target_type': ['target-snowflake'], - 'tap_type': ['tap-mysql', 'tap-postgres'], + 'tap_target_pairs': { + ConnectorType.TAP_MYSQL: {ConnectorType.TARGET_SNOWFLAKE}, + ConnectorType.TAP_POSTGRES: {ConnectorType.TARGET_SNOWFLAKE} + }, 'initial_sync_required': True }, create_fallback=True) @@ -201,6 +193,42 @@ def test_create_filtered_tap_props(self): assert fastsync_stream_ids == ['db_test_mysql-table_one', 'db_test_mysql-table_two'] assert singer_stream_ids == ['db_test_mysql-table_one', 'db_test_mysql-table_two'] + def test_create_filtered_tap_props_no_fastsync(self): + """Test creating only singer specific properties file""" + ( + tap_properties_fastsync, + fastsync_stream_ids, + tap_properties_singer, + singer_stream_ids + ) = self.pipelinewise.create_filtered_tap_properties( + target_type=ConnectorType('target-snowflake'), + tap_type=ConnectorType('tap-mysql'), + tap_properties='{}/resources/sample_json_config/target_one/tap_one/properties.json'.format( + os.path.dirname(__file__)), + tap_state='{}/resources/sample_json_config/target_one/tap_one/state.json'.format( + os.path.dirname(__file__)), + filters={ + 'selected': True, + 'tap_target_pairs': { + ConnectorType.TAP_MYSQL: {ConnectorType.TARGET_REDSHIFT}, + ConnectorType.TAP_POSTGRES: {ConnectorType.TARGET_SNOWFLAKE} + }, + 'initial_sync_required': True + }, + create_fallback=True) + + # fastsync and singer properties should be created + assert os.path.isfile(tap_properties_fastsync) + assert os.path.isfile(tap_properties_singer) + + # Delete generated properties file + os.remove(tap_properties_fastsync) + os.remove(tap_properties_singer) + + # only singer properties should be created + assert fastsync_stream_ids == [] + assert singer_stream_ids == ['db_test_mysql-table_one', 'db_test_mysql-table_two'] + def test_merge_empty_catalog(self): """Merging two empty singer schemas should be another empty""" # TODO: Check if pipelinewise.merge_schemas is required at all or not @@ -550,7 +578,7 @@ def test_post_import_checks(self): assert len(pipelinewise._run_post_import_tap_checks(tap_with_trans, tap_with_no_pk_incremental, 'snowflake')) == 2 - # mock successfull transformation validation command + # mock successful transformation validation command run_command_mock.return_value = (0, None, None) assert len(pipelinewise._run_post_import_tap_checks(tap_with_trans, tap_with_no_pk_not_selected, diff --git a/tests/units/cli/test_config.py b/tests/units/cli/test_config.py index 70b786c66..54a3ec98a 100644 --- a/tests/units/cli/test_config.py +++ b/tests/units/cli/test_config.py @@ -1,12 +1,13 @@ import os import shutil +import pytest from pipelinewise import cli -import pytest +from pipelinewise.cli.config import Config PIPELINEWISE_TEST_HOME = '/tmp/.pipelinewise' - +# Todo: Inherit from unittest.TestCase # pylint: disable=no-self-use,fixme class TestConfig: """ @@ -15,13 +16,26 @@ class TestConfig: def test_constructor(self): """Test Config construction functions""" - config = cli.config.Config(PIPELINEWISE_TEST_HOME) + config = Config(PIPELINEWISE_TEST_HOME) # config dir and path should be generated automatically assert config.config_dir == PIPELINEWISE_TEST_HOME assert config.config_path == 
'{}/config.json'.format(PIPELINEWISE_TEST_HOME) assert config.targets == {} + def test_connector_files(self): + """Every singer connector must have a list of JSON files at certain locations""" + assert Config.get_connector_files('/var/singer-connector') == \ + { + 'config': '/var/singer-connector/config.json', + 'inheritable_config': '/var/singer-connector/inheritable_config.json', + 'properties': '/var/singer-connector/properties.json', + 'state': '/var/singer-connector/state.json', + 'transformation': '/var/singer-connector/transformation.json', + 'selection': '/var/singer-connector/selection.json', + 'pidfile': '/var/singer-connector/pipelinewise.pid' + } + def test_from_yamls(self): """Test creating Config object using YAML configuration directory as the input""" @@ -30,7 +44,7 @@ def test_from_yamls(self): vault_secret = '{}/resources/vault-secret.txt'.format(os.path.dirname(__file__)) # Parse YAML files and create the config object - config = cli.config.Config.from_yamls(PIPELINEWISE_TEST_HOME, yaml_config_dir, vault_secret) + config = Config.from_yamls(PIPELINEWISE_TEST_HOME, yaml_config_dir, vault_secret) # config dir and path should be generated automatically assert config.config_dir == PIPELINEWISE_TEST_HOME @@ -73,7 +87,8 @@ def test_from_yamls(self): 'properties': '{}/test_snowflake_target/properties.json'.format(PIPELINEWISE_TEST_HOME), 'selection': '{}/test_snowflake_target/selection.json'.format(PIPELINEWISE_TEST_HOME), 'state': '{}/test_snowflake_target/state.json'.format(PIPELINEWISE_TEST_HOME), - 'transformation': '{}/test_snowflake_target/transformation.json'.format(PIPELINEWISE_TEST_HOME) + 'transformation': '{}/test_snowflake_target/transformation.json'.format(PIPELINEWISE_TEST_HOME), + 'pidfile': '{}/test_snowflake_target/pipelinewise.pid'.format(PIPELINEWISE_TEST_HOME), }, 'taps': [{ 'id': 'mysql_sample', @@ -106,7 +121,9 @@ def test_from_yamls(self): 'state': '{}/test_snowflake_target/mysql_sample/state.json'.format(PIPELINEWISE_TEST_HOME), 'transformation': - '{}/test_snowflake_target/mysql_sample/transformation.json'.format(PIPELINEWISE_TEST_HOME) + '{}/test_snowflake_target/mysql_sample/transformation.json'.format(PIPELINEWISE_TEST_HOME), + 'pidfile': '{}/test_snowflake_target/mysql_sample/pipelinewise.pid'.format( + PIPELINEWISE_TEST_HOME) }, 'schemas': [{ 'source_schema': 'my_db', @@ -133,7 +150,7 @@ def test_from_invalid_mongodb_yamls(self): vault_secret = '{}/resources/vault-secret.txt'.format(os.path.dirname(__file__)) print(yaml_config_dir) with pytest.raises(SystemExit) as pytest_wrapped_e: - cli.config.Config.from_yamls(PIPELINEWISE_TEST_HOME, yaml_config_dir, vault_secret) + Config.from_yamls(PIPELINEWISE_TEST_HOME, yaml_config_dir, vault_secret) assert pytest_wrapped_e.type == SystemExit assert pytest_wrapped_e.value.code == 1 @@ -146,14 +163,14 @@ def test_from_invalid_yamls(self): # Initialising Config object with a not existing directory should raise an exception with pytest.raises(Exception): - cli.config.Config.from_yamls(PIPELINEWISE_TEST_HOME, 'not-existing-yaml-config-directory') + Config.from_yamls(PIPELINEWISE_TEST_HOME, 'not-existing-yaml-config-directory') # Initialising config object with a tap that's referencing an unknown target should exit yaml_config_dir = '{}/resources/test_invalid_yaml_config'.format(os.path.dirname(__file__)) vault_secret = '{}/resources/vault-secret.txt'.format(os.path.dirname(__file__)) with pytest.raises(SystemExit) as pytest_wrapped_e: - cli.config.Config.from_yamls(PIPELINEWISE_TEST_HOME, yaml_config_dir, 
vault_secret) + Config.from_yamls(PIPELINEWISE_TEST_HOME, yaml_config_dir, vault_secret) assert pytest_wrapped_e.type == SystemExit assert pytest_wrapped_e.value.code == 1 @@ -168,20 +185,20 @@ def test_from_invalid_yamls_fails(self): # Initialising Config object with a not existing directory should raise an exception with pytest.raises(Exception): - cli.config.Config.from_yamls(PIPELINEWISE_TEST_HOME, 'not-existing-yaml-config-directory') + Config.from_yamls(PIPELINEWISE_TEST_HOME, 'not-existing-yaml-config-directory') # Initialising config object with a tap that's referencing an unknown target should exit yaml_config_dir = f'{os.path.dirname(__file__)}/resources/test_invalid_yaml_config_with_duplicate_targets' vault_secret = f'{os.path.dirname(__file__)}/resources/vault-secret.txt' with pytest.raises(SystemExit) as pytest_wrapped_e: - cli.config.Config.from_yamls(PIPELINEWISE_TEST_HOME, yaml_config_dir, vault_secret) + Config.from_yamls(PIPELINEWISE_TEST_HOME, yaml_config_dir, vault_secret) assert pytest_wrapped_e.type == SystemExit assert pytest_wrapped_e.value.code == 1 def test_getters(self): """Test Config getter functions""" - config = cli.config.Config(PIPELINEWISE_TEST_HOME) + config = Config(PIPELINEWISE_TEST_HOME) # Target and tap directory should be g assert config.get_temp_dir() == '{}/tmp'.format(PIPELINEWISE_TEST_HOME) @@ -189,8 +206,6 @@ def test_getters(self): assert config.get_tap_dir('test-target-id', 'test-tap-id') == '{}/test-target-id/test-tap-id'.format(PIPELINEWISE_TEST_HOME) - # TODO: get_connector_files is duplicated in config.py and pipelinewise.py - # Refactor to use only one assert \ config.get_connector_files('/var/singer-connector') == \ { @@ -199,7 +214,8 @@ def test_getters(self): 'properties': '/var/singer-connector/properties.json', 'state': '/var/singer-connector/state.json', 'transformation': '/var/singer-connector/transformation.json', - 'selection': '/var/singer-connector/selection.json' + 'selection': '/var/singer-connector/selection.json', + 'pidfile': '/var/singer-connector/pipelinewise.pid' } def test_save_config(self): @@ -210,7 +226,7 @@ def test_save_config(self): vault_secret = '{}/resources/vault-secret.txt'.format(os.path.dirname(__file__)) json_config_dir = './pipelinewise-test-config' - config = cli.config.Config.from_yamls(json_config_dir, yaml_config_dir, vault_secret) + config = Config.from_yamls(json_config_dir, yaml_config_dir, vault_secret) # Save the config as singer compatible JSON files config.save() From dc2363d370d219e4114b64a8d67d675c6029642d Mon Sep 17 00:00:00 2001 From: Samira El Aabidi <54845154+Samira-El@users.noreply.github.com> Date: Mon, 30 Aug 2021 13:11:28 +0300 Subject: [PATCH 16/78] AP-1010 Fix Decimal not JSON serializable in MongoDB FastSync (#786) --- .circleci/config.yml | 18 +- .github/workflows/connectors.yml | 30 +++ .github/workflows/dockerhub.yml | 1 - .github/workflows/linter.yml | 5 +- ...linewise_unit_tests.yml => unit_tests.yml} | 6 +- dev-project/entrypoint.sh | 31 ++- .../tap_mongodb_to_pg.yaml | 3 + pipelinewise/fastsync/commons/errors.py | 3 + pipelinewise/fastsync/commons/tap_mongodb.py | 188 +++++++++++------- pylintrc | 1 - setup.py | 4 +- tests/db/mongodb_data/all_datatypes.bson.gz | Bin 0 -> 10797 bytes tests/db/tap_mongodb.sh | 8 + .../tap_mongodb_to_bq.yml.template | 3 + .../tap_mongodb_to_pg.yml.template | 3 + .../tap_mongodb_to_sf.yml.template | 3 + tests/end_to_end/test_target_bigquery.py | 5 + tests/end_to_end/test_target_postgres.py | 5 + tests/end_to_end/test_target_snowflake.py | 5 + 
.../commons/test_fastsync_tap_mongodb.py | 72 ++++++- 20 files changed, 289 insertions(+), 105 deletions(-) create mode 100644 .github/workflows/connectors.yml rename .github/workflows/{pipelinewise_unit_tests.yml => unit_tests.yml} (88%) create mode 100644 tests/db/mongodb_data/all_datatypes.bson.gz diff --git a/.circleci/config.yml b/.circleci/config.yml index 88a3d1a8f..db5de8242 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -97,10 +97,17 @@ jobs: if [[ $rc -eq 1 ]] then + sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 4B7C549A058F8B6B + echo "deb [ arch=amd64 ] https://repo.mongodb.org/apt/ubuntu bionic/mongodb-org/4.2 multiverse" | sudo tee /etc/apt/sources.list.d/mongodb.list sudo apt-get update - sudo apt install mariadb-client postgresql-client mongo-tools mbuffer gettext-base - wget https://repo.mongodb.org/apt/ubuntu/dists/bionic/mongodb-org/4.2/multiverse/binary-amd64/mongodb-org-shell_4.2.7_amd64.deb - sudo dpkg -i ./mongodb-org-shell_4.2.7_amd64.deb && rm mongodb-org-shell_4.2.7_amd64.deb + sudo apt install --no-install-recommends \ + mariadb-client \ + postgresql-client \ + mongo-tools \ + mbuffer \ + gettext-base \ + mongodb-org-shell=4.2.7 + ./dev-project/mongo/init_rs.sh ./tests/db/tap_mysql_db.sh @@ -108,11 +115,10 @@ jobs: ./tests/db/tap_mongodb.sh ./tests/db/target_postgres.sh - ./install.sh --acceptlicenses --connectors=all - + ./install.sh --acceptlicenses --connectors=target-snowflake,target-postgres,target-bigquery,tap-mysql,tap-postgres,tap-mongodb,transform-field,tap-s3-csv . .virtualenvs/pipelinewise/bin/activate export PIPELINEWISE_HOME=$PWD - pytest tests/end_to_end -v + pytest tests/end_to_end -vx fi no_output_timeout: 30m diff --git a/.github/workflows/connectors.yml b/.github/workflows/connectors.yml new file mode 100644 index 000000000..e7838201a --- /dev/null +++ b/.github/workflows/connectors.yml @@ -0,0 +1,30 @@ +# Workflow to check if all singer connectors are installable +name: Singer connectors + +on: + push: + branches: [master] + pull_request: + branches: [master] + + workflow_dispatch: + +jobs: + check: + runs-on: ubuntu-20.04 + strategy: + matrix: + python-version: [3.7] + + steps: + - name: Checking out repo + uses: actions/checkout@v2 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + + - name: Check all connectors are installable + run: | + ./install.sh --acceptlicenses --connectors=all diff --git a/.github/workflows/dockerhub.yml b/.github/workflows/dockerhub.yml index c0dcfe989..ae80a4a20 100644 --- a/.github/workflows/dockerhub.yml +++ b/.github/workflows/dockerhub.yml @@ -6,7 +6,6 @@ on: - published jobs: - build: runs-on: ubuntu-latest diff --git a/.github/workflows/linter.yml b/.github/workflows/linter.yml index c179e29c8..568ad7e1b 100644 --- a/.github/workflows/linter.yml +++ b/.github/workflows/linter.yml @@ -9,8 +9,7 @@ on: workflow_dispatch: jobs: - build: - + check: runs-on: ubuntu-20.04 strategy: matrix: @@ -28,7 +27,7 @@ jobs: continue-on-error: true run: ./scripts/ci_check_no_file_changes.sh python - - name: Set up Python ${{ matrix.container[1] }} + - name: Set up Python ${{ matrix.python-version }} if: steps.check.outcome == 'failure' uses: actions/setup-python@v2 with: diff --git a/.github/workflows/pipelinewise_unit_tests.yml b/.github/workflows/unit_tests.yml similarity index 88% rename from .github/workflows/pipelinewise_unit_tests.yml rename to .github/workflows/unit_tests.yml index 
6eb33100b..ed2f890ec 100644 --- a/.github/workflows/pipelinewise_unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -9,7 +9,7 @@ on: workflow_dispatch: jobs: - build: + test: runs-on: ubuntu-20.04 strategy: @@ -28,7 +28,7 @@ jobs: continue-on-error: true run: ./scripts/ci_check_no_file_changes.sh python - - name: Set up Python ${{ matrix.container[1] }} + - name: Set up Python ${{ matrix.python-version }} if: steps.check.outcome == 'failure' uses: actions/setup-python@v2 with: @@ -44,4 +44,4 @@ jobs: if: steps.check.outcome == 'failure' run: | export PIPELINEWISE_HOME=$PWD - pytest --cov=pipelinewise --cov-fail-under=69 -v tests/units + pytest --cov=pipelinewise --cov-fail-under=72 -v tests/units diff --git a/dev-project/entrypoint.sh b/dev-project/entrypoint.sh index a90b2a450..880b8350d 100755 --- a/dev-project/entrypoint.sh +++ b/dev-project/entrypoint.sh @@ -1,14 +1,24 @@ #!/usr/bin/env bash +set -e + +# Add Mongodb ppa +apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 4B7C549A058F8B6B +echo "deb [ arch=amd64 ] https://repo.mongodb.org/apt/ubuntu bionic/mongodb-org/4.2 multiverse" | tee /etc/apt/sources.list.d/mongodb.list + # Install OS dependencies apt-get update -apt-get install -y mariadb-client postgresql-client alien libaio1 mongo-tools mbuffer gettext-base +apt-get install -y --no-install-recommends \ + alien \ + gettext-base \ + libaio1 \ + mariadb-client \ + mbuffer \ + mongo-tools \ + mongodb-org-shell=4.2.7 \ + postgresql-client -wget https://repo.mongodb.org/apt/ubuntu/dists/bionic/mongodb-org/4.2/multiverse/binary-amd64/mongodb-org-shell_4.2.7_amd64.deb -dpkg -i ./mongodb-org-shell_4.2.7_amd64.deb && rm mongodb-org-shell_4.2.7_amd64.deb - -# Change to dev-project folder -cd dev-project +rm -rf /var/lib/apt/lists/* \ # Install Oracle Instant Client required for tap-oracle # ORA_INSTACLIENT_URL=https://download.oracle.com/otn_software/linux/instantclient/193000/oracle-instantclient19.3-basiclite-19.3.0.0.0-1.x86_64.rpm @@ -17,16 +27,21 @@ cd dev-project # alien -i oracle-instantclient.rpm --scripts # rm -f oracle-instantclient.rpm + +# Change to dev-project folder +cd dev-project + +# Install PipelineWise in the container + # Build test databasese ../tests/db/tap_mysql_db.sh ../tests/db/tap_postgres_db.sh ./mongo/init_rs.sh ../tests/db/tap_mongodb.sh - ../tests/db/target_postgres.sh -# Install PipelineWise in the container +# Install PipelineWise and connectors in the container ../install.sh --acceptlicenses --nousage --connectors=target-snowflake,target-postgres,target-bigquery,tap-mysql,tap-postgres,tap-mongodb,transform-field,tap-s3-csv if [[ $? 
!= 0 ]]; then echo diff --git a/dev-project/pipelinewise-config/tap_mongodb_to_pg.yaml b/dev-project/pipelinewise-config/tap_mongodb_to_pg.yaml index 8aedf7761..8825b2d4a 100644 --- a/dev-project/pipelinewise-config/tap_mongodb_to_pg.yaml +++ b/dev-project/pipelinewise-config/tap_mongodb_to_pg.yaml @@ -42,3 +42,6 @@ schemas: - table_name: "my_collection" replication_method: "LOG_BASED" + + - table_name: "all_datatypes" + replication_method: "LOG_BASED" diff --git a/pipelinewise/fastsync/commons/errors.py b/pipelinewise/fastsync/commons/errors.py index 405ff4a6e..f774367cd 100644 --- a/pipelinewise/fastsync/commons/errors.py +++ b/pipelinewise/fastsync/commons/errors.py @@ -6,3 +6,6 @@ class TableNotFoundError(Exception): class MongoDBInvalidDatetimeError(Exception): """Raised when a bson datetime is invalid and cannot be serialized""" + +class UnsupportedKeyTypeException(Exception): + """Raised if key type is unsupported""" diff --git a/pipelinewise/fastsync/commons/tap_mongodb.py b/pipelinewise/fastsync/commons/tap_mongodb.py index 5b908b144..f8ec7304a 100644 --- a/pipelinewise/fastsync/commons/tap_mongodb.py +++ b/pipelinewise/fastsync/commons/tap_mongodb.py @@ -2,7 +2,7 @@ import csv import datetime import gzip -import json +import ujson import logging import os import ssl @@ -12,87 +12,126 @@ import pytz import tzlocal -from typing import Tuple, Optional, Dict, Callable +from typing import Tuple, Optional, Dict, Callable, Any from pymongo import MongoClient from pymongo.database import Database from singer.utils import strftime as singer_strftime from . import utils, split_gzip -from .errors import ExportError, TableNotFoundError, MongoDBInvalidDatetimeError +from .errors import ExportError, TableNotFoundError, MongoDBInvalidDatetimeError, UnsupportedKeyTypeException LOGGER = logging.getLogger(__name__) DEFAULT_WRITE_BATCH_ROWS = 50000 -class MongoDBJsonEncoder(json.JSONEncoder): +def serialize_document(document: Dict) -> Dict: """ - Custom JSON encoder to be used to serialize data from MongoDB + serialize mongodb Document into a json object + + Args: + document: MongoDB document + + Returns: Dict """ - @staticmethod - def _serialize_datetime(val): - """ - Serialize Bson and python datetime types - Args: - val: datetime value + return {key: transform_value(val, [key]) for key, val in document.items() + if not isinstance(val, (bson.min_key.MinKey, bson.max_key.MaxKey))} - Returns: serialized datetime value - """ - if isinstance(val, bson.datetime.datetime): - timezone = tzlocal.get_localzone() - try: - local_datetime = timezone.localize(val) - utc_datetime = local_datetime.astimezone(pytz.UTC) - except Exception as exc: - if str(exc) == 'year is out of range' and val.year == 0: - # NB: Since datetimes are persisted as strings, it doesn't - # make sense to blow up on invalid Python datetimes (e.g., - # year=0). In this case we're formatting it as a string and - # passing it along down the pipeline. 
- return '{:04d}-{:02d}-{:02d}T{:02d}:{:02d}:{:02d}.{:06d}Z'.format(val.year, - val.month, - val.day, - val.hour, - val.minute, - val.second, - val.microsecond) - raise MongoDBInvalidDatetimeError('Found invalid datetime {}'.format(val)) from exc - - return singer_strftime(utc_datetime) - - if isinstance(val, datetime.datetime): +def class_to_string(key_value: Any, key_type: str) -> str: + """ + Converts specific types to string equivalent + The supported types are: datetime, bson Timestamp, bytes, int, Int64, float, ObjectId, str and UUID + Args: + key_value: The value to convert to string + key_type: the value type + + Returns: string equivalent of key value + Raises: UnsupportedKeyTypeException if key_type is not supported + """ + if key_type == 'datetime': + if key_value.tzinfo is None: timezone = tzlocal.get_localzone() - local_datetime = timezone.localize(val) + local_datetime = timezone.localize(key_value) utc_datetime = local_datetime.astimezone(pytz.UTC) - return singer_strftime(utc_datetime) - return None + else: + utc_datetime = key_value.astimezone(pytz.UTC) - def default(self, o): # false positive complaint -> pylint: disable=E0202 - """ - Custom function to serialize several sort of BSON and Python types - Args: - obj: Object to serialize + return singer_strftime(utc_datetime) - Returns: Serialized value - """ - encoding_map = { - bson.objectid.ObjectId: str, - uuid.UUID: str, - bson.int64.Int64: str, - bson.timestamp.Timestamp: lambda value: singer_strftime(value.as_datetime()), - bytes: lambda value: base64.b64encode(value).decode('utf-8'), - bson.decimal128.Decimal128: lambda val: val.to_decimal(), - bson.regex.Regex: lambda val: dict(pattern=val.pattern, flags=val.flags), - bson.code.Code: lambda val: dict(value=str(val), scope=str(val.scope)) if val.scope else str(val), - bson.dbref.DBRef: lambda val: dict(id=str(val.id), collection=val.collection, database=val.database), - datetime.datetime: self._serialize_datetime, - bson.datetime.datetime: self._serialize_datetime - } + if key_type == 'Timestamp': + return '{}.{}'.format(key_value.time, key_value.inc) + + if key_type == 'bytes': + return base64.b64encode(key_value).decode('utf-8') + + if key_type in ['int', 'Int64', 'float', 'ObjectId', 'str', 'UUID']: + return str(key_value) + + raise UnsupportedKeyTypeException('{} is not a supported key type'.format(key_type)) + + +def safe_transform_datetime(value: datetime.datetime, path) -> str: + """ + Safely transform datetime from local tz to UTC if applicable + Args: + value: datetime value to transform + path: + + Returns: utc datetime as string + + """ + timezone = tzlocal.get_localzone() + try: + local_datetime = timezone.localize(value) + utc_datetime = local_datetime.astimezone(pytz.UTC) + except Exception as ex: + if str(ex) == 'year is out of range' and value.year == 0: + # NB: Since datetimes are persisted as strings, it doesn't + # make sense to blow up on invalid Python datetimes (e.g., + # year=0). In this case we're formatting it as a string and + # passing it along down the pipeline. 
+ return '{:04d}-{:02d}-{:02d}T{:02d}:{:02d}:{:02d}.{:06d}Z'.format(value.year, + value.month, + value.day, + value.hour, + value.minute, + value.second, + value.microsecond) + raise MongoDBInvalidDatetimeError('Found invalid datetime at [{}]: {}'.format('.'.join(map(str, path)), + value)) from ex + return singer_strftime(utc_datetime) + + +def transform_value(value: Any, path) -> Any: + """ + transform values to json friendly ones + Args: + value: value to transform + path: - if o.__class__ in encoding_map: - return encoding_map[o.__class__](o) + Returns: transformed value - return super().default(o) + """ + conversion = { + list: lambda val, pat: list(map(lambda v: transform_value(v[1], pat + [v[0]]), enumerate(val))), + dict: lambda val, pat: {k: transform_value(v, pat + [k]) for k, v in val.items()}, + uuid.UUID: lambda val, _: class_to_string(val, 'UUID'), + bson.objectid.ObjectId: lambda val, _: class_to_string(val, 'ObjectId'), + bson.datetime.datetime: safe_transform_datetime, + bson.timestamp.Timestamp: lambda val, _: singer_strftime(val.as_datetime()), + bson.int64.Int64: lambda val, _: class_to_string(val, 'Int64'), + bytes: lambda val, _: class_to_string(val, 'bytes'), + datetime.datetime: lambda val, _: class_to_string(val, 'datetime'), + bson.decimal128.Decimal128: lambda val, _: val.to_decimal(), + bson.regex.Regex: lambda val, _: dict(pattern=val.pattern, flags=val.flags), + bson.code.Code: lambda val, _: dict(value=str(val), scope=str(val.scope)) if val.scope else str(val), + bson.dbref.DBRef: lambda val, _: dict(id=str(val.id), collection=val.collection, database=val.database), + } + + if isinstance(value, tuple(conversion.keys())): + return conversion[type(value)](value, path) + + return value class FastSyncTapMongoDB: @@ -160,6 +199,7 @@ def copy_table(self, with -partXYZ postfix in the filename. (Default: False) split_file_chunk_size_mb: File chunk sizes if `split_large_files` enabled. (Default: 1000) split_file_max_chunks: Max number of chunks if `split_large_files` enabled. (Default: 20) + compress: Flag to indicate whether to compress export files """ table_dict = utils.tablename_to_dict(table_name, '.') @@ -192,13 +232,17 @@ def copy_table(self, # bson.decode_file_iter will generate one document at a time from the exported file for document in bson.decode_file_iter(export_file): - rows.append({ - '_ID': str(document['_id']), - 'DOCUMENT': json.dumps(document, cls=MongoDBJsonEncoder, separators=(',', ':')), - utils.SDC_EXTRACTED_AT: extracted_at, - utils.SDC_BATCHED_AT: datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S.%f'), - utils.SDC_DELETED_AT: None - }) + try: + rows.append({ + '_ID': str(document['_id']), + 'DOCUMENT': ujson.dumps(serialize_document(document)), + utils.SDC_EXTRACTED_AT: extracted_at, + utils.SDC_BATCHED_AT: datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S.%f'), + utils.SDC_DELETED_AT: None + }) + except TypeError: + LOGGER.error('TypeError encountered when processing document ID: %s', document['_id']) + raise exported_rows += 1 @@ -239,7 +283,7 @@ def _get_collection_columns() -> Tuple: (utils.SDC_DELETED_AT, 'string'), ) - def fetch_current_log_pos(self)->Dict: + def fetch_current_log_pos(self) -> Dict: """ Find and returns the latest ChangeStream token. LOG_BASED method uses changes streams. 
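For reference, a minimal sketch (not part of this patch) of how the new serialization helpers above are meant to be used, assuming the patched `pipelinewise.fastsync.commons.tap_mongodb` module and its dependencies (`pymongo`/`bson`, `pytz`, `tzlocal`) are importable; the document and its values are made up:

```python
# Illustrative sketch only -- not part of the patch. Assumes the patched
# pipelinewise.fastsync.commons.tap_mongodb module is importable; values are made up.
import datetime
import uuid

import bson

from pipelinewise.fastsync.commons.tap_mongodb import serialize_document

document = {
    '_id': bson.objectid.ObjectId('61379f5eb8a4c1c0a1a1a1a1'),  # -> str
    'amount': bson.decimal128.Decimal128('12.34'),               # -> decimal.Decimal
    'created_at': datetime.datetime(2021, 9, 7, 12, 0, 0),       # -> UTC timestamp string
    'session': uuid.uuid4(),                                     # -> str
    'counters': [bson.int64.Int64(42)],                          # items transformed recursively -> ['42']
}

# serialize_document() applies transform_value() to every key, turning BSON-specific
# types into JSON-friendly ones before the row is written out.
serialized = serialize_document(document)
print(serialized)
```

This is the same conversion the removed `MongoDBJsonEncoder` performed lazily inside `json.dumps`; after this change the values are converted eagerly and the resulting dictionary is written with `ujson.dumps` in `copy_table`.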
@@ -300,7 +344,7 @@ def map_column_types_to_target(self): 'primary_key': ['_ID'] } - def _export_collection(self, export_dir: str, collection_name)->str: + def _export_collection(self, export_dir: str, collection_name) -> str: """ Dump a collection data into a compressed bson file and returns the path Args: @@ -337,7 +381,7 @@ def _export_collection(self, export_dir: str, collection_name)->str: if return_code != 0: raise ExportError(f'Export failed with code {return_code}') - #mongodump creates two files "{collection_name}.metadata.json.gz" & "{collection_name}.bson.gz" + # mongodump creates two files "{collection_name}.metadata.json.gz" & "{collection_name}.bson.gz" # we are only interested in the latter so we delete the former. os.remove(os.path.join(export_dir, self.connection_config['database'], f'{collection_name}.metadata.json.gz')) return os.path.join(export_dir, self.connection_config['database'], f'{collection_name}.bson.gz') diff --git a/pylintrc b/pylintrc index 67e5096d5..a290fcb40 100644 --- a/pylintrc +++ b/pylintrc @@ -162,7 +162,6 @@ enable=import-error, nonzero-method, t-method, setslice-method, - old-division, logging-format-truncated, logging-too-few-args, logging-too-many-args, diff --git a/setup.py b/setup.py index bb0090e14..9d23d4026 100644 --- a/setup.py +++ b/setup.py @@ -35,7 +35,8 @@ 'pymongo>=3.10,<3.13', 'tzlocal>=2.0,<2.2', 'slackclient>=2.7,<2.10', - 'psutil==5.8.0' + 'psutil==5.8.0', + 'ujson==4.1.*' ], extras_require={ 'test': [ @@ -43,7 +44,6 @@ 'pytest-dependency==0.4.0', 'pytest-cov==2.12.1', 'python-dotenv==0.19.0', - 'mock==4.0.3', 'pylint==2.10.2', 'unify==0.5' ] diff --git a/tests/db/mongodb_data/all_datatypes.bson.gz b/tests/db/mongodb_data/all_datatypes.bson.gz new file mode 100644 index 0000000000000000000000000000000000000000..a59b54a3e9ecb52da24debc6d2df43612f1fdb46 GIT binary patch literal 10797 zcmYjXWk8ef+XV?}gDwH-?k)jAKsp`W2#oF+h>`+=bR#HTQlqzt8;v2C^zR4to4ILX>6vASsG1g^6XH;XO-AN<&9S zb?3+Il)96;x;kXlNfq*>j)x{jRB%W`Bu$Fe?o(9JMPtHyzG~)#Aw2ArjG^d?wh+~> z7hy^PmX<;w%AsgW%J&t_FCXkW)(blyX_C!cbr;IrC=@QNgx$-|Tuueoodqk*$Omrw zZwF!~;bgJpo0vXGY;}DQLh#@G=Jsuu;*IFp-$31tGk$CDnY^=J^dwW;{q!~C)+O?l z^~~ST@GaEX;3K&|F{e!3_g+$~dIvA7{4IFxCciygrJ9d}M$*@Jus{nEc#Ac!%%cY- zjyST)qSRK^Mis!t2|jims*P3`91be)jL$yWUC58j?oXQq1YMXTgZDAl3O*Qk?%=K$ z{J)>C?t=mYuJe$+0YS)JufMCw-I$X+MO4BLtu^lbBsHVL;mP15>+S%|8S+u=plIK& z#@epz-LJ3pO!vE-gM;M?x1`&CZ3r)^E{sRyw~=X;B9tM zzu;E*jQQ5Z5xQQ%4`ZksatkqK3jW1;@oIZJ;>uLvr&rL3iuLa!N$dMnQKp`I((B&+ zTP9@K;qLhc!xm%)CwT+Yr;@yJAA@G#OX>CEY;`!mM8!+&=RcOG<`D00NGgwuEUuwUQXLeje>ixhoD6vPvF*9lZq(pOZGw79;!Y+BW)uf z#@*yoQ_+Jnkg~Vin~9$^*rSwizD+Y23)hN{smm14jPLYFWI0iC+pCHwB5{KQsrsXu zUj*6Fc}?76&&k!ylQrk4GdN_wW6}pld#nPV!h6G}P}Z^y?qN^h|Dua-^ez$g+c^kp zdCv_beCm%xVrf7RK{$8KS!wASvsr9eF8Dil&{rSF9su zKInGd)mfvqAggz?qj1#y;Z-ryT{qO&J+Hya$SsVG0m^24=)k4U4<_gB3^sp2IIb3> zx03!v(4y=~AvPWUcMT)w1)It2^?^ikJjBx#ycoI_8iL8zD)VJ|xz{20w^{eY5vZxO zBY1BBa8RI}odR~~=)mrSO;5<#0@HmGP*7_{UJu3V#J29$(vrjjSC?EYutrQ;xC)OG}F3ac4;fKhMQpB_s1=cY9@u3 zfz06G>di>RWCiveOc<`{X9u6@e6tb0uXxvBjY)?rY1GO8oTA&=yd-QF(1Nx(Wc zwj?-QrViqw_Et}8Wn8oaUp4M@{H!7%;=HY_x*95{u&*^%i5Wcm4^|xzi&|rKB;cH(w;hY zEY*@z=CrM@Q9UsmgPEscFX`efsGt5AS`N@5zQXFP!tPIKNMN6#;gFlpoGKv#W%-!R zQ7ONk)g)S6NEv7(dXloPRB#X@dY!g+I-eyt-0yat9PlU84L z-(9zYl<&oj`{~ph?|{~OEtS=UTC|7W!{Xf)5znEXQ$r|Q^iSutyF;M8pRKcvECyl| zXjbPZ`I85KOiS#3(>P0Q5X)hsm&PtF;%6XPDao#7cFJpK+W=AAp`G)rSJNPb#v3?_JLLpw1ZpwH9T9nzoxeSjS<`iZG@wO zXFTYPFKv+tF{p0mAl04*+<^Uw^5RyLPVS$$uM`?c60-Gs`aG0=$l&Eg!^W={8{Eg|?`txp z^w|6pQR~zJ{t8ul2X&K4GdZqj@bhtJzlS#{QmAgKXT%8rtpGc?sh}OY9EmpHzBxJW 
Date: Thu, 9 Sep 2021 16:13:16 +0100
Subject: [PATCH 17/78] [AP-1059] downgrade setuptools to support use_2to3
 (#796)

* downgrade setuptools to support use_2to3

* fixed install method
---
 install.sh                                         | 3 +++
 singer-connectors/tap-adwords/pre_requirements.txt | 3 +++
 2 files changed, 6
insertions(+) create mode 100644 singer-connectors/tap-adwords/pre_requirements.txt diff --git a/install.sh b/install.sh index 36b25d2a4..6c671573a 100755 --- a/install.sh +++ b/install.sh @@ -61,6 +61,9 @@ make_virtualenv() { source $VENV_DIR/$1/bin/activate python3 -m pip install --upgrade pip setuptools wheel + if [ -f "pre_requirements.txt" ]; then + python3 -m pip install --upgrade -r pre_requirements.txt + fi if [ -f "requirements.txt" ]; then python3 -m pip install --upgrade -r requirements.txt fi diff --git a/singer-connectors/tap-adwords/pre_requirements.txt b/singer-connectors/tap-adwords/pre_requirements.txt new file mode 100644 index 000000000..3258b5277 --- /dev/null +++ b/singer-connectors/tap-adwords/pre_requirements.txt @@ -0,0 +1,3 @@ +# setuptools>58.0.0 is not compatible with googleads==17.0.0 +# remove this file whenever tap-adwards upgrades googleads version +setuptools<=57.0.5 \ No newline at end of file From acde3c18f4dd116113f47f3dc50d31fdcf59a1d6 Mon Sep 17 00:00:00 2001 From: Amir Mofakhar Date: Tue, 14 Sep 2021 08:08:40 +0100 Subject: [PATCH 18/78] [AP-1054] fixed pep8 and added github workflow for pep8 check (#793) --- .github/workflows/linter.yml | 7 + pipelinewise/cli/__init__.py | 95 ++- .../cli/alert_handlers/base_alert_handler.py | 1 + pipelinewise/cli/alert_handlers/errors.py | 3 + .../cli/alert_handlers/slack_alert_handler.py | 29 +- .../alert_handlers/victorops_alert_handler.py | 38 +- pipelinewise/cli/alert_sender.py | 45 +- pipelinewise/cli/commands.py | 148 ++-- pipelinewise/cli/config.py | 232 +++-- pipelinewise/cli/constants.py | 1 + pipelinewise/cli/errors.py | 6 +- pipelinewise/cli/pipelinewise.py | 776 +++++++++++------ pipelinewise/cli/tap_properties.py | 69 +- pipelinewise/cli/utils.py | 60 +- pipelinewise/fastsync/commons/errors.py | 3 + pipelinewise/fastsync/commons/split_gzip.py | 49 +- pipelinewise/fastsync/commons/tap_mongodb.py | 210 +++-- pipelinewise/fastsync/commons/tap_mysql.py | 184 ++-- pipelinewise/fastsync/commons/tap_postgres.py | 132 ++- pipelinewise/fastsync/commons/tap_s3_csv.py | 198 +++-- .../fastsync/commons/target_bigquery.py | 139 ++- .../fastsync/commons/target_postgres.py | 141 ++- .../fastsync/commons/target_redshift.py | 195 +++-- .../fastsync/commons/target_snowflake.py | 309 ++++--- .../fastsync/commons/transform_utils.py | 149 ++-- pipelinewise/fastsync/commons/utils.py | 111 ++- pipelinewise/fastsync/mongodb_to_bigquery.py | 48 +- pipelinewise/fastsync/mongodb_to_postgres.py | 55 +- pipelinewise/fastsync/mongodb_to_snowflake.py | 48 +- pipelinewise/fastsync/mysql_to_bigquery.py | 111 +-- pipelinewise/fastsync/mysql_to_postgres.py | 57 +- pipelinewise/fastsync/mysql_to_redshift.py | 61 +- pipelinewise/fastsync/mysql_to_snowflake.py | 71 +- pipelinewise/fastsync/postgres_to_bigquery.py | 118 +-- pipelinewise/fastsync/postgres_to_postgres.py | 66 +- pipelinewise/fastsync/postgres_to_redshift.py | 59 +- .../fastsync/postgres_to_snowflake.py | 65 +- pipelinewise/fastsync/s3_csv_to_bigquery.py | 63 +- pipelinewise/fastsync/s3_csv_to_postgres.py | 69 +- pipelinewise/fastsync/s3_csv_to_redshift.py | 71 +- pipelinewise/fastsync/s3_csv_to_snowflake.py | 70 +- pipelinewise/logger.py | 1 + pipelinewise/utils.py | 4 +- setup.py | 1 + tests/end_to_end/helpers/assertions.py | 143 +++- tests/end_to_end/helpers/db.py | 83 +- tests/end_to_end/helpers/env.py | 446 ++++++---- tests/end_to_end/helpers/tasks.py | 12 +- tests/end_to_end/test_target_bigquery.py | 345 +++++--- tests/end_to_end/test_target_postgres.py | 352 +++++--- 
tests/end_to_end/test_target_redshift.py | 152 +++- tests/end_to_end/test_target_snowflake.py | 427 +++++++--- tests/units/cli/cli_args.py | 27 +- tests/units/cli/test_alert_sender.py | 109 ++- tests/units/cli/test_cli.py | 481 ++++++++--- tests/units/cli/test_cli_utils.py | 5 +- tests/units/cli/test_cli_utils_tap_github.py | 8 +- tests/units/cli/test_commands.py | 802 +++++++++++------- tests/units/cli/test_config.py | 263 +++--- tests/units/fastsync/assertions.py | 177 ++-- .../commons/test_fastsync_tap_mongodb.py | 208 +++-- .../commons/test_fastsync_tap_mysql.py | 40 +- .../commons/test_fastsync_tap_postgres.py | 101 ++- .../commons/test_fastsync_tap_s3_csv.py | 318 ++++--- .../commons/test_fastsync_target_bigquery.py | 223 +++-- .../commons/test_fastsync_target_postgres.py | 233 ++--- .../commons/test_fastsync_target_snowflake.py | 368 ++++---- .../fastsync/commons/test_fastsync_utils.py | 457 +++++----- .../units/fastsync/commons/test_split_gzip.py | 32 +- .../fastsync/commons/test_transform_utils.py | 274 +++--- .../fastsync/test_mongodb_to_bigquery.py | 27 +- .../fastsync/test_mongodb_to_postgres.py | 31 +- .../fastsync/test_mongodb_to_snowflake.py | 31 +- .../units/fastsync/test_mysql_to_bigquery.py | 31 +- .../units/fastsync/test_mysql_to_postgres.py | 35 +- .../units/fastsync/test_mysql_to_redshift.py | 75 +- .../units/fastsync/test_mysql_to_snowflake.py | 35 +- .../fastsync/test_postgres_to_bigquery.py | 27 +- .../fastsync/test_postgres_to_postgres.py | 31 +- .../fastsync/test_postgres_to_redshift.py | 35 +- .../fastsync/test_postgres_to_snowflake.py | 31 +- .../units/fastsync/test_s3_csv_to_bigquery.py | 27 +- .../units/fastsync/test_s3_csv_to_postgres.py | 31 +- .../units/fastsync/test_s3_csv_to_redshift.py | 31 +- .../fastsync/test_s3_csv_to_snowflake.py | 31 +- tests/units/test_logger.py | 9 +- tests/units/test_utils.py | 2 + 87 files changed, 7043 insertions(+), 3931 deletions(-) diff --git a/.github/workflows/linter.yml b/.github/workflows/linter.yml index 568ad7e1b..e287748ca 100644 --- a/.github/workflows/linter.yml +++ b/.github/workflows/linter.yml @@ -46,3 +46,10 @@ jobs: - name: Pylinting if: steps.check.outcome == 'failure' run: pylint pipelinewise tests + + - name: Pep8 + if: steps.check.outcome == 'failure' + run: | + # stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + flake8 . 
--count --max-complexity=15 --max-line-length=120 --statistics diff --git a/pipelinewise/cli/__init__.py b/pipelinewise/cli/__init__.py index fd2af924d..221205993 100644 --- a/pipelinewise/cli/__init__.py +++ b/pipelinewise/cli/__init__.py @@ -22,7 +22,9 @@ CONFIG_DIR = os.path.join(USER_HOME, '.pipelinewise') PROFILING_DIR = os.path.join(CONFIG_DIR, 'profiling') PIPELINEWISE_DEFAULT_HOME = os.path.join(USER_HOME, 'pipelinewise') -PIPELINEWISE_HOME = os.path.abspath(os.environ.setdefault('PIPELINEWISE_HOME', PIPELINEWISE_DEFAULT_HOME)) +PIPELINEWISE_HOME = os.path.abspath( + os.environ.setdefault('PIPELINEWISE_HOME', PIPELINEWISE_DEFAULT_HOME) +) VENV_DIR = os.path.join(PIPELINEWISE_HOME, '.virtualenvs') COMMANDS = [ 'init', @@ -61,7 +63,9 @@ def __init_logger(log_file=None, debug=False): return logger -def __init_profiler(profiler_arg: bool, logger: logging.Logger) -> Tuple[Optional[Profile], Optional[str]]: +def __init_profiler( + profiler_arg: bool, logger: logging.Logger +) -> Tuple[Optional[Profile], Optional[str]]: """ Initialise profiling environment by creating a cprofile.Profiler instance, a folder where pstats can be dumped Args: @@ -82,9 +86,10 @@ def __init_profiler(profiler_arg: bool, logger: logging.Logger) -> Tuple[Optiona logger.debug('Profiler created.') - profiling_dir = os.path.join(PROFILING_DIR, - f'{datetime.utcnow().strftime("%Y%m%d_%H%M%S_%f")}_{generate_random_string(10)}' - ) + profiling_dir = os.path.join( + PROFILING_DIR, + f'{datetime.utcnow().strftime("%Y%m%d_%H%M%S_%f")}_{generate_random_string(10)}', + ) try: os.makedirs(profiling_dir) @@ -103,10 +108,12 @@ def __init_profiler(profiler_arg: bool, logger: logging.Logger) -> Tuple[Optiona return None, None -def __disable_profiler(profiler: Optional[Profile], - profiling_dir: Optional[str], - pstat_filename: Optional[str], - logger: logging.Logger): +def __disable_profiler( + profiler: Optional[Profile], + profiling_dir: Optional[str], + pstat_filename: Optional[str], + logger: logging.Logger, +): """ Disable given profiler and dump pipelinewise stats into a pStat file Args: @@ -145,32 +152,42 @@ def main(): parser.add_argument('--target', type=str, default='*', help='"Name of the target') parser.add_argument('--tap', type=str, default='*', help='Name of the tap') parser.add_argument('--tables', type=str, help='List of tables to sync') - parser.add_argument('--dir', type=str, default='*', help='Path to directory with config') + parser.add_argument( + '--dir', type=str, default='*', help='Path to directory with config' + ) parser.add_argument('--name', type=str, default='*', help='Name of the project') parser.add_argument('--secret', type=str, help='Path to vault password file') parser.add_argument('--string', type=str) - parser.add_argument('--version', - action='version', - help='Displays the installed versions', - version='PipelineWise {} - Command Line Interface'.format(__version__)) + parser.add_argument( + '--version', + action='version', + help='Displays the installed versions', + version='PipelineWise {} - Command Line Interface'.format(__version__), + ) parser.add_argument('--log', type=str, default='*', help='File to log into') - parser.add_argument('--extra_log', - default=False, - required=False, - help='Copy singer and fastsync logging into PipelineWise logger', - action='store_true') - parser.add_argument('--debug', - default=False, - required=False, - help='Forces the debug mode with logging on stdout and log level debug', - action='store_true') - parser.add_argument('--profiler', '-p', - 
default=False, - required=False, - help='Enables code profiling mode using Python builtin profiler cProfile. ' - 'The stats will be dumped into a folder in .pipelinewise/profiling', - action='store_true' - ) + parser.add_argument( + '--extra_log', + default=False, + required=False, + help='Copy singer and fastsync logging into PipelineWise logger', + action='store_true', + ) + parser.add_argument( + '--debug', + default=False, + required=False, + help='Forces the debug mode with logging on stdout and log level debug', + action='store_true', + ) + parser.add_argument( + '--profiler', + '-p', + default=False, + required=False, + help='Enables code profiling mode using Python builtin profiler cProfile. ' + 'The stats will be dumped into a folder in .pipelinewise/profiling', + action='store_true', + ) args = parser.parse_args() @@ -201,7 +218,9 @@ def main(): # import_config : this is for backward compatibility; use 'import' instead from CLI if args.command == 'import' or args.command == 'import_config': if args.dir == '*': - print('You must specify a directory path with config YAML files using the argument --dir') + print( + 'You must specify a directory path with config YAML files using the argument --dir' + ) sys.exit(1) # Every command argument is mapped to a python function with the same name, but 'import' is a @@ -209,12 +228,16 @@ def main(): args.command = 'import_project' if args.command == 'validate' and args.dir == '*': - print('You must specify a directory path with config YAML files using the argument --dir') + print( + 'You must specify a directory path with config YAML files using the argument --dir' + ) sys.exit(1) if args.command == 'encrypt_string': if not args.secret: - print('You must specify a path to a file with vault secret using the argument --secret') + print( + 'You must specify a path to a file with vault secret using the argument --secret' + ) sys.exit(1) if not args.string: print('You must specify a string to encrypt using the argument --string') @@ -229,7 +252,9 @@ def main(): try: getattr(ppw_instance, args.command)() finally: - __disable_profiler(profiler, profiling_dir, f'pipelinewise_{args.command}', logger) + __disable_profiler( + profiler, profiling_dir, f'pipelinewise_{args.command}', logger + ) if __name__ == '__main__': diff --git a/pipelinewise/cli/alert_handlers/base_alert_handler.py b/pipelinewise/cli/alert_handlers/base_alert_handler.py index 91bdf6033..2e54d7a83 100644 --- a/pipelinewise/cli/alert_handlers/base_alert_handler.py +++ b/pipelinewise/cli/alert_handlers/base_alert_handler.py @@ -9,6 +9,7 @@ class BaseAlertHandler(ABC): """ Abstract base class for alert handlers """ + LOG = 'log' INFO = 'info' WARNING = 'warning' diff --git a/pipelinewise/cli/alert_handlers/errors.py b/pipelinewise/cli/alert_handlers/errors.py index fdc1841d9..15c2d7a11 100644 --- a/pipelinewise/cli/alert_handlers/errors.py +++ b/pipelinewise/cli/alert_handlers/errors.py @@ -7,6 +7,7 @@ class NotImplementedAlertHandlerException(Exception): """ Exception to raise when attempted to use a not implemented alert handler class """ + def __init__(self, *args, **kwargs): super().__init__(self, *args, **kwargs) @@ -15,6 +16,7 @@ class NotConfiguredAlertHandlerException(Exception): """ Exception to raise when attempted to use a not configured alert handler """ + def __init__(self, *args, **kwargs): super().__init__(self, *args, **kwargs) @@ -23,5 +25,6 @@ class InvalidAlertHandlerException(Exception): """ Exception to raise when alert handler not configured correctly """ + def 
__init__(self, *args, **kwargs): super().__init__(self, *args, **kwargs) diff --git a/pipelinewise/cli/alert_handlers/slack_alert_handler.py b/pipelinewise/cli/alert_handlers/slack_alert_handler.py index f15be1eec..81c604a26 100644 --- a/pipelinewise/cli/alert_handlers/slack_alert_handler.py +++ b/pipelinewise/cli/alert_handlers/slack_alert_handler.py @@ -11,7 +11,7 @@ BaseAlertHandler.LOG: '36C5F0', BaseAlertHandler.INFO: 'good', BaseAlertHandler.WARNING: 'warning', - BaseAlertHandler.ERROR: 'danger' + BaseAlertHandler.ERROR: 'danger', } @@ -20,6 +20,7 @@ class SlackAlertHandler(BaseAlertHandler): """ Slack Alert Handler class """ + def __init__(self, config: dict) -> None: if config is not None: if 'token' not in config: @@ -27,7 +28,9 @@ def __init__(self, config: dict) -> None: self.token = config['token'] if 'channel' not in config: - raise InvalidAlertHandlerException('Missing channel in Slack connection') + raise InvalidAlertHandlerException( + 'Missing channel in Slack connection' + ) self.channel = config['channel'] else: @@ -35,7 +38,9 @@ def __init__(self, config: dict) -> None: self.client = WebClient(self.token) - def send(self, message: str, level: str = BaseAlertHandler.ERROR, exc: Exception = None) -> None: + def send( + self, message: str, level: str = BaseAlertHandler.ERROR, exc: Exception = None + ) -> None: """ Send alert @@ -47,9 +52,15 @@ def send(self, message: str, level: str = BaseAlertHandler.ERROR, exc: Exception Returns: Initialised alert handler object """ - self.client.chat_postMessage(channel=self.channel, - text=f'```{exc}```' if exc else None, - attachments=[{ - 'color': ALERT_LEVEL_SLACK_COLORS.get(level, BaseAlertHandler.ERROR), - 'title': message - }]) + self.client.chat_postMessage( + channel=self.channel, + text=f'```{exc}```' if exc else None, + attachments=[ + { + 'color': ALERT_LEVEL_SLACK_COLORS.get( + level, BaseAlertHandler.ERROR + ), + 'title': message, + } + ], + ) diff --git a/pipelinewise/cli/alert_handlers/victorops_alert_handler.py b/pipelinewise/cli/alert_handlers/victorops_alert_handler.py index 0713bff60..92ba39fee 100644 --- a/pipelinewise/cli/alert_handlers/victorops_alert_handler.py +++ b/pipelinewise/cli/alert_handlers/victorops_alert_handler.py @@ -12,7 +12,7 @@ BaseAlertHandler.LOG: 'INFO', BaseAlertHandler.INFO: 'INFO', BaseAlertHandler.WARNING: 'WARNING', - BaseAlertHandler.ERROR: 'CRITICAL' + BaseAlertHandler.ERROR: 'CRITICAL', } @@ -21,20 +21,27 @@ class VictoropsAlertHandler(BaseAlertHandler): """ VictorOps Alert Handler class """ + def __init__(self, config: dict) -> None: if config is not None: if 'base_url' not in config: - raise InvalidAlertHandlerException('Missing REST Endpoint URL in VictorOps connection') + raise InvalidAlertHandlerException( + 'Missing REST Endpoint URL in VictorOps connection' + ) self.base_url = config['base_url'] if 'routing_key' not in config: - raise InvalidAlertHandlerException('Missing routing key in VictorOps connection') + raise InvalidAlertHandlerException( + 'Missing routing key in VictorOps connection' + ) self.routing_key = config['routing_key'] else: raise InvalidAlertHandlerException('No valid VictorOps config supplied.') - def send(self, message: str, level: str = BaseAlertHandler.ERROR, exc: Exception = None) -> None: + def send( + self, message: str, level: str = BaseAlertHandler.ERROR, exc: Exception = None + ) -> None: """ Send alert @@ -49,13 +56,22 @@ def send(self, message: str, level: str = BaseAlertHandler.ERROR, exc: Exception # Send alert to VictorOps REST Endpoint as a HTTP 
post request response = requests.post( f'{self.base_url}/{self.routing_key}', - data=json.dumps({ - 'message_type': ALERT_LEVEL_MESSAGE_TYPES.get(level, BaseAlertHandler.ERROR), - 'entity_display_name': message, - 'state_message': exc}), - headers={'Content-Type': 'application/json'}) + data=json.dumps( + { + 'message_type': ALERT_LEVEL_MESSAGE_TYPES.get( + level, BaseAlertHandler.ERROR + ), + 'entity_display_name': message, + 'state_message': exc, + } + ), + headers={'Content-Type': 'application/json'}, + ) # Success victorops message should return 200 if response.status_code != 200: - raise ValueError('Request to victorops returned an error {}. {}'.format(response.status_code, - response.text)) + raise ValueError( + 'Request to victorops returned an error {}. {}'.format( + response.status_code, response.text + ) + ) diff --git a/pipelinewise/cli/alert_sender.py b/pipelinewise/cli/alert_sender.py index 88a4ce7e9..847f3f232 100644 --- a/pipelinewise/cli/alert_sender.py +++ b/pipelinewise/cli/alert_sender.py @@ -23,7 +23,7 @@ # Every alert handler class needs to implement the BaseAlertHandler base class ALERT_HANDLER_TYPES_TO_CLASS = { 'slack': SlackAlertHandler, - 'victorops': VictoropsAlertHandler + 'victorops': VictoropsAlertHandler, } @@ -45,7 +45,9 @@ def __init__(self, alert_handlers: Dict = None) -> None: # Raise an exception if alert_handlers is not a dictionary if not isinstance(self.alert_handlers, dict): - raise InvalidAlertHandlerException('alert_handlers needs to be a dictionary') + raise InvalidAlertHandlerException( + 'alert_handlers needs to be a dictionary' + ) @staticmethod def __init_handler_class(alert_handler: AlertHandler) -> BaseAlertHandler: @@ -63,8 +65,9 @@ def __init_handler_class(alert_handler: AlertHandler) -> BaseAlertHandler: alert_handler_class = ALERT_HANDLER_TYPES_TO_CLASS[alert_handler.type] handler = alert_handler_class(alert_handler.config) except KeyError as key_error: - raise NotImplementedAlertHandlerException(f'Alert handler type not implemented: {alert_handler.type}') \ - from key_error + raise NotImplementedAlertHandlerException( + f'Alert handler type not implemented: {alert_handler.type}' + ) from key_error return handler @@ -80,16 +83,22 @@ def __get_alert_handler(self, alert_handler_type: str) -> AlertHandler: """ if alert_handler_type in self.alert_handlers: alert_handler_config = self.alert_handlers[alert_handler_type] - alert_handler = AlertHandler(type=alert_handler_type, config=alert_handler_config) + alert_handler = AlertHandler( + type=alert_handler_type, config=alert_handler_config + ) return alert_handler - raise NotConfiguredAlertHandlerException(f'Alert handler type not configured: {alert_handler_type}') - - def send_to_handler(self, - alert_handler_type: str, - message: str, - level: str = BaseAlertHandler.ERROR, - exc: Exception = None) -> bool: + raise NotConfiguredAlertHandlerException( + f'Alert handler type not configured: {alert_handler_type}' + ) + + def send_to_handler( + self, + alert_handler_type: str, + message: str, + level: str = BaseAlertHandler.ERROR, + exc: Exception = None, + ) -> bool: """ Sends an alert message to a specific alert handler type @@ -112,10 +121,9 @@ def send_to_handler(self, # Alert sent successfully return True - def send_to_all_handlers(self, - message: str, - level: str = BaseAlertHandler.ERROR, - exc: Exception = None) -> dict: + def send_to_all_handlers( + self, message: str, level: str = BaseAlertHandler.ERROR, exc: Exception = None + ) -> dict: """ Get all the configured alert handlers and 
send alert message to all of them @@ -128,5 +136,8 @@ def send_to_all_handlers(self, Returns: Dictionary with number of successfully sent alerts """ - sents = [self.send_to_handler(handler_type, message, level, exc) for handler_type in self.alert_handlers] + sents = [ + self.send_to_handler(handler_type, message, level, exc) + for handler_type in self.alert_handlers + ] return {'sent': len(sents)} diff --git a/pipelinewise/cli/commands.py b/pipelinewise/cli/commands.py index e824987cb..9537eb7c0 100644 --- a/pipelinewise/cli/commands.py +++ b/pipelinewise/cli/commands.py @@ -20,15 +20,20 @@ STATUS_FAILED = 'failed' STATUS_SUCCESS = 'success' -TapParams = namedtuple('TapParams', ['id', 'type', 'bin', 'python_bin', 'config', 'properties', 'state']) +TapParams = namedtuple( + 'TapParams', ['id', 'type', 'bin', 'python_bin', 'config', 'properties', 'state'] +) TargetParams = namedtuple('TargetParams', ['id', 'type', 'bin', 'python_bin', 'config']) -TransformParams = namedtuple('TransformParams', ['bin', 'python_bin', 'config', 'tap_id', 'target_id']) +TransformParams = namedtuple( + 'TransformParams', ['bin', 'python_bin', 'config', 'tap_id', 'target_id'] +) class RunCommandException(Exception): """ Custom exception to raise when run command fails """ + def __init__(self, *args, **kwargs): Exception.__init__(self, *args, **kwargs) @@ -56,9 +61,9 @@ def exists_and_executable(bin_path: str) -> bool: return True -def build_tap_command(tap: TapParams, - profiling_mode: bool = False, - profiling_dir: str = None) -> str: +def build_tap_command( + tap: TapParams, profiling_mode: bool = False, profiling_dir: str = None +) -> str: """ Builds a command that starts a singer tap connector with the required command line arguments @@ -73,7 +78,9 @@ def build_tap_command(tap: TapParams, # Following the singer spec the catalog JSON file needs to be passed by the --catalog argument # However some tap (i.e. 
tap-mysql and tap-postgres) requires it as --properties # This is probably for historical reasons and need to clarify on Singer slack channels - catalog_argument = utils.get_tap_property_by_tap_type(tap.type, 'tap_catalog_argument') + catalog_argument = utils.get_tap_property_by_tap_type( + tap.type, 'tap_catalog_argument' + ) state_arg = '' if tap.state and os.path.isfile(tap.state): @@ -88,9 +95,9 @@ def build_tap_command(tap: TapParams, return tap_command -def build_target_command(target: TargetParams, - profiling_mode: bool = False, - profiling_dir: str = None) -> str: +def build_target_command( + target: TargetParams, profiling_mode: bool = False, profiling_dir: str = None +) -> str: """ Builds a command that starts a singer target connector with the required command line arguments @@ -107,14 +114,16 @@ def build_target_command(target: TargetParams, if profiling_mode: dump_file = os.path.join(profiling_dir, f'target_{target.id}.pstat') - target_command = f'{target.python_bin} -m cProfile -o {dump_file} {target_command}' + target_command = ( + f'{target.python_bin} -m cProfile -o {dump_file} {target_command}' + ) return target_command -def build_transformation_command(transform: TransformParams, - profiling_mode: bool = False, - profiling_dir: str = None) -> str: +def build_transformation_command( + transform: TransformParams, profiling_mode: bool = False, profiling_dir: str = None +) -> str: """ Builds a command that starts a singer transformation connector with the required command line arguments @@ -138,16 +147,21 @@ def build_transformation_command(transform: TransformParams, if profiling_mode: dump_file = os.path.join( profiling_dir, - f'transformation_{transform.tap_id}_{transform.target_id}.pstat') + f'transformation_{transform.tap_id}_{transform.target_id}.pstat', + ) - trans_command = f'{transform.python_bin} -m cProfile -o {dump_file} {trans_command}' + trans_command = ( + f'{transform.python_bin} -m cProfile -o {dump_file} {trans_command}' + ) return trans_command -def build_stream_buffer_command(buffer_size: int = 0, - log_file: str = None, - stream_buffer_bin: str = DEFAULT_STREAM_BUFFER_BIN) -> str: +def build_stream_buffer_command( + buffer_size: int = 0, + log_file: str = None, + stream_buffer_bin: str = DEFAULT_STREAM_BUFFER_BIN, +) -> str: """ Builds a command that buffers data between tap and target connectors to stream data asynchronously. Buffering streams @@ -188,11 +202,15 @@ def build_stream_buffer_command(buffer_size: int = 0, return buffer_command -def build_singer_command(tap: TapParams, target: TargetParams, transform: TransformParams, - stream_buffer_size: int = 0, - stream_buffer_log_file: str = None, - profiling_mode: bool = False, - profiling_dir: str = None) -> str: +def build_singer_command( + tap: TapParams, + target: TargetParams, + transform: TransformParams, + stream_buffer_size: int = 0, + stream_buffer_log_file: str = None, + profiling_mode: bool = False, + profiling_dir: str = None, +) -> str: """ Builds a command that starts a full singer command with tap, target and optional transformation connectors. 
The connectors are @@ -211,46 +229,49 @@ def build_singer_command(tap: TapParams, target: TargetParams, transform: Transf Returns: string of command line executable """ - tap_command = build_tap_command(tap, - profiling_mode, - profiling_dir) + tap_command = build_tap_command(tap, profiling_mode, profiling_dir) LOGGER.debug('Tap command: %s', tap_command) - target_command = build_target_command(target, - profiling_mode, - profiling_dir) + target_command = build_target_command(target, profiling_mode, profiling_dir) LOGGER.debug('Target command: %s', target_command) - transformation_command = build_transformation_command(transform, - profiling_mode, - profiling_dir) + transformation_command = build_transformation_command( + transform, profiling_mode, profiling_dir + ) LOGGER.debug('Transformation command: %s', transformation_command) - stream_buffer_command = build_stream_buffer_command(stream_buffer_size, - stream_buffer_log_file) + stream_buffer_command = build_stream_buffer_command( + stream_buffer_size, stream_buffer_log_file + ) LOGGER.debug('Buffer command: %s', stream_buffer_command) # Generate the final piped command with all the required components - sub_commands = [tap_command, transformation_command, stream_buffer_command, target_command] + sub_commands = [ + tap_command, + transformation_command, + stream_buffer_command, + target_command, + ] command = ' | '.join(list(filter(None, sub_commands))) return command # pylint: disable=too-many-arguments -def build_fastsync_command(tap: TapParams, - target: TargetParams, - transform: TransformParams, - venv_dir: str, - temp_dir: str, - tables: str = None, - profiling_mode: bool = False, - profiling_dir: str = None, - drop_pg_slot: bool = False - ) -> str: +def build_fastsync_command( + tap: TapParams, + target: TargetParams, + transform: TransformParams, + venv_dir: str, + temp_dir: str, + tables: str = None, + profiling_mode: bool = False, + profiling_dir: str = None, + drop_pg_slot: bool = False, +) -> str: """ Builds a command that starts fastsync from a given tap to a given target with optional transformations. @@ -273,16 +294,25 @@ def build_fastsync_command(tap: TapParams, fastsync_bin = utils.get_fastsync_bin(venv_dir, tap.type, target.type) ppw_python_bin = utils.get_pipelinewise_python_bin(venv_dir) - command_args = ' '.join(list(filter(None, [ - f'--tap {tap.config}', - f'--properties {tap.properties}', - f'--state {tap.state}', - f'--target {target.config}', - f'--temp_dir {temp_dir}', - f'--transform {transform.config}' if transform.config and os.path.isfile(transform.config) else '', - f'--tables {tables}' if tables else '', - '--drop_pg_slot' if drop_pg_slot else '', - ]))) + command_args = ' '.join( + list( + filter( + None, + [ + f'--tap {tap.config}', + f'--properties {tap.properties}', + f'--state {tap.state}', + f'--target {target.config}', + f'--temp_dir {temp_dir}', + f'--transform {transform.config}' + if transform.config and os.path.isfile(transform.config) + else '', + f'--tables {tables}' if tables else '', + '--drop_pg_slot' if drop_pg_slot else '', + ], + ) + ) + ) command = f'{fastsync_bin} {command_args}' @@ -362,9 +392,11 @@ def run_command(command: str, log_file: str = None, line_callback: callable = No # Raise run command exception errors = ''.join(utils.find_errors_in_log_file(log_file_failed)) - raise RunCommandException(f'Command failed. Return code: {proc_rc}\n' - f'Error(s) found:\n{errors}\n' - f'Full log: {log_file_failed}') + raise RunCommandException( + f'Command failed. 
Return code: {proc_rc}\n' + f'Error(s) found:\n{errors}\n' + f'Full log: {log_file_failed}' + ) # Add success status to the log file name os.rename(log_file_running, log_file_success) diff --git a/pipelinewise/cli/config.py b/pipelinewise/cli/config.py index bd1f3ea0e..0530aa5fb 100644 --- a/pipelinewise/cli/config.py +++ b/pipelinewise/cli/config.py @@ -60,7 +60,9 @@ def from_yamls(cls, config_dir, yaml_dir='.', vault_secret=None): # Load every target yaml into targets dictionary for yaml_file in target_yamls: config.logger.info('LOADING TARGET: %s', yaml_file) - target_data = utils.load_yaml(os.path.join(yaml_dir, yaml_file), vault_secret) + target_data = utils.load_yaml( + os.path.join(yaml_dir, yaml_file), vault_secret + ) utils.validate(instance=target_data, schema=target_schema) # Add generated extra keys that not available in the YAML @@ -72,7 +74,9 @@ def from_yamls(cls, config_dir, yaml_dir='.', vault_secret=None): config.logger.error('Duplicate target found "%s"', target_id) sys.exit(1) - target_data['files'] = config.get_connector_files(config.get_target_dir(target_id)) + target_data['files'] = config.get_connector_files( + config.get_target_dir(target_id) + ) target_data['taps'] = [] # Add target to list @@ -94,19 +98,26 @@ def from_yamls(cls, config_dir, yaml_dir='.', vault_secret=None): target_id = tap_data['target'] if target_id not in targets: - config.logger.error("Can't find the target with the ID \"%s\" but it's referenced in %s", target_id, - yaml_file) + config.logger.error( + "Can't find the target with the ID \"%s\" but it's referenced in %s", + target_id, + yaml_file, + ) sys.exit(1) # Add generated extra keys that not available in the YAML - tap_data['files'] = config.get_connector_files(config.get_tap_dir(target_id, tap_id)) + tap_data['files'] = config.get_connector_files( + config.get_tap_dir(target_id, tap_id) + ) # Add tap to list taps[tap_id] = tap_data # Link taps to targets for target_key, target in targets.items(): - target['taps'] = [tap for tap in taps.values() if tap['target'] == target_key] + target['taps'] = [ + tap for tap in taps.values() if tap['target'] == target_key + ] # Final structure is ready config.targets = targets @@ -138,12 +149,14 @@ def get_connector_files(connector_dir: str) -> Dict: """ return { 'config': os.path.join(connector_dir, 'config.json'), - 'inheritable_config': os.path.join(connector_dir, 'inheritable_config.json'), + 'inheritable_config': os.path.join( + connector_dir, 'inheritable_config.json' + ), 'properties': os.path.join(connector_dir, 'properties.json'), 'state': os.path.join(connector_dir, 'state.json'), 'transformation': os.path.join(connector_dir, 'transformation.json'), 'selection': os.path.join(connector_dir, 'selection.json'), - 'pidfile': os.path.join(connector_dir, 'pipelinewise.pid') + 'pidfile': os.path.join(connector_dir, 'pipelinewise.pid'), } def save(self): @@ -163,7 +176,9 @@ def save(self): # Save every tap JSON files for tap in target['taps']: - extra_config_keys = utils.get_tap_extra_config_keys(tap, self.get_temp_dir()) + extra_config_keys = utils.get_tap_extra_config_keys( + tap, self.get_temp_dir() + ) self.save_tap_jsons(target, tap, extra_config_keys) def save_main_config_json(self): @@ -181,23 +196,27 @@ def save_main_config_json(self): target = target_tuple[1] taps = [] for tap in target.get('taps'): - taps.append({ - 'id': tap.get('id'), - 'name': tap.get('name'), - 'type': tap.get('type'), - 'owner': tap.get('owner'), - 'stream_buffer_size': tap.get('stream_buffer_size'), - 'send_alert': 
tap.get('send_alert', True), - 'enabled': True - }) - - targets.append({ - 'id': target.get('id'), - 'name': target.get('name'), - 'status': 'ready', - 'type': target.get('type'), - 'taps': taps - }) + taps.append( + { + 'id': tap.get('id'), + 'name': tap.get('name'), + 'type': tap.get('type'), + 'owner': tap.get('owner'), + 'stream_buffer_size': tap.get('stream_buffer_size'), + 'send_alert': tap.get('send_alert', True), + 'enabled': True, + } + ) + + targets.append( + { + 'id': target.get('id'), + 'name': target.get('name'), + 'status': 'ready', + 'type': target.get('type'), + 'taps': taps, + } + ) main_config = {**self.global_config, **{'targets': targets}} # Create config dir if not exists @@ -266,14 +285,23 @@ def save_tap_jsons(self, target, tap, extra_config_keys=None): schema_name = schema.get('source_schema') for table in schema.get('tables', []): table_name = table.get('table_name') - replication_method = table.get('replication_method', utils.get_tap_default_replication_method(tap)) - selection.append(utils.delete_empty_keys({ - 'tap_stream_id': utils.get_tap_stream_id(tap, tap_dbname, schema_name, table_name), - 'replication_method': replication_method, - - # Add replication_key only if replication_method is INCREMENTAL - 'replication_key': table.get('replication_key') if replication_method == 'INCREMENTAL' else None - })) + replication_method = table.get( + 'replication_method', utils.get_tap_default_replication_method(tap) + ) + selection.append( + utils.delete_empty_keys( + { + 'tap_stream_id': utils.get_tap_stream_id( + tap, tap_dbname, schema_name, table_name + ), + 'replication_method': replication_method, + # Add replication_key only if replication_method is INCREMENTAL + 'replication_key': table.get('replication_key') + if replication_method == 'INCREMENTAL' + else None, + } + ) + ) tap_selection = {'selection': selection} # Generate tap transformation @@ -283,29 +311,33 @@ def save_tap_jsons(self, target, tap, extra_config_keys=None): for table in schema.get('tables', []): table_name = table.get('table_name') for trans in table.get('transformations', []): - transformations.append({ - 'tap_stream_name': utils.get_tap_stream_name(tap, tap_dbname, schema_name, table_name), - 'field_id': trans['column'], - # Make column name safe by wrapping it in quotes, it's useful when a field_id is a reserved word - # to be used by target snowflake in fastsync - 'safe_field_id': safe_column_name(trans['column']), - 'type': trans['type'], - 'when': trans.get('when') - }) - tap_transformation = { - 'transformations': transformations - } + transformations.append( + { + 'tap_stream_name': utils.get_tap_stream_name( + tap, tap_dbname, schema_name, table_name + ), + 'field_id': trans['column'], + # Make column name safe by wrapping it in quotes, it's useful when a field_id is a reserved + # word to be used by target snowflake in fastsync + 'safe_field_id': safe_column_name(trans['column']), + 'type': trans['type'], + 'when': trans.get('when'), + } + ) + tap_transformation = {'transformations': transformations} # Generate stream to schema mapping schema_mapping = {} for schema in tap.get('schemas', []): source_schema = schema.get('source_schema') target_schema = schema.get('target_schema') - target_schema_select_perms = schema.get('target_schema_select_permissions', []) + target_schema_select_perms = schema.get( + 'target_schema_select_permissions', [] + ) schema_mapping[source_schema] = { 'target_schema': target_schema, - 'target_schema_select_permissions': target_schema_select_perms + 
'target_schema_select_permissions': target_schema_select_perms, } # Schema mapping can include list of indices to create. Some target components @@ -322,53 +354,65 @@ def save_tap_jsons(self, target, tap, extra_config_keys=None): schema_mapping[source_schema]['indices'] = indices # Generate tap inheritable_config dict - tap_inheritable_config = utils.delete_empty_keys({ - 'temp_dir': self.get_temp_dir(), - 'tap_id': tap.get('id'), - 'query_tag': json.dumps({ - 'ppw_component': tap.get('type'), + tap_inheritable_config = utils.delete_empty_keys( + { + 'temp_dir': self.get_temp_dir(), 'tap_id': tap.get('id'), - 'database': '{{database}}', - 'schema': '{{schema}}', - 'table': '{{table}}' - }), - 'batch_size_rows': tap.get('batch_size_rows', 20000), - 'batch_wait_limit_seconds': tap.get('batch_wait_limit_seconds', None), - 'parallelism': tap.get('parallelism', 0), - 'parallelism_max': tap.get('parallelism_max', 4), - 'hard_delete': tap.get('hard_delete', True), - 'flush_all_streams': tap.get('flush_all_streams', False), - 'primary_key_required': tap.get('primary_key_required', True), - 'default_target_schema': tap.get('default_target_schema'), - 'default_target_schema_select_permissions': tap.get('default_target_schema_select_permissions'), - 'schema_mapping': schema_mapping, - - # data_flattening_max_level - # ------------------------- - # - # 'data_flattening_max_level' is an optional parameter in some target connectors that specifies - # how to load nested object into destination. - # - # We can load the original object represented as JSON or string (data flattening off) or we can - # flatten the schema and data by creating columns automatically. When 'data_flattening_max_level' - # is set to 0 then flattening functionality is turned off. - # - #  The value can be set in mutliple place and evaluated in the following order: - # ------------ - # 1: First we try to find it in the tap YAML - # 2: Second we try to get the tap type specific default value - # 3: Otherwise we set flattening level to 0 (disabled) - 'data_flattening_max_level': tap.get('data_flattening_max_level', - utils.get_tap_property(tap, 'default_data_flattening_max_level') or 0), - 'validate_records': tap.get('validate_records', False), - 'add_metadata_columns': tap.get('add_metadata_columns', False), - 'split_large_files': tap.get('split_large_files', False), - 'split_file_chunk_size_mb': tap.get('split_file_chunk_size_mb', 1000), - 'split_file_max_chunks': tap.get('split_file_max_chunks', 20), - 'archive_load_files': tap.get('archive_load_files', False), - 'archive_load_files_s3_bucket': tap.get('archive_load_files_s3_bucket', None), - 'archive_load_files_s3_prefix': tap.get('archive_load_files_s3_prefix', None) - }) + 'query_tag': json.dumps( + { + 'ppw_component': tap.get('type'), + 'tap_id': tap.get('id'), + 'database': '{{database}}', + 'schema': '{{schema}}', + 'table': '{{table}}', + } + ), + 'batch_size_rows': tap.get('batch_size_rows', 20000), + 'batch_wait_limit_seconds': tap.get('batch_wait_limit_seconds', None), + 'parallelism': tap.get('parallelism', 0), + 'parallelism_max': tap.get('parallelism_max', 4), + 'hard_delete': tap.get('hard_delete', True), + 'flush_all_streams': tap.get('flush_all_streams', False), + 'primary_key_required': tap.get('primary_key_required', True), + 'default_target_schema': tap.get('default_target_schema'), + 'default_target_schema_select_permissions': tap.get( + 'default_target_schema_select_permissions' + ), + 'schema_mapping': schema_mapping, + # data_flattening_max_level + # 
------------------------- + # + # 'data_flattening_max_level' is an optional parameter in some target connectors that specifies + # how to load nested object into destination. + # + # We can load the original object represented as JSON or string (data flattening off) or we can + # flatten the schema and data by creating columns automatically. When 'data_flattening_max_level' + # is set to 0 then flattening functionality is turned off. + # + #  The value can be set in mutliple place and evaluated in the following order: + # ------------ + # 1: First we try to find it in the tap YAML + # 2: Second we try to get the tap type specific default value + # 3: Otherwise we set flattening level to 0 (disabled) + 'data_flattening_max_level': tap.get( + 'data_flattening_max_level', + utils.get_tap_property(tap, 'default_data_flattening_max_level') + or 0, + ), + 'validate_records': tap.get('validate_records', False), + 'add_metadata_columns': tap.get('add_metadata_columns', False), + 'split_large_files': tap.get('split_large_files', False), + 'split_file_chunk_size_mb': tap.get('split_file_chunk_size_mb', 1000), + 'split_file_max_chunks': tap.get('split_file_max_chunks', 20), + 'archive_load_files': tap.get('archive_load_files', False), + 'archive_load_files_s3_bucket': tap.get( + 'archive_load_files_s3_bucket', None + ), + 'archive_load_files_s3_prefix': tap.get( + 'archive_load_files_s3_prefix', None + ), + } + ) # Save the generated JSON files utils.save_json(tap_config, tap_config_path) diff --git a/pipelinewise/cli/constants.py b/pipelinewise/cli/constants.py index b676c81ec..6593d31d2 100644 --- a/pipelinewise/cli/constants.py +++ b/pipelinewise/cli/constants.py @@ -6,6 +6,7 @@ class ConnectorType(enum.Enum): Enums for various Singer connector type names Todo: add more """ + TAP_MYSQL = 'tap-mysql' TAP_POSTGRES = 'tap-postgres' TAP_MONGODB = 'tap-mongodb' diff --git a/pipelinewise/cli/errors.py b/pipelinewise/cli/errors.py index 88a93d623..e9a0f4872 100644 --- a/pipelinewise/cli/errors.py +++ b/pipelinewise/cli/errors.py @@ -10,6 +10,8 @@ class StreamBufferTooLargeException(Exception): """Raised if stream buffer size is greater than the max allowed size""" def __init__(self, buffer_size, max_buffer_size): - msg = f'{buffer_size}M buffer size is too large. The maximum allowed stream buffer size is ' \ - f'{max_buffer_size}M' + msg = ( + f'{buffer_size}M buffer size is too large. 
The maximum allowed stream buffer size is ' + f'{max_buffer_size}M' + ) super().__init__(msg) diff --git a/pipelinewise/cli/pipelinewise.py b/pipelinewise/cli/pipelinewise.py index 6ffdcbbfb..0216efa7a 100644 --- a/pipelinewise/cli/pipelinewise.py +++ b/pipelinewise/cli/pipelinewise.py @@ -30,24 +30,24 @@ ConnectorType.TARGET_SNOWFLAKE, ConnectorType.TARGET_REDSHIFT, ConnectorType.TARGET_POSTGRES, - ConnectorType.TARGET_BIGQUERY + ConnectorType.TARGET_BIGQUERY, }, ConnectorType.TAP_POSTGRES: { ConnectorType.TARGET_SNOWFLAKE, ConnectorType.TARGET_REDSHIFT, ConnectorType.TARGET_POSTGRES, - ConnectorType.TARGET_BIGQUERY + ConnectorType.TARGET_BIGQUERY, }, ConnectorType.TAP_S3_CSV: { ConnectorType.TARGET_SNOWFLAKE, ConnectorType.TARGET_REDSHIFT, ConnectorType.TARGET_POSTGRES, - ConnectorType.TARGET_BIGQUERY + ConnectorType.TARGET_BIGQUERY, }, ConnectorType.TAP_MONGODB: { ConnectorType.TARGET_SNOWFLAKE, ConnectorType.TARGET_POSTGRES, - ConnectorType.TARGET_BIGQUERY + ConnectorType.TARGET_BIGQUERY, }, } @@ -73,7 +73,9 @@ def __init__(self, args, config_dir, venv_dir, profiling_dir=None): self.config_dir = config_dir self.venv_dir = venv_dir self.extra_log = args.extra_log - self.pipelinewise_bin = os.path.join(self.venv_dir, 'cli', 'bin', 'pipelinewise') + self.pipelinewise_bin = os.path.join( + self.venv_dir, 'cli', 'bin', 'pipelinewise' + ) self.config_path = os.path.join(self.config_dir, 'config.json') self.load_config() self.alert_sender = AlertSender(self.config.get('alert_handlers')) @@ -88,18 +90,21 @@ def __init__(self, args, config_dir, venv_dir, profiling_dir=None): self.target_bin = self.get_connector_bin(self.target['type']) self.target_python_bin = self.get_connector_python_bin(self.target['type']) - self.transform_field_bin = self.get_connector_bin(self.TRANSFORM_FIELD_CONNECTOR_NAME) - self.transform_field_python_bin = self.get_connector_python_bin(self.TRANSFORM_FIELD_CONNECTOR_NAME) + self.transform_field_bin = self.get_connector_bin( + self.TRANSFORM_FIELD_CONNECTOR_NAME + ) + self.transform_field_python_bin = self.get_connector_python_bin( + self.TRANSFORM_FIELD_CONNECTOR_NAME + ) self.tap_run_log_file = None # Catch SIGINT and SIGTERM to exit gracefully for sig in [signal.SIGINT, signal.SIGTERM]: signal.signal(sig, self._exit_gracefully) - def send_alert(self, - message: str, - level: str = BaseAlertHandler.ERROR, - exc: Exception = None) -> dict: + def send_alert( + self, message: str, level: str = BaseAlertHandler.ERROR, exc: Exception = None + ) -> dict: """ Send alert messages to every alert handler if sender is not disabled for the tap @@ -115,7 +120,9 @@ def send_alert(self, send_alert = self.tap.get('send_alert', True) if send_alert: - stats = self.alert_sender.send_to_all_handlers(message=message, level=level, exc=exc) + stats = self.alert_sender.send_to_all_handlers( + message=message, level=level, exc=exc + ) return stats @@ -132,23 +139,27 @@ def create_consumable_target_config(self, target_config, tap_inheritable_config) dict_a.update(dict_b) # Save the new dict as JSON into a temp file - tempfile_path = utils.create_temp_file(dir=self.get_temp_dir(), - prefix='target_config_', - suffix='.json')[1] + tempfile_path = utils.create_temp_file( + dir=self.get_temp_dir(), prefix='target_config_', suffix='.json' + )[1] utils.save_json(dict_a, tempfile_path) return tempfile_path except Exception as exc: - raise Exception(f'Cannot merge JSON files {dict_a} {dict_b} - {exc}') from exc + raise Exception( + f'Cannot merge JSON files {dict_a} {dict_b} - {exc}' + ) from exc # 
pylint: disable=too-many-statements,too-many-branches,too-many-nested-blocks,too-many-locals,too-many-arguments - def create_filtered_tap_properties(self, - target_type: ConnectorType, - tap_type: ConnectorType, - tap_properties: str, - tap_state: str, - filters: Dict[str, Any], - create_fallback=False): + def create_filtered_tap_properties( + self, + target_type: ConnectorType, + tap_type: ConnectorType, + tap_properties: str, + tap_state: str, + filters: Dict[str, Any], + create_fallback=False, + ): """ Create a filtered version of tap properties file based on specific filter conditions. @@ -183,7 +194,9 @@ def create_filtered_tap_properties(self, fallback_properties = copy.deepcopy(properties) if create_fallback else {} # Foreach stream (table) in the original properties - for stream_idx, stream in enumerate(properties.get('streams', tap_properties)): + for stream_idx, stream in enumerate( + properties.get('streams', tap_properties) + ): initial_sync_required = False # Collect required properties from the properties file @@ -202,7 +215,11 @@ def create_filtered_tap_properties(self, # Can we make sure that the stream has the right metadata? # To be safe, check if no right metadata has been found, then throw an exception. if not table_meta: - self.logger.error('Stream %s has no metadata with no breadcrumbs: %s.', tap_stream_id, metadata) + self.logger.error( + 'Stream %s has no metadata with no breadcrumbs: %s.', + tap_stream_id, + metadata, + ) raise Exception(f'Missing metadata in stream {tap_stream_id}') selected = table_meta.get('selected', False) @@ -211,7 +228,9 @@ def create_filtered_tap_properties(self, # Detect if initial sync is required. Look into the state file, get the bookmark # for the current stream (table) and if valid bookmark doesn't exist then # initial sync is required - bookmarks = state.get('bookmarks', {}) if isinstance(state, dict) else {} + bookmarks = ( + state.get('bookmarks', {}) if isinstance(state, dict) else {} + ) new_stream = False @@ -222,7 +241,9 @@ def create_filtered_tap_properties(self, else: stream_bookmark = bookmarks[tap_stream_id] - if self._is_initial_sync_required(replication_method, stream_bookmark): + if self._is_initial_sync_required( + replication_method, stream_bookmark + ): initial_sync_required = True # Compare actual values to the filter conditions. 
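
The hunk above reworks how create_filtered_tap_properties decides, per stream, whether an initial sync is still required: it looks up the stream's bookmark in the state file and, if no usable bookmark exists for the stream's replication method, the table is routed to fastsync instead of the singer tap. A minimal sketch of that decision follows, assuming the common Singer bookmark keys (replication_key_value for INCREMENTAL, lsn or log_file/log_pos for LOG_BASED); the helper name, the bookmark keys and the example tap_stream_id are illustrative and not necessarily identical to what _is_initial_sync_required in pipelinewise.py checks.

def initial_sync_required(replication_method: str, bookmark: dict) -> bool:
    # FULL_TABLE streams are always candidates for a full (re)sync.
    if replication_method == 'FULL_TABLE':
        return True
    # INCREMENTAL streams can only resume if a replication key value was saved.
    if replication_method == 'INCREMENTAL':
        return 'replication_key_value' not in bookmark
    # LOG_BASED streams can only resume from a saved log position
    # (Postgres LSN or MySQL binlog file/position).
    if replication_method == 'LOG_BASED':
        has_log_position = 'lsn' in bookmark or (
            'log_file' in bookmark and 'log_pos' in bookmark
        )
        return not has_log_position
    return False


# Example: a state file with one bookmarked INCREMENTAL stream.
state = {'bookmarks': {'mydb-public-orders': {'replication_key_value': '2021-07-01'}}}
bookmark = state.get('bookmarks', {}).get('mydb-public-orders', {})
print(initial_sync_required('INCREMENTAL', bookmark))  # False: bookmark exists, singer resumes
print(initial_sync_required('LOG_BASED', {}))          # True: no log position, fastsync first
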
@@ -230,21 +251,39 @@ def create_filtered_tap_properties(self, # Set the "selected" key to False if the actual values don't meet the filter criteria # pylint: disable=too-many-boolean-expressions if ( - (f_selected is None or selected == f_selected) and - (f_tap_target_pairs is None or target_type in f_tap_target_pairs.get(tap_type, set())) and - (f_replication_method is None or replication_method in f_replication_method) and - (f_initial_sync_required is None or initial_sync_required == f_initial_sync_required) + (f_selected is None or selected == f_selected) + and ( + f_tap_target_pairs is None + or target_type in f_tap_target_pairs.get(tap_type, set()) + ) + and ( + f_replication_method is None + or replication_method in f_replication_method + ) + and ( + f_initial_sync_required is None + or initial_sync_required == f_initial_sync_required + ) ): - self.logger.debug("""Filter condition(s) matched: + self.logger.debug( + """Filter condition(s) matched: Table : %s Tap Stream ID : %s Selected : %s Replication Method : %s Init Sync Required : %s - """, table_name, tap_stream_id, selected, replication_method, initial_sync_required) + """, + table_name, + tap_stream_id, + selected, + replication_method, + initial_sync_required, + ) # Filter condition matched: mark table as selected to sync - properties['streams'][stream_idx]['metadata'][meta_idx]['metadata']['selected'] = True + properties['streams'][stream_idx]['metadata'][meta_idx]['metadata'][ + 'selected' + ] = True filtered_tap_stream_ids.append(tap_stream_id) # Filter condition matched: @@ -252,47 +291,58 @@ def create_filtered_tap_properties(self, # the fallback properties as well if the table is selected in the original properties. # Otherwise, mark it as not selected if create_fallback: - if new_stream and replication_method in [self.INCREMENTAL, self.LOG_BASED]: - fallback_properties['streams'][stream_idx]['metadata'][meta_idx]['metadata'][ - 'selected'] = True + if new_stream and replication_method in [ + self.INCREMENTAL, + self.LOG_BASED, + ]: + fallback_properties['streams'][stream_idx]['metadata'][ + meta_idx + ]['metadata']['selected'] = True if selected: fallback_filtered_stream_ids.append(tap_stream_id) else: - fallback_properties['streams'][stream_idx]['metadata'][meta_idx]['metadata'][ - 'selected'] = False + fallback_properties['streams'][stream_idx]['metadata'][ + meta_idx + ]['metadata']['selected'] = False else: # Filter condition didn't match: mark table as not selected to sync - properties['streams'][stream_idx]['metadata'][meta_idx]['metadata']['selected'] = False + properties['streams'][stream_idx]['metadata'][meta_idx]['metadata'][ + 'selected' + ] = False # Filter condition didn't match: mark table as selected to sync in the fallback properties # Fallback only if the table is selected in the original properties if create_fallback and selected is True: - fallback_properties['streams'][stream_idx]['metadata'][meta_idx]['metadata']['selected'] = True + fallback_properties['streams'][stream_idx]['metadata'][ + meta_idx + ]['metadata']['selected'] = True fallback_filtered_stream_ids.append(tap_stream_id) # Save the generated properties file(s) and return # Fallback required: Save filtered and fallback properties JSON if create_fallback: # Save to files: filtered and fallback properties - temp_properties_path = utils.create_temp_file(dir=self.get_temp_dir(), - prefix='properties_', - suffix='.json')[1] + temp_properties_path = utils.create_temp_file( + dir=self.get_temp_dir(), prefix='properties_', suffix='.json' + 
)[1] utils.save_json(properties, temp_properties_path) - temp_fallback_properties_path = utils.create_temp_file(dir=self.get_temp_dir(), - prefix='properties_', - suffix='.json')[1] + temp_fallback_properties_path = utils.create_temp_file( + dir=self.get_temp_dir(), prefix='properties_', suffix='.json' + )[1] utils.save_json(fallback_properties, temp_fallback_properties_path) - return (temp_properties_path, - filtered_tap_stream_ids, - temp_fallback_properties_path, - fallback_filtered_stream_ids) + return ( + temp_properties_path, + filtered_tap_stream_ids, + temp_fallback_properties_path, + fallback_filtered_stream_ids, + ) # Fallback not required: Save only the filtered properties JSON - temp_properties_path = utils.create_temp_file(dir=self.get_temp_dir(), - prefix='properties_', - suffix='.json')[1] + temp_properties_path = utils.create_temp_file( + dir=self.get_temp_dir(), prefix='properties_', suffix='.json' + )[1] utils.save_json(properties, temp_properties_path) return temp_properties_path, filtered_tap_stream_ids @@ -424,8 +474,9 @@ def get_tap(self, target_id: str, tap_id: str) -> Dict: return tap + # TODO: This method is too complex! make its complexity less than 15! # pylint: disable=too-many-branches,too-many-statements,too-many-nested-blocks,too-many-locals - def merge_schemas(self, old_schema, new_schema): + def merge_schemas(self, old_schema, new_schema): # noqa: C901 """ Merge two schemas """ @@ -439,7 +490,14 @@ def merge_schemas(self, old_schema, new_schema): for new_stream_idx, new_stream in enumerate(new_streams): new_tap_stream_id = new_stream['tap_stream_id'] - old_stream = next((item for item in old_streams if item['tap_stream_id'] == new_tap_stream_id), None) + old_stream = next( + ( + item + for item in old_streams + if item['tap_stream_id'] == new_tap_stream_id + ), + None, + ) # Is this a new stream? 
if not old_stream: @@ -451,54 +509,87 @@ def merge_schemas(self, old_schema, new_schema): new_stream_table_mdata_idx = 0 old_stream_table_mdata_idx = 0 try: - new_stream_table_mdata_idx = \ - [i for i, md in enumerate(new_stream['metadata']) if md['breadcrumb'] == []][0] - old_stream_table_mdata_idx = \ - [i for i, md in enumerate(old_stream['metadata']) if md['breadcrumb'] == []][0] + new_stream_table_mdata_idx = [ + i + for i, md in enumerate(new_stream['metadata']) + if md['breadcrumb'] == [] + ][0] + old_stream_table_mdata_idx = [ + i + for i, md in enumerate(old_stream['metadata']) + if md['breadcrumb'] == [] + ][0] except Exception: pass # Copy is-new flag from the old stream try: - new_schema['streams'][new_stream_idx]['is-new'] = old_stream['is-new'] + new_schema['streams'][new_stream_idx]['is-new'] = old_stream[ + 'is-new' + ] except Exception: pass # Copy selected from the old stream try: - new_schema['streams'][new_stream_idx]['metadata'][new_stream_table_mdata_idx]['metadata'][ - 'selected'] = old_stream['metadata'][old_stream_table_mdata_idx]['metadata']['selected'] + new_schema['streams'][new_stream_idx]['metadata'][ + new_stream_table_mdata_idx + ]['metadata']['selected'] = old_stream['metadata'][ + old_stream_table_mdata_idx + ][ + 'metadata' + ][ + 'selected' + ] except Exception: pass # Copy replication method from the old stream try: - new_schema['streams'][new_stream_idx]['metadata'] \ - [new_stream_table_mdata_idx]['metadata']['replication-method'] = \ - old_stream['metadata'][old_stream_table_mdata_idx]['metadata']['replication-method'] + new_schema['streams'][new_stream_idx]['metadata'][ + new_stream_table_mdata_idx + ]['metadata']['replication-method'] = old_stream['metadata'][ + old_stream_table_mdata_idx + ][ + 'metadata' + ][ + 'replication-method' + ] except Exception: pass # Copy replication key from the old stream try: - new_schema['streams'][new_stream_idx]['metadata'][new_stream_table_mdata_idx] \ - ['metadata']['replication-key'] = \ - old_stream['metadata'][old_stream_table_mdata_idx]['metadata'][ - 'replication-key'] + new_schema['streams'][new_stream_idx]['metadata'][ + new_stream_table_mdata_idx + ]['metadata']['replication-key'] = old_stream['metadata'][ + old_stream_table_mdata_idx + ][ + 'metadata' + ][ + 'replication-key' + ] except Exception: pass # Is this new or modified field? 
- new_fields = new_schema['streams'][new_stream_idx]['schema']['properties'] + new_fields = new_schema['streams'][new_stream_idx]['schema'][ + 'properties' + ] old_fields = old_stream['schema']['properties'] for new_field_key in new_fields: new_field = new_fields[new_field_key] new_field_mdata_idx = -1 # Find new field metadata index - for i, mdata in enumerate(new_schema['streams'][new_stream_idx]['metadata']): - if len(mdata['breadcrumb']) == 2 and mdata['breadcrumb'][0] == 'properties' and \ - mdata['breadcrumb'][1] == new_field_key: + for i, mdata in enumerate( + new_schema['streams'][new_stream_idx]['metadata'] + ): + if ( + len(mdata['breadcrumb']) == 2 + and mdata['breadcrumb'][0] == 'properties' + and mdata['breadcrumb'][1] == new_field_key + ): new_field_mdata_idx = i # Field exists @@ -508,13 +599,19 @@ def merge_schemas(self, old_schema, new_schema): # Find old field metadata index for i, mdata in enumerate(old_stream['metadata']): - if len(mdata['breadcrumb']) == 2 and mdata['breadcrumb'][0] == 'properties' and \ - mdata['breadcrumb'][1] == new_field_key: + if ( + len(mdata['breadcrumb']) == 2 + and mdata['breadcrumb'][0] == 'properties' + and mdata['breadcrumb'][1] == new_field_key + ): old_field_mdata_idx = i - new_mdata = new_schema['streams'][new_stream_idx]['metadata'][new_field_mdata_idx][ - 'metadata'] - old_mdata = old_stream['metadata'][old_field_mdata_idx]['metadata'] + new_mdata = new_schema['streams'][new_stream_idx][ + 'metadata' + ][new_field_mdata_idx]['metadata'] + old_mdata = old_stream['metadata'][old_field_mdata_idx][ + 'metadata' + ] # Copy is-new flag from the old properties try: @@ -536,28 +633,43 @@ def merge_schemas(self, old_schema, new_schema): # Field exists and type is the same - Do nothing more in the schema if new_field == old_field: - self.logger.debug('Field exists in %s stream with the same type: %s: %s', - new_tap_stream_id, new_field_key, new_field) + self.logger.debug( + 'Field exists in %s stream with the same type: %s: %s', + new_tap_stream_id, + new_field_key, + new_field, + ) # Field exists but types are different - Mark the field as modified in the metadata else: - self.logger.debug('Field exists in %s stream but types are different: %s: %s}', - new_tap_stream_id, new_field_key, new_field) + self.logger.debug( + 'Field exists in %s stream but types are different: %s: %s}', + new_tap_stream_id, + new_field_key, + new_field, + ) try: - new_schema['streams'][new_stream_idx]['metadata'][new_field_mdata_idx]['metadata'][ - 'is-modified'] = True - new_schema['streams'][new_stream_idx]['metadata'][new_field_mdata_idx]['metadata'][ - 'is-new'] = False + new_schema['streams'][new_stream_idx]['metadata'][ + new_field_mdata_idx + ]['metadata']['is-modified'] = True + new_schema['streams'][new_stream_idx]['metadata'][ + new_field_mdata_idx + ]['metadata']['is-new'] = False except Exception: pass # New field - Mark the field as new in the metadata else: - self.logger.debug('New field in stream %s: %s: %s', new_tap_stream_id, new_field_key, - new_field) + self.logger.debug( + 'New field in stream %s: %s: %s', + new_tap_stream_id, + new_field_key, + new_field, + ) try: - new_schema['streams'][new_stream_idx]['metadata'][new_field_mdata_idx]['metadata'][ - 'is-new'] = True + new_schema['streams'][new_stream_idx]['metadata'][ + new_field_mdata_idx + ]['metadata']['is-new'] = True except Exception: pass @@ -579,28 +691,52 @@ def make_default_selection(self, schema, selection_file): tap_stream_id = stream.get('tap_stream_id') tap_stream_sel = None for sel 
in selection: - if 'tap_stream_id' in sel and tap_stream_id.lower() == sel['tap_stream_id'].lower(): + if ( + 'tap_stream_id' in sel + and tap_stream_id.lower() == sel['tap_stream_id'].lower() + ): tap_stream_sel = sel # Find table specific metadata entries in the old and new streams try: - stream_table_mdata_idx = [i for i, md in enumerate(stream['metadata']) if md['breadcrumb'] == []][0] + stream_table_mdata_idx = [ + i + for i, md in enumerate(stream['metadata']) + if md['breadcrumb'] == [] + ][0] except Exception as exc: - raise Exception(f'Metadata of stream {tap_stream_id} doesn\'t have an empty breadcrumb') from exc + raise Exception( + f'Metadata of stream {tap_stream_id} doesn\'t have an empty breadcrumb' + ) from exc if tap_stream_sel: - self.logger.debug('Mark %s tap_stream_id as selected with properties %s', tap_stream_id, - tap_stream_sel) - schema['streams'][stream_idx]['metadata'][stream_table_mdata_idx]['metadata']['selected'] = True + self.logger.debug( + 'Mark %s tap_stream_id as selected with properties %s', + tap_stream_id, + tap_stream_sel, + ) + schema['streams'][stream_idx]['metadata'][stream_table_mdata_idx][ + 'metadata' + ]['selected'] = True if 'replication_method' in tap_stream_sel: - schema['streams'][stream_idx]['metadata'][stream_table_mdata_idx]['metadata'][ - 'replication-method'] = tap_stream_sel['replication_method'] + schema['streams'][stream_idx]['metadata'][ + stream_table_mdata_idx + ]['metadata']['replication-method'] = tap_stream_sel[ + 'replication_method' + ] if 'replication_key' in tap_stream_sel: - schema['streams'][stream_idx]['metadata'][stream_table_mdata_idx]['metadata'][ - 'replication-key'] = tap_stream_sel['replication_key'] + schema['streams'][stream_idx]['metadata'][ + stream_table_mdata_idx + ]['metadata']['replication-key'] = tap_stream_sel[ + 'replication_key' + ] else: - self.logger.debug('Mark %s tap_stream_id as not selected', tap_stream_id) - schema['streams'][stream_idx]['metadata'][stream_table_mdata_idx]['metadata']['selected'] = False + self.logger.debug( + 'Mark %s tap_stream_id as not selected', tap_stream_id + ) + schema['streams'][stream_idx]['metadata'][stream_table_mdata_idx][ + 'metadata' + ]['selected'] = False return schema @@ -614,7 +750,9 @@ def init(self): # Create project dir if not exists if os.path.exists(project_dir): - self.logger.error('Directory exists and cannot create new project: %s', self.args.name) + self.logger.error( + 'Directory exists and cannot create new project: %s', self.args.name + ) sys.exit(1) else: os.mkdir(project_dir) @@ -636,7 +774,13 @@ def test_tap_connection(self): target_id = self.target['id'] target_type = self.target['type'] - self.logger.info('Testing %s (%s) tap connection in %s (%s) target', tap_id, tap_type, target_id, target_type) + self.logger.info( + 'Testing %s (%s) tap connection in %s (%s) target', + tap_id, + tap_type, + target_id, + target_type, + ) # Generate and run the command to run the tap directly # We will use the discover option to test connection @@ -654,15 +798,21 @@ def test_tap_connection(self): returncode, new_schema, tap_output = result if returncode != 0: - self.logger.error('Testing tap connection (%s - %s) FAILED', target_id, tap_id) + self.logger.error( + 'Testing tap connection (%s - %s) FAILED', target_id, tap_id + ) sys.exit(1) # If the connection success then the response needs to be a valid JSON string if not utils.is_json(new_schema): - self.logger.error('Schema discovered by %s (%s) is not a valid JSON.', tap_id, tap_type) + self.logger.error( + 
'Schema discovered by %s (%s) is not a valid JSON.', tap_id, tap_type + ) sys.exit(1) else: - self.logger.info('Testing tap connection (%s - %s) PASSED', target_id, tap_id) + self.logger.info( + 'Testing tap connection (%s - %s) PASSED', target_id, tap_id + ) # pylint: disable=too-many-locals,inconsistent-return-statements def discover_tap(self, tap=None, target=None): @@ -688,7 +838,13 @@ def discover_tap(self, tap=None, target=None): target_id = target.get('id') target_type = target.get('type') - self.logger.info('Discovering %s (%s) tap in %s (%s) target...', tap_id, tap_type, target_id, target_type) + self.logger.info( + 'Discovering %s (%s) tap in %s (%s) target...', + tap_id, + tap_type, + target_id, + target_type, + ) # Generate and run the command to run the tap directly command = f'{tap_bin} --config {tap_config_file} --discover' @@ -724,25 +880,33 @@ def discover_tap(self, tap=None, target=None): # Make selection from selection.json if exists try: - schema_with_diff = self.make_default_selection(schema_with_diff, tap_selection_file) + schema_with_diff = self.make_default_selection( + schema_with_diff, tap_selection_file + ) schema_with_diff = utils.delete_keys_from_dict( self.make_default_selection(schema_with_diff, tap_selection_file), - # Removing multipleOf json schema validations from properties.json, # that's causing run time issues - ['multipleOf']) + ['multipleOf'], + ) except Exception as exc: return f'Cannot load selection JSON at {tap_selection_file}. {str(exc)}' # Post import checks - post_import_errors = self._run_post_import_tap_checks(tap, schema_with_diff, target_id) + post_import_errors = self._run_post_import_tap_checks( + tap, schema_with_diff, target_id + ) if len(post_import_errors) > 0: - return f'Post import tap checks failed in tap {tap_id}: {post_import_errors}' + return ( + f'Post import tap checks failed in tap {tap_id}: {post_import_errors}' + ) # Save the new catalog into the tap try: - self.logger.info('Writing new properties file with changes into %s', tap_properties_file) + self.logger.info( + 'Writing new properties file with changes into %s', tap_properties_file + ) utils.save_json(schema_with_diff, tap_properties_file) except Exception as exc: return f'Cannot save file. 
{str(exc)}' @@ -758,7 +922,7 @@ def detect_tap_status(self, target_id, tap_id): status = { 'currentStatus': 'unknown', 'lastStatus': 'unknown', - 'lastTimestamp': None + 'lastTimestamp': None, } # Tap exists but configuration not completed @@ -766,7 +930,10 @@ def detect_tap_status(self, target_id, tap_id): status['currentStatus'] = 'not-configured' # Tap exists and has log in running status - elif os.path.isdir(log_dir) and len(utils.search_files(log_dir, patterns=['*.log.running'])) > 0: + elif ( + os.path.isdir(log_dir) + and len(utils.search_files(log_dir, patterns=['*.log.running'])) > 0 + ): status['currentStatus'] = 'running' # Configured and not running @@ -775,7 +942,9 @@ def detect_tap_status(self, target_id, tap_id): # Get last run instance if os.path.isdir(log_dir): - log_files = utils.search_files(log_dir, patterns=['*.log.success', '*.log.failed'], sort=True) + log_files = utils.search_files( + log_dir, patterns=['*.log.success', '*.log.failed'], sort=True + ) if len(log_files) > 0: last_log_file = log_files[0] log_attr = utils.extract_log_attributes(last_log_file) @@ -798,7 +967,7 @@ def status(self): 'Enabled', 'Status', 'Last Sync', - 'Last Sync Result' + 'Last Sync Result', ] tab_body = [] pipelines = 0 @@ -806,44 +975,55 @@ def status(self): taps = self.get_taps(target['id']) for tap in taps: - tab_body.append([ - tap.get('id', ''), - tap.get('type', ''), - target.get('id', ''), - target.get('type', ''), - tap.get('enabled', ''), - tap.get('status', {}).get('currentStatus', ''), - tap.get('status', {}).get('lastTimestamp', ''), - tap.get('status', {}).get('lastStatus', '') - ]) + tab_body.append( + [ + tap.get('id', ''), + tap.get('type', ''), + target.get('id', ''), + target.get('type', ''), + tap.get('enabled', ''), + tap.get('status', {}).get('currentStatus', ''), + tap.get('status', {}).get('lastTimestamp', ''), + tap.get('status', {}).get('lastStatus', ''), + ] + ) pipelines += 1 print(tabulate(tab_body, headers=tab_headers, tablefmt='simple')) print(f'{pipelines} pipeline(s)') - def run_tap_singer(self, - tap: TapParams, - target: TargetParams, - transform: TransformParams, - stream_buffer_size: int = 0) -> str: + def run_tap_singer( + self, + tap: TapParams, + target: TargetParams, + transform: TransformParams, + stream_buffer_size: int = 0, + ) -> str: """ Generate and run piped shell command to sync tables using singer taps and targets """ # Build the piped executable command - command = commands.build_singer_command(tap=tap, - target=target, - transform=transform, - stream_buffer_size=stream_buffer_size, - stream_buffer_log_file=self.tap_run_log_file, - profiling_mode=self.profiling_mode, - profiling_dir=self.profiling_dir) + command = commands.build_singer_command( + tap=tap, + target=target, + transform=transform, + stream_buffer_size=stream_buffer_size, + stream_buffer_log_file=self.tap_run_log_file, + profiling_mode=self.profiling_mode, + profiling_dir=self.profiling_dir, + ) # Do not run if another instance is already running log_dir = os.path.dirname(self.tap_run_log_file) - if os.path.isdir(log_dir) and len(utils.search_files(log_dir, patterns=['*.log.running'])) > 0: + if ( + os.path.isdir(log_dir) + and len(utils.search_files(log_dir, patterns=['*.log.running'])) > 0 + ): self.logger.info( 'Failed to run. Another instance of the same tap is already running. 
' - 'Log file detected in running status at %s', log_dir) + 'Log file detected in running status at %s', + log_dir, + ) sys.exit(1) start = None @@ -879,7 +1059,9 @@ def update_state_file_with_extra_log(line: str) -> str: # Run command with update_state_file as a callback to call for every stdout line if self.extra_log: - commands.run_command(command, self.tap_run_log_file, update_state_file_with_extra_log) + commands.run_command( + command, self.tap_run_log_file, update_state_file_with_extra_log + ) else: commands.run_command(command, self.tap_run_log_file, update_state_file) @@ -888,27 +1070,36 @@ def update_state_file_with_extra_log(line: str) -> str: with open(tap.state, 'w', encoding='utf-8') as statefile: statefile.write(state) - def run_tap_fastsync(self, tap: TapParams, target: TargetParams, transform: TransformParams): + def run_tap_fastsync( + self, tap: TapParams, target: TargetParams, transform: TransformParams + ): """ Generating and running shell command to sync tables using the native fastsync components """ # Build the fastsync executable command - command = commands.build_fastsync_command(tap=tap, - target=target, - transform=transform, - venv_dir=self.venv_dir, - temp_dir=self.get_temp_dir(), - tables=self.args.tables, - profiling_mode=self.profiling_mode, - profiling_dir=self.profiling_dir, - drop_pg_slot=self.drop_pg_slot) + command = commands.build_fastsync_command( + tap=tap, + target=target, + transform=transform, + venv_dir=self.venv_dir, + temp_dir=self.get_temp_dir(), + tables=self.args.tables, + profiling_mode=self.profiling_mode, + profiling_dir=self.profiling_dir, + drop_pg_slot=self.drop_pg_slot, + ) # Do not run if another instance is already running log_dir = os.path.dirname(self.tap_run_log_file) - if os.path.isdir(log_dir) and len(utils.search_files(log_dir, patterns=['*.log.running'])) > 0: + if ( + os.path.isdir(log_dir) + and len(utils.search_files(log_dir, patterns=['*.log.running'])) > 0 + ): self.logger.info( 'Failed to run. Another instance of the same tap is already running. ' - 'Log file detected in running status at %s', log_dir) + 'Log file detected in running status at %s', + log_dir, + ) sys.exit(1) # Fastsync is running in subprocess. @@ -920,7 +1111,9 @@ def add_fastsync_output_to_main_logger(line: str) -> str: if self.extra_log: # Run command and copy fastsync output to main logger - commands.run_command(command, self.tap_run_log_file, add_fastsync_output_to_main_logger) + commands.run_command( + command, self.tap_run_log_file, add_fastsync_output_to_main_logger + ) else: # Run command commands.run_command(command, self.tap_run_log_file) @@ -949,7 +1142,9 @@ def run_tap(self): tap_type = self.tap['type'] target_id = self.target['id'] target_type = self.target['type'] - stream_buffer_size = self.tap.get('stream_buffer_size', commands.DEFAULT_STREAM_BUFFER_SIZE) + stream_buffer_size = self.tap.get( + 'stream_buffer_size', commands.DEFAULT_STREAM_BUFFER_SIZE + ) self.logger.info('Running %s tap in %s target', tap_id, target_id) @@ -974,7 +1169,9 @@ def run_tap(self): # Some target attributes can be passed and override by tap (aka. 
inheritable config) # We merge the two configs and use that with the target - cons_target_config = self.create_consumable_target_config(target_config, tap_inheritable_config) + cons_target_config = self.create_consumable_target_config( + target_config, tap_inheritable_config + ) # Output will be redirected into target and tap specific log directory log_dir = self.get_tap_log_dir(target_id, tap_id) @@ -986,7 +1183,7 @@ def run_tap(self): tap_properties_fastsync, fastsync_stream_ids, tap_properties_singer, - singer_stream_ids + singer_stream_ids, ) = self.create_filtered_tap_properties( ConnectorType(target_type), ConnectorType(tap_type), @@ -995,61 +1192,84 @@ def run_tap(self): { 'selected': True, 'tap_target_pairs': FASTSYNC_PAIRS, - 'initial_sync_required': True + 'initial_sync_required': True, }, - create_fallback=True) + create_fallback=True, + ) start_time = datetime.now() try: with pidfile.PIDFile(self.tap['files']['pidfile']): - target_params = TargetParams(id=target_id, - type=target_type, - bin=self.target_bin, - python_bin=self.target_python_bin, - config=cons_target_config) - - transform_params = TransformParams(bin=self.transform_field_bin, - python_bin=self.transform_field_python_bin, - config=tap_transformation, - tap_id=tap_id, - target_id=target_id) + target_params = TargetParams( + id=target_id, + type=target_type, + bin=self.target_bin, + python_bin=self.target_python_bin, + config=cons_target_config, + ) + + transform_params = TransformParams( + bin=self.transform_field_bin, + python_bin=self.transform_field_python_bin, + config=tap_transformation, + tap_id=tap_id, + target_id=target_id, + ) # Run fastsync for FULL_TABLE replication method if len(fastsync_stream_ids) > 0: - self.logger.info('Table(s) selected to sync by fastsync: %s', fastsync_stream_ids) - self.tap_run_log_file = os.path.join(log_dir, f'{target_id}-{tap_id}-{current_time}.fastsync.log') - tap_params = TapParams(id=tap_id, - type=tap_type, - bin=self.tap_bin, - python_bin=self.tap_python_bin, - config=tap_config, - properties=tap_properties_fastsync, - state=tap_state) - - self.run_tap_fastsync(tap=tap_params, - target=target_params, - transform=transform_params) + self.logger.info( + 'Table(s) selected to sync by fastsync: %s', fastsync_stream_ids + ) + self.tap_run_log_file = os.path.join( + log_dir, f'{target_id}-{tap_id}-{current_time}.fastsync.log' + ) + tap_params = TapParams( + id=tap_id, + type=tap_type, + bin=self.tap_bin, + python_bin=self.tap_python_bin, + config=tap_config, + properties=tap_properties_fastsync, + state=tap_state, + ) + + self.run_tap_fastsync( + tap=tap_params, target=target_params, transform=transform_params + ) else: - self.logger.info('No table available that needs to be sync by fastsync') + self.logger.info( + 'No table available that needs to be sync by fastsync' + ) # Run singer tap for INCREMENTAL and LOG_BASED replication methods if len(singer_stream_ids) > 0: - self.logger.info('Table(s) selected to sync by singer: %s', singer_stream_ids) - self.tap_run_log_file = os.path.join(log_dir, f'{target_id}-{tap_id}-{current_time}.singer.log') - tap_params = TapParams(id=tap_id, - type=tap_type, - bin=self.tap_bin, - python_bin=self.tap_python_bin, - config=tap_config, - properties=tap_properties_singer, - state=tap_state) - - self.run_tap_singer(tap=tap_params, - target=target_params, - transform=transform_params, - stream_buffer_size=stream_buffer_size) + self.logger.info( + 'Table(s) selected to sync by singer: %s', singer_stream_ids + ) + self.tap_run_log_file = 
os.path.join( + log_dir, f'{target_id}-{tap_id}-{current_time}.singer.log' + ) + tap_params = TapParams( + id=tap_id, + type=tap_type, + bin=self.tap_bin, + python_bin=self.tap_python_bin, + config=tap_config, + properties=tap_properties_singer, + state=tap_state, + ) + + self.run_tap_singer( + tap=tap_params, + target=target_params, + transform=transform_params, + stream_buffer_size=stream_buffer_size, + ) else: - self.logger.info('No table available that needs to be sync by singer') + self.logger.info( + 'No table available that needs to be sync by singer' + ) except pidfile.AlreadyRunningError: self.logger.error('Another instance of the tap is already running.') @@ -1102,11 +1322,16 @@ def stop_tap(self): self.logger.info('Sending SIGINT to main pid %s...', parent.pid) parent.send_signal(signal.SIGINT) except ProcessLookupError: - self.logger.error('Pid %s not found. Is the tap running on this machine? ' - 'Stopping taps remotely is not supported.', pid) + self.logger.error( + 'Pid %s not found. Is the tap running on this machine? ' + 'Stopping taps remotely is not supported.', + pid, + ) sys.exit(1) except FileNotFoundError: - self.logger.error('No pidfile found at %s. Tap does not seem to be running.', pidfile_path) + self.logger.error( + 'No pidfile found at %s. Tap does not seem to be running.', pidfile_path + ) sys.exit(1) # pylint: disable=too-many-locals @@ -1125,7 +1350,13 @@ def sync_tables(self): target_type = self.target['type'] fastsync_bin = utils.get_fastsync_bin(self.venv_dir, tap_type, target_type) - self.logger.info('Syncing tables from %s (%s) to %s (%s)...', tap_id, tap_type, target_id, target_type) + self.logger.info( + 'Syncing tables from %s (%s) to %s (%s)...', + tap_id, + tap_type, + target_id, + target_type, + ) # Run only if tap enabled if not self.tap.get('enabled', False): @@ -1135,14 +1366,19 @@ def sync_tables(self): # Run only if tap not running tap_status = self.detect_tap_status(target_id, tap_id) if tap_status['currentStatus'] == 'running': - self.logger.info('Tap %s is currently running and cannot sync. Stop the tap and try again.', - self.tap['name']) + self.logger.info( + 'Tap %s is currently running and cannot sync. Stop the tap and try again.', + self.tap['name'], + ) sys.exit(1) # Tap exists but configuration not completed if not os.path.isfile(fastsync_bin): - self.logger.error('Table sync function is not implemented from %s datasources to %s type of targets', - tap_type, target_type) + self.logger.error( + 'Table sync function is not implemented from %s datasources to %s type of targets', + tap_type, + target_type, + ) sys.exit(1) # Generate and run the command to run the tap directly @@ -1159,7 +1395,9 @@ def sync_tables(self): # Some target attributes can be passed and override by tap (aka. 
inheritable config) # We merge the two configs and use that with the target - cons_target_config = self.create_consumable_target_config(target_config, tap_inheritable_config) + cons_target_config = self.create_consumable_target_config( + target_config, tap_inheritable_config + ) # Output will be redirected into target and tap specific log directory log_dir = self.get_tap_log_dir(target_id, tap_id) @@ -1168,7 +1406,9 @@ def sync_tables(self): # sync_tables command always using fastsync try: with pidfile.PIDFile(self.tap['files']['pidfile']): - self.tap_run_log_file = os.path.join(log_dir, f'{target_id}-{tap_id}-{current_time}.fastsync.log') + self.tap_run_log_file = os.path.join( + log_dir, f'{target_id}-{tap_id}-{current_time}.fastsync.log' + ) # Create parameters as NamedTuples tap_params = TapParams( @@ -1178,14 +1418,15 @@ def sync_tables(self): python_bin=self.tap_python_bin, config=tap_config, properties=tap_properties, - state=tap_state) + state=tap_state, + ) target_params = TargetParams( id=target_id, type=target_type, bin=self.target_bin, python_bin=self.target_python_bin, - config=cons_target_config + config=cons_target_config, ) transform_params = TransformParams( @@ -1193,12 +1434,12 @@ def sync_tables(self): config=tap_transformation, python_bin=self.transform_field_python_bin, tap_id=tap_id, - target_id=target_id + target_id=target_id, ) - self.run_tap_fastsync(tap=tap_params, - target=target_params, - transform=transform_params) + self.run_tap_fastsync( + tap=tap_params, target=target_params, transform=transform_params + ) except pidfile.AlreadyRunningError: self.logger.error('Another instance of the tap is already running.') @@ -1237,7 +1478,9 @@ def validate(self): # Validate target json schemas and that no duplicate IDs exist for yaml_file in target_yamls: self.logger.info('Started validating %s', yaml_file) - loaded_yaml = utils.load_yaml(os.path.join(yaml_dir, yaml_file), vault_secret) + loaded_yaml = utils.load_yaml( + os.path.join(yaml_dir, yaml_file), vault_secret + ) utils.validate(loaded_yaml, target_schema) if loaded_yaml['id'] in target_ids: @@ -1252,7 +1495,9 @@ def validate(self): # Validate tap json schemas, check that every tap has valid 'target' and that no duplicate IDs exist for yaml_file in tap_yamls: self.logger.info('Started validating %s', yaml_file) - loaded_yaml = utils.load_yaml(os.path.join(yaml_dir, yaml_file), vault_secret) + loaded_yaml = utils.load_yaml( + os.path.join(yaml_dir, yaml_file), vault_secret + ) utils.validate(loaded_yaml, tap_schema) if loaded_yaml['id'] in tap_ids: @@ -1260,8 +1505,12 @@ def validate(self): sys.exit(1) if loaded_yaml['target'] not in target_ids: - self.logger.error("Can'f find the target with the ID '%s' referenced in '%s'. Available target IDs: %s", - loaded_yaml['target'], yaml_file, target_ids) + self.logger.error( + "Can'f find the target with the ID '%s' referenced in '%s'. 
Available target IDs: %s", + loaded_yaml['target'], + yaml_file, + target_ids, + ) sys.exit(1) tap_ids.add(loaded_yaml['id']) @@ -1302,11 +1551,17 @@ def import_project(self): with parallel_backend('threading', n_jobs=-1): # Discover taps in parallel and return the list of exception of the failed ones - discover_excs.extend(list(filter(None, - Parallel(verbose=100)(delayed(self.discover_tap)( - tap=tap, - target=target - ) for tap in target.get('taps'))))) + discover_excs.extend( + list( + filter( + None, + Parallel(verbose=100)( + delayed(self.discover_tap)(tap=tap, target=target) + for tap in target.get('taps') + ), + ) + ) + ) # Log summary end_time = datetime.now() @@ -1327,7 +1582,7 @@ def import_project(self): total_taps, total_taps - len(discover_excs), str(discover_excs), - end_time - start_time + end_time - start_time, ) if len(discover_excs) > 0: sys.exit(1) @@ -1342,7 +1597,9 @@ def encrypt_string(self): print(yaml_text) print('Encryption successful') - def _is_initial_sync_required(self, replication_method: str, stream_bookmark: Dict) -> bool: + def _is_initial_sync_required( + self, replication_method: str, stream_bookmark: Dict + ) -> bool: """ Detects if a stream needs initial sync or not. Initial sync is required for INCREMENTAL and LOG_BASED tables @@ -1360,14 +1617,20 @@ def _is_initial_sync_required(self, replication_method: str, stream_bookmark: Di :param stream_bookmark: stream state bookmark :return: Boolean, True if needs initial sync, False otherwise """ - return replication_method == self.FULL_TABLE \ - or (replication_method == self.INCREMENTAL and - 'replication_key_value' not in stream_bookmark and - 'modified_since' not in stream_bookmark) \ - or (replication_method == self.LOG_BASED and - 'lsn' not in stream_bookmark and - 'log_pos' not in stream_bookmark and - 'token' not in stream_bookmark) + return ( + replication_method == self.FULL_TABLE + or ( + replication_method == self.INCREMENTAL + and 'replication_key_value' not in stream_bookmark + and 'modified_since' not in stream_bookmark + ) + or ( + replication_method == self.LOG_BASED + and 'lsn' not in stream_bookmark + and 'log_pos' not in stream_bookmark + and 'token' not in stream_bookmark + ) + ) # pylint: disable=unused-argument def _exit_gracefully(self, sig, frame, exit_code=1): @@ -1414,7 +1677,9 @@ def _print_tap_run_summary(self, status, start_time, end_time): logfile.write(summary) # pylint: disable=unused-variable - def _run_post_import_tap_checks(self, tap: Dict, catalog: Dict, target_id: str) -> List: + def _run_post_import_tap_checks( + self, tap: Dict, catalog: Dict, target_id: str + ) -> List: """ Run post import checks on a tap. 
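The expression returned by _is_initial_sync_required above is what routes a stream to fastsync: FULL_TABLE streams always need an initial sync, while INCREMENTAL and LOG_BASED streams only need it until a bookmark appears in the state. A minimal standalone sketch of that decision, with module-level string constants standing in for the class attributes (the attribute values are assumed here, since they are defined elsewhere in this class):

# Standalone sketch of the initial-sync decision; mirrors the expression above.
from typing import Dict

FULL_TABLE = 'FULL_TABLE'
INCREMENTAL = 'INCREMENTAL'
LOG_BASED = 'LOG_BASED'


def is_initial_sync_required(replication_method: str, stream_bookmark: Dict) -> bool:
    """True when the stream still needs an initial (fastsync) load."""
    return (
        replication_method == FULL_TABLE
        or (
            replication_method == INCREMENTAL
            and 'replication_key_value' not in stream_bookmark
            and 'modified_since' not in stream_bookmark
        )
        or (
            replication_method == LOG_BASED
            and 'lsn' not in stream_bookmark
            and 'log_pos' not in stream_bookmark
            and 'token' not in stream_bookmark
        )
    )


# FULL_TABLE streams always resync; bookmarked INCREMENTAL/LOG_BASED streams do not.
assert is_initial_sync_required(FULL_TABLE, {'lsn': 1234})
assert not is_initial_sync_required(LOG_BASED, {'lsn': 1234})
assert not is_initial_sync_required(INCREMENTAL, {'replication_key_value': 42})
assert is_initial_sync_required(INCREMENTAL, {})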
@@ -1426,10 +1691,8 @@ def _run_post_import_tap_checks(self, tap: Dict, catalog: Dict, target_id: str) errors = [] error = self.__validate_transformations( - tap.get('files', {}).get('transformation'), - catalog, - tap['id'], - target_id) + tap.get('files', {}).get('transformation'), catalog, tap['id'], target_id + ) if error: errors.append(error) @@ -1453,19 +1716,22 @@ def _run_post_import_tap_checks(self, tap: Dict, catalog: Dict, target_id: str) primary_key_required = tap.get('primary_key_required', True) # Check if primary key is set for INCREMENTAL and LOG_BASED replications - if (selected and replication_method in [self.INCREMENTAL, self.LOG_BASED] and - len(table_key_properties) == 0 and primary_key_required): - errors.append(f'No primary key set for {tap_stream_id} stream ({replication_method})') + if ( + selected + and replication_method in [self.INCREMENTAL, self.LOG_BASED] + and len(table_key_properties) == 0 + and primary_key_required + ): + errors.append( + f'No primary key set for {tap_stream_id} stream ({replication_method})' + ) break return errors def __validate_transformations( - self, - transformation_file: str, - catalog: Dict, - tap_id: str, - target_id: str) -> Optional[str]: + self, transformation_file: str, catalog: Dict, tap_id: str, target_id: str + ) -> Optional[str]: """ Run validation of transformation config Args: @@ -1480,9 +1746,9 @@ def __validate_transformations( # create a temp file with the content being the given catalog object # we need this file to execute the validation cli command - temp_catalog_file = utils.create_temp_file(dir=self.get_temp_dir(), - prefix='properties_', - suffix='.json')[1] + temp_catalog_file = utils.create_temp_file( + dir=self.get_temp_dir(), prefix='properties_', suffix='.json' + )[1] utils.save_json(catalog, temp_catalog_file) @@ -1491,7 +1757,9 @@ def __validate_transformations( """ if self.profiling_mode: - dump_file = os.path.join(self.profiling_dir, f'transformation_{tap_id}_{target_id}.pstat') + dump_file = os.path.join( + self.profiling_dir, f'transformation_{tap_id}_{target_id}.pstat' + ) command = f'{self.transform_field_python_bin} -m cProfile -o {dump_file} {command}' self.logger.debug('Transformation validation command: %s', command) diff --git a/pipelinewise/cli/tap_properties.py b/pipelinewise/cli/tap_properties.py index b90c7114e..81cae13b6 100644 --- a/pipelinewise/cli/tap_properties.py +++ b/pipelinewise/cli/tap_properties.py @@ -105,31 +105,33 @@ def get_tap_properties(tap=None, temp_dir=None): 'tap_stream_name_pattern': '{{schema_name}}-{{table_name}}', 'tap_catalog_argument': '--properties', 'default_replication_method': 'LOG_BASED', - 'default_data_flattening_max_level': 0 + 'default_data_flattening_max_level': 0, }, 'tap-postgres': { 'tap_config_extras': { # Set tap_id to locate the corresponding replication slot - 'tap_id': tap['id'] if tap else None, + 'tap_id': tap['id'] + if tap + else None, }, 'tap_stream_id_pattern': '{{schema_name}}-{{table_name}}', 'tap_stream_name_pattern': '{{schema_name}}-{{table_name}}', 'tap_catalog_argument': '--properties', 'default_replication_method': 'LOG_BASED', - 'default_data_flattening_max_level': 0 + 'default_data_flattening_max_level': 0, }, 'tap-zuora': { 'tap_config_extras': { 'username': tap.get('db_conn', {}).get('username') if tap else None, 'password': tap.get('db_conn', {}).get('password') if tap else None, 'start_date': tap.get('db_conn', {}).get('start_date') if tap else None, - 'api_type': tap.get('db_conn', {}).get('api_type') if tap else None + 
'api_type': tap.get('db_conn', {}).get('api_type') if tap else None, }, 'tap_stream_id_pattern': '{{table_name}}', 'tap_stream_name_pattern': '{{table_name}}', 'tap_catalog_argument': '--catalog', 'default_replication_method': 'FULL_TABLE', - 'default_data_flattening_max_level': 10 + 'default_data_flattening_max_level': 10, }, 'tap-oracle': { 'tap_config_extras': {}, @@ -137,18 +139,15 @@ def get_tap_properties(tap=None, temp_dir=None): 'tap_stream_name_pattern': '{{schema_name}}-{{table_name}}', 'tap_catalog_argument': '--catalog', 'default_replication_method': 'LOG_BASED', - 'default_data_flattening_max_level': 0 + 'default_data_flattening_max_level': 0, }, 'tap-kafka': { - 'tap_config_extras': { - 'local_store_dir': temp_dir, - 'encoding': 'utf-8' - }, + 'tap_config_extras': {'local_store_dir': temp_dir, 'encoding': 'utf-8'}, 'tap_stream_id_pattern': '{{table_name}}', 'tap_stream_name_pattern': '{{table_name}}', 'tap_catalog_argument': '--properties', 'default_replication_method': 'LOG_BASED', - 'default_data_flattening_max_level': 0 + 'default_data_flattening_max_level': 0, }, 'tap-zendesk': { 'tap_config_extras': {}, @@ -156,7 +155,7 @@ def get_tap_properties(tap=None, temp_dir=None): 'tap_stream_name_pattern': '{{table_name}}', 'tap_catalog_argument': '--catalog', 'default_replication_method': 'INCREMENTAL', - 'default_data_flattening_max_level': 10 + 'default_data_flattening_max_level': 10, }, 'tap-adwords': { 'tap_config_extras': {}, @@ -164,27 +163,23 @@ def get_tap_properties(tap=None, temp_dir=None): 'tap_stream_name_pattern': '{{table_name}}', 'tap_catalog_argument': '--catalog', 'default_replication_method': 'INCREMENTAL', - 'default_data_flattening_max_level': 0 + 'default_data_flattening_max_level': 0, }, 'tap-jira': { - 'tap_config_extras': { - 'user_agent': 'PipelineWise - Tap Jira' - }, + 'tap_config_extras': {'user_agent': 'PipelineWise - Tap Jira'}, 'tap_stream_id_pattern': '{{table_name}}', 'tap_stream_name_pattern': '{{table_name}}', 'tap_catalog_argument': '--properties', 'default_replication_method': 'INCREMENTAL', - 'default_data_flattening_max_level': 0 + 'default_data_flattening_max_level': 0, }, 'tap-s3-csv': { - 'tap_config_extras': { - 'tables': generate_tap_s3_csv_to_table_mappings(tap) - }, + 'tap_config_extras': {'tables': generate_tap_s3_csv_to_table_mappings(tap)}, 'tap_stream_id_pattern': '{{table_name}}', 'tap_stream_name_pattern': '{{table_name}}', 'tap_catalog_argument': '--properties', 'default_replication_method': 'INCREMENTAL', - 'default_data_flattening_max_level': 0 + 'default_data_flattening_max_level': 0, }, 'tap-snowflake': { 'tap_config_extras': { @@ -195,28 +190,26 @@ def get_tap_properties(tap=None, temp_dir=None): 'tap_stream_name_pattern': '{{schema_name}}-{{table_name}}', 'tap_catalog_argument': '--properties', 'default_replication_method': 'INCREMENTAL', - 'default_data_flattening_max_level': 0 + 'default_data_flattening_max_level': 0, }, 'tap-salesforce': { - 'tap_config_extras': { - 'select_fields_by_default': True - }, + 'tap_config_extras': {'select_fields_by_default': True}, 'tap_stream_id_pattern': '{{table_name}}', 'tap_stream_name_pattern': '{{table_name}}', 'tap_catalog_argument': '--properties', 'default_replication_method': 'INCREMENTAL', - 'default_data_flattening_max_level': 10 + 'default_data_flattening_max_level': 10, }, 'tap-mongodb': { 'tap_config_extras': { 'database': tap.get('db_conn', {}).get('dbname') if tap else None, - 'include_schemas_in_destination_stream_name': 'true' + 
'include_schemas_in_destination_stream_name': 'true', }, 'tap_stream_id_pattern': '{{database_name}}-{{table_name}}', 'tap_stream_name_pattern': '{{database_name}}-{{table_name}}', 'tap_catalog_argument': '--catalog', 'default_replication_method': 'LOG_BASED', - 'default_data_flattening_max_level': 0 + 'default_data_flattening_max_level': 0, }, 'tap-google-analytics': { 'tap_config_extras': {}, @@ -224,7 +217,7 @@ def get_tap_properties(tap=None, temp_dir=None): 'tap_stream_name_pattern': '{{table_name}}', 'tap_catalog_argument': '--catalog', 'default_replication_method': 'INCREMENTAL', - 'default_data_flattening_max_level': 0 + 'default_data_flattening_max_level': 0, }, 'tap-github': { 'tap_config_extras': { @@ -236,7 +229,7 @@ def get_tap_properties(tap=None, temp_dir=None): 'tap_stream_name_pattern': '{{table_name}}', 'tap_catalog_argument': '--properties', 'default_replication_method': 'LOG_BASED', - 'default_data_flattening_max_level': 0 + 'default_data_flattening_max_level': 0, }, 'tap-shopify': { 'tap_config_extras': {}, @@ -244,7 +237,7 @@ def get_tap_properties(tap=None, temp_dir=None): 'tap_stream_name_pattern': '{{table_name}}', 'tap_catalog_argument': '--catalog', 'default_replication_method': 'INCREMENTAL', - 'default_data_flattening_max_level': 0 + 'default_data_flattening_max_level': 0, }, 'tap-slack': { 'tap_config_extras': {}, @@ -252,19 +245,23 @@ def get_tap_properties(tap=None, temp_dir=None): 'tap_stream_name_pattern': '{{table_name}}', 'tap_catalog_argument': '--catalog', 'default_replication_method': 'LOG_BASED', - 'default_data_flattening_max_level': 0 + 'default_data_flattening_max_level': 0, }, 'tap-mixpanel': { 'tap_config_extras': { 'user_agent': 'PipelineWise - Tap Mixpanel', # Do not denest properties by default - 'denest_properties': tap.get('db_conn', {}).get('denest_properties', 'false') if tap else None + 'denest_properties': tap.get('db_conn', {}).get( + 'denest_properties', 'false' + ) + if tap + else None, }, 'tap_stream_id_pattern': '{{table_name}}', 'tap_stream_name_pattern': '{{table_name}}', 'tap_catalog_argument': '--catalog', 'default_replication_method': 'LOG_BASED', - 'default_data_flattening_max_level': 0 + 'default_data_flattening_max_level': 0, }, 'tap-twilio': { 'tap_config_extras': {}, @@ -272,7 +269,7 @@ def get_tap_properties(tap=None, temp_dir=None): 'tap_stream_name_pattern': '{{table_name}}', 'tap_catalog_argument': '--catalog', 'default_replication_method': 'INCREMENTAL', - 'default_data_flattening_max_level': 0 + 'default_data_flattening_max_level': 0, }, # Default values to use as a fallback method 'DEFAULT': { @@ -281,6 +278,6 @@ def get_tap_properties(tap=None, temp_dir=None): 'tap_stream_name_pattern': '{{schema_name}}-{{table_name}}', 'tap_catalog_argument': '--catalog', 'default_replication_method': 'LOG_BASED', - 'default_data_flattening_max_level': 0 + 'default_data_flattening_max_level': 0, }, } diff --git a/pipelinewise/cli/utils.py b/pipelinewise/cli/utils.py index 1d3dc4ff0..9015b4dcf 100644 --- a/pipelinewise/cli/utils.py +++ b/pipelinewise/cli/utils.py @@ -23,7 +23,7 @@ from ansible.module_utils._text import to_text from ansible.module_utils.common._collections_compat import Mapping from ansible.parsing.dataloader import DataLoader -from ansible.parsing.vault import (VaultLib, get_file_vault_secret, is_encrypted_file) +from ansible.parsing.vault import VaultLib, get_file_vault_secret, is_encrypted_file from ansible.parsing.yaml.loader import AnsibleLoader from ansible.parsing.yaml.objects import AnsibleMapping, 
AnsibleVaultEncryptedUnicode @@ -116,7 +116,9 @@ def save_json(data, path): try: LOGGER.debug('Saving JSON %s', path) with open(path, 'w', encoding='utf-8') as jsonfile: - return json.dump(data, jsonfile, cls=AnsibleJSONEncoder, indent=4, sort_keys=True) + return json.dump( + data, jsonfile, cls=AnsibleJSONEncoder, indent=4, sort_keys=True + ) except Exception as exc: raise Exception(f'Cannot save JSON {path} {exc}') from exc @@ -156,8 +158,12 @@ def get_tap_target_names(yaml_dir): (tap_yamls, target_yamls): tap_yamls is a list of names inside yaml_dir with "tap_*.y(a)ml" pattern. target_yamls is a list of names inside yaml_dir with "target_*.y(a)ml" pattern. """ - yamls = [f for f in os.listdir(yaml_dir) if os.path.isfile(os.path.join(yaml_dir, f)) - and (f.endswith('.yml') or f.endswith('.yaml'))] + yamls = [ + f + for f in os.listdir(yaml_dir) + if os.path.isfile(os.path.join(yaml_dir, f)) + and (f.endswith('.yml') or f.endswith('.yaml')) + ] target_yamls = set(filter(lambda y: y.startswith('target_'), yamls)) tap_yamls = set(filter(lambda y: y.startswith('tap_'), yamls)) @@ -196,11 +202,15 @@ def load_yaml(yaml_file, vault_secret=None): try: data = loader.get_single_data() except Exception as exc: - raise Exception(f'Error when loading YAML config at {yaml_file} {exc}') from exc + raise Exception( + f'Error when loading YAML config at {yaml_file} {exc}' + ) from exc finally: loader.dispose() except yaml.YAMLError as exc: - raise Exception(f'Error when loading YAML config at {yaml_file} {exc}') from exc + raise Exception( + f'Error when loading YAML config at {yaml_file} {exc}' + ) from exc else: LOGGER.debug('No file at %s', yaml_file) @@ -267,7 +277,9 @@ def get_sample_file_paths(): Get list of every available sample files (YAML, etc.) with absolute paths """ samples_dir = os.path.join(os.path.dirname(__file__), 'samples') - return search_files(samples_dir, patterns=['config.yml', '*.yml.sample', 'README.md'], abs_path=True) + return search_files( + samples_dir, patterns=['config.yml', '*.yml.sample', 'README.md'], abs_path=True + ) def validate(instance, schema): @@ -299,7 +311,11 @@ def delete_keys_from_dict(dic, keys): if isinstance(dic, list): return [v for v in (delete_keys_from_dict(v, keys) for v in dic) if v] # pylint: disable=C0325 # False positive on tuples - return {k: v for k, v in ((k, delete_keys_from_dict(v, keys)) for k, v in dic.items()) if k not in keys} + return { + k: v + for k, v in ((k, delete_keys_from_dict(v, keys)) for k, v in dic.items()) + if k not in keys + } def silentremove(path): @@ -327,7 +343,9 @@ def search_files(search_dir, patterns=None, sort=False, abs_path=False): # Search files and sort if required p_files = [] for pattern in patterns: - p_files.extend(filter(os.path.isfile, glob.glob(os.path.join(search_dir, pattern)))) + p_files.extend( + filter(os.path.isfile, glob.glob(os.path.join(search_dir, pattern))) + ) if sort: p_files.sort(key=os.path.getmtime, reverse=True) @@ -368,7 +386,7 @@ def extract_log_attributes(log_file): 'tap_id': tap_id, 'timestamp': timestamp, 'sync_engine': sync_engine, - 'status': status + 'status': status, } @@ -412,10 +430,11 @@ def get_tap_stream_id(tap, database_name, schema_name, table_name): """ pattern = get_tap_property(tap, 'tap_stream_id_pattern') - return pattern \ - .replace('{{database_name}}', f'{database_name}') \ - .replace('{{schema_name}}', f'{schema_name}') \ + return ( + pattern.replace('{{database_name}}', f'{database_name}') + .replace('{{schema_name}}', f'{schema_name}') .replace('{{table_name}}', 
f'{table_name}') + ) def get_tap_stream_name(tap, database_name, schema_name, table_name): @@ -428,10 +447,11 @@ def get_tap_stream_name(tap, database_name, schema_name, table_name): """ pattern = get_tap_property(tap, 'tap_stream_name_pattern') - return pattern \ - .replace('{{database_name}}', f'{database_name}') \ - .replace('{{schema_name}}', f'{schema_name}') \ + return ( + pattern.replace('{{database_name}}', f'{database_name}') + .replace('{{schema_name}}', f'{schema_name}') .replace('{{table_name}}', f'{table_name}') + ) def get_tap_default_replication_method(tap): @@ -498,7 +518,8 @@ def find_errors_in_log_file(file, max_errors=10, error_pattern=None): r'botocore\.exceptions\.|' # Generic python exceptions r'\.[E|e]xception|' - r'\.[E|e]rror') + r'\.[E|e]rror' + ) # Use known error patterns by default if not error_pattern: @@ -533,5 +554,6 @@ def generate_random_string(length: int = 8) -> str: if 0 < length < 8: warnings.warn('Length is too small! consider 8 or more characters') - return ''.join(secrets.choice(string.ascii_uppercase + string.digits) - for _ in range(length)) + return ''.join( + secrets.choice(string.ascii_uppercase + string.digits) for _ in range(length) + ) diff --git a/pipelinewise/fastsync/commons/errors.py b/pipelinewise/fastsync/commons/errors.py index f774367cd..c21e69796 100644 --- a/pipelinewise/fastsync/commons/errors.py +++ b/pipelinewise/fastsync/commons/errors.py @@ -1,11 +1,14 @@ class ExportError(Exception): """Raised when export fails""" + class TableNotFoundError(Exception): """Raised when configured table doesn't exist in source""" + class MongoDBInvalidDatetimeError(Exception): """Raised when a bson datetime is invalid and cannot be serialized""" + class UnsupportedKeyTypeException(Exception): """Raised if key type is unsupported""" diff --git a/pipelinewise/fastsync/commons/split_gzip.py b/pipelinewise/fastsync/commons/split_gzip.py index 8d7838915..3f49684fd 100644 --- a/pipelinewise/fastsync/commons/split_gzip.py +++ b/pipelinewise/fastsync/commons/split_gzip.py @@ -16,7 +16,14 @@ # pylint: disable=W0622,R1732 -def open(base_filename, mode='wb', chunk_size_mb=None, max_chunks=None, est_compr_rate=None, compress=True): +def open( + base_filename, + mode='wb', + chunk_size_mb=None, + max_chunks=None, + est_compr_rate=None, + compress=True, +): """Open a gzip-compressed file in binary or text mode. Args: @@ -40,7 +47,9 @@ def open(base_filename, mode='wb', chunk_size_mb=None, max_chunks=None, est_comp raise ValueError('Invalid chunk_size_mb: %d' % (chunk_size_mb,)) if max_chunks is not None and max_chunks < 0: raise ValueError('Invalid max_chunks: %d' % (max_chunks,)) - return SplitGzipFile(base_filename, mode, chunk_size_mb, max_chunks, est_compr_rate, compress) + return SplitGzipFile( + base_filename, mode, chunk_size_mb, max_chunks, est_compr_rate, compress + ) # pylint: disable=R0902 @@ -49,22 +58,27 @@ class SplitGzipFile(io.BufferedIOBase): This class only supports writing files in binary mode. 
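split_gzip.open above mirrors gzip.open but can roll the output across several chunk files once an estimated size per chunk is reached. A short usage sketch, assuming the import path shown in this diff; the output path and chunk settings are illustrative only:

# Illustrative use of the splitter defined above; path and sizes are examples.
import csv

from pipelinewise.fastsync.commons import split_gzip

gzip_splitter = split_gzip.open(
    'example_table.csv.gz',
    mode='wt',
    chunk_size_mb=100,   # roll to the next chunk once the estimated chunk size exceeds this
    max_chunks=5,        # 0 disables splitting and keeps a single output file
    compress=True,
)

with gzip_splitter as gzfile:
    writer = csv.writer(gzfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    writer.writerow(['id', 'name'])
    writer.writerows([[1, 'alpha'], [2, 'beta']])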
""" - def __init__(self, - base_filename, - mode: str = None, - chunk_size_mb: int = None, - max_chunks: int = None, - est_compr_rate: float = None, - compress=True): + + def __init__( + self, + base_filename, + mode: str = None, + chunk_size_mb: int = None, + max_chunks: int = None, + est_compr_rate: float = None, + compress=True, + ): super().__init__() self.base_filename = base_filename self.mode = mode self.chunk_size_mb = chunk_size_mb or DEFAULT_CHUNK_SIZE_MB self.max_chunks = max_chunks if max_chunks is not None else DEFAULT_MAX_CHUNKS - self.compress= compress + self.compress = compress if compress: - self.est_compr_rate = est_compr_rate if est_compr_rate is not None else EST_COMPR_RATE + self.est_compr_rate = ( + est_compr_rate if est_compr_rate is not None else EST_COMPR_RATE + ) else: self.est_compr_rate = 1.0 self.chunk_seq = 1 @@ -85,7 +99,10 @@ def _gen_chunk_filename(self) -> str: if self.max_chunks == 0: chunk_filename = self.base_filename else: - if self.current_chunk_size_mb >= self.chunk_size_mb and self.chunk_seq < self.max_chunks: + if ( + self.current_chunk_size_mb >= self.chunk_size_mb + and self.chunk_seq < self.max_chunks + ): # Increase the chunk sequence and reset size to zero self.chunk_seq += 1 self.current_chunk_size_mb = 0 @@ -109,7 +126,9 @@ def _activate_chunk_file(self): if self.compress: self.chunk_file = gzip.open(self.chunk_filename, self.mode) else: - self.chunk_file = builtins.open(self.chunk_filename, self.mode, encoding='utf-8') + self.chunk_file = builtins.open( + self.chunk_filename, self.mode, encoding='utf-8' + ) @staticmethod def _bytes_to_megabytes(size: int) -> float: @@ -136,7 +155,9 @@ def write(self, _bytes): self._activate_chunk_file() self.chunk_file.write(_bytes) - self.current_chunk_size_mb = SplitGzipFile._bytes_to_megabytes(self.chunk_file.tell() * self.est_compr_rate) + self.current_chunk_size_mb = SplitGzipFile._bytes_to_megabytes( + self.chunk_file.tell() * self.est_compr_rate + ) def close(self): """ diff --git a/pipelinewise/fastsync/commons/tap_mongodb.py b/pipelinewise/fastsync/commons/tap_mongodb.py index f8ec7304a..ee4498a97 100644 --- a/pipelinewise/fastsync/commons/tap_mongodb.py +++ b/pipelinewise/fastsync/commons/tap_mongodb.py @@ -18,7 +18,12 @@ from singer.utils import strftime as singer_strftime from . import utils, split_gzip -from .errors import ExportError, TableNotFoundError, MongoDBInvalidDatetimeError, UnsupportedKeyTypeException +from .errors import ( + ExportError, + TableNotFoundError, + MongoDBInvalidDatetimeError, + UnsupportedKeyTypeException, +) LOGGER = logging.getLogger(__name__) DEFAULT_WRITE_BATCH_ROWS = 50000 @@ -33,8 +38,11 @@ def serialize_document(document: Dict) -> Dict: Returns: Dict """ - return {key: transform_value(val, [key]) for key, val in document.items() - if not isinstance(val, (bson.min_key.MinKey, bson.max_key.MaxKey))} + return { + key: transform_value(val, [key]) + for key, val in document.items() + if not isinstance(val, (bson.min_key.MinKey, bson.max_key.MaxKey)) + } def class_to_string(key_value: Any, key_type: str) -> str: @@ -90,15 +98,18 @@ def safe_transform_datetime(value: datetime.datetime, path) -> str: # make sense to blow up on invalid Python datetimes (e.g., # year=0). In this case we're formatting it as a string and # passing it along down the pipeline. 
- return '{:04d}-{:02d}-{:02d}T{:02d}:{:02d}:{:02d}.{:06d}Z'.format(value.year, - value.month, - value.day, - value.hour, - value.minute, - value.second, - value.microsecond) - raise MongoDBInvalidDatetimeError('Found invalid datetime at [{}]: {}'.format('.'.join(map(str, path)), - value)) from ex + return '{:04d}-{:02d}-{:02d}T{:02d}:{:02d}:{:02d}.{:06d}Z'.format( + value.year, + value.month, + value.day, + value.hour, + value.minute, + value.second, + value.microsecond, + ) + raise MongoDBInvalidDatetimeError( + 'Found invalid datetime at [{}]: {}'.format('.'.join(map(str, path)), value) + ) from ex return singer_strftime(utc_datetime) @@ -113,8 +124,12 @@ def transform_value(value: Any, path) -> Any: """ conversion = { - list: lambda val, pat: list(map(lambda v: transform_value(v[1], pat + [v[0]]), enumerate(val))), - dict: lambda val, pat: {k: transform_value(v, pat + [k]) for k, v in val.items()}, + list: lambda val, pat: list( + map(lambda v: transform_value(v[1], pat + [v[0]]), enumerate(val)) + ), + dict: lambda val, pat: { + k: transform_value(v, pat + [k]) for k, v in val.items() + }, uuid.UUID: lambda val, _: class_to_string(val, 'UUID'), bson.objectid.ObjectId: lambda val, _: class_to_string(val, 'ObjectId'), bson.datetime.datetime: safe_transform_datetime, @@ -124,8 +139,12 @@ def transform_value(value: Any, path) -> Any: datetime.datetime: lambda val, _: class_to_string(val, 'datetime'), bson.decimal128.Decimal128: lambda val, _: val.to_decimal(), bson.regex.Regex: lambda val, _: dict(pattern=val.pattern, flags=val.flags), - bson.code.Code: lambda val, _: dict(value=str(val), scope=str(val.scope)) if val.scope else str(val), - bson.dbref.DBRef: lambda val, _: dict(id=str(val.id), collection=val.collection, database=val.database), + bson.code.Code: lambda val, _: dict(value=str(val), scope=str(val.scope)) + if val.scope + else str(val), + bson.dbref.DBRef: lambda val, _: dict( + id=str(val.id), collection=val.collection, database=val.database + ), } if isinstance(value, tuple(conversion.keys())): @@ -147,8 +166,9 @@ def __init__(self, connection_config: Dict, tap_type_to_target_type: Callable): tap_type_to_target_type: Function that maps tap types to target ones """ self.connection_config = connection_config - self.connection_config['write_batch_rows'] = connection_config.get('write_batch_rows', - DEFAULT_WRITE_BATCH_ROWS) + self.connection_config['write_batch_rows'] = connection_config.get( + 'write_batch_rows', DEFAULT_WRITE_BATCH_ROWS + ) self.tap_type_to_target_type = tap_type_to_target_type self.database: Optional[Database] = None @@ -161,17 +181,24 @@ def open_connection(self): verify_mode = self.connection_config.get('verify_mode', 'true') == 'true' use_ssl = self.connection_config.get('ssl') == 'true' - connection_params = dict(host=self.connection_config['host'], port=int(self.connection_config['port']), - username=self.connection_config['user'], password=self.connection_config['password'], - authSource=self.connection_config['auth_database'], ssl=use_ssl, - replicaSet=self.connection_config.get('replica_set', None), - readPreference='secondaryPreferred') + connection_params = dict( + host=self.connection_config['host'], + port=int(self.connection_config['port']), + username=self.connection_config['user'], + password=self.connection_config['password'], + authSource=self.connection_config['auth_database'], + ssl=use_ssl, + replicaSet=self.connection_config.get('replica_set', None), + readPreference='secondaryPreferred', + ) # NB: "ssl_cert_reqs" must ONLY be supplied 
if `SSL` is true. if not verify_mode and use_ssl: connection_params['ssl_cert_reqs'] = ssl.CERT_NONE - self.database = MongoClient(**connection_params)[self.connection_config['database']] + self.database = MongoClient(**connection_params)[ + self.connection_config['database'] + ] def close_connection(self): """ @@ -180,15 +207,16 @@ def close_connection(self): self.database.client.close() # pylint: disable=R0914,R0913 - def copy_table(self, - table_name: str, - filepath: str, - temp_dir: str, - split_large_files=False, - split_file_chunk_size_mb=1000, - split_file_max_chunks=20, - compress=True - ): + def copy_table( + self, + table_name: str, + filepath: str, + temp_dir: str, + split_large_files=False, + split_file_chunk_size_mb=1000, + split_file_max_chunks=20, + compress=True, + ): """ Export data from table to a zipped csv Args: @@ -213,17 +241,23 @@ def copy_table(self, exported_rows = 0 try: - gzip_splitter = split_gzip.open(filepath, - mode='wt', - chunk_size_mb=split_file_chunk_size_mb, - max_chunks=split_file_max_chunks if split_large_files else 0, - compress=compress) - with gzip.open(export_file_path, 'rb') as export_file, gzip_splitter as gzfile: - writer = csv.DictWriter(gzfile, - fieldnames=[elem[0] for elem in self._get_collection_columns()], - delimiter=',', - quotechar='"', - quoting=csv.QUOTE_MINIMAL) + gzip_splitter = split_gzip.open( + filepath, + mode='wt', + chunk_size_mb=split_file_chunk_size_mb, + max_chunks=split_file_max_chunks if split_large_files else 0, + compress=compress, + ) + with gzip.open( + export_file_path, 'rb' + ) as export_file, gzip_splitter as gzfile: + writer = csv.DictWriter( + gzfile, + fieldnames=[elem[0] for elem in self._get_collection_columns()], + delimiter=',', + quotechar='"', + quoting=csv.QUOTE_MINIMAL, + ) writer.writeheader() rows = [] @@ -233,15 +267,22 @@ def copy_table(self, # bson.decode_file_iter will generate one document at a time from the exported file for document in bson.decode_file_iter(export_file): try: - rows.append({ - '_ID': str(document['_id']), - 'DOCUMENT': ujson.dumps(serialize_document(document)), - utils.SDC_EXTRACTED_AT: extracted_at, - utils.SDC_BATCHED_AT: datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S.%f'), - utils.SDC_DELETED_AT: None - }) + rows.append( + { + '_ID': str(document['_id']), + 'DOCUMENT': ujson.dumps(serialize_document(document)), + utils.SDC_EXTRACTED_AT: extracted_at, + utils.SDC_BATCHED_AT: datetime.datetime.utcnow().strftime( + '%Y-%m-%d %H:%M:%S.%f' + ), + utils.SDC_DELETED_AT: None, + } + ) except TypeError: - LOGGER.error('TypeError encountered when processing document ID: %s', document['_id']) + LOGGER.error( + 'TypeError encountered when processing document ID: %s', + document['_id'], + ) raise exported_rows += 1 @@ -251,7 +292,8 @@ def copy_table(self, LOGGER.info( 'Exporting batch from %s to %s rows from %s...', (exported_rows - write_batch_rows), - exported_rows, table_name + exported_rows, + table_name, ) writer.writerows(rows) @@ -312,14 +354,12 @@ def fetch_current_log_pos(self) -> Dict: # token can contain a property '_typeBits' of type bytes which cannot be json # serialized when saving the state in the function 'utils.save_state_file'. 
# '_data' is enough to resume LOG_BASED Singer replication after FastSync - return { - 'token': { - '_data': token['_data'] - } - } + return {'token': {'_data': token['_data']}} # pylint: disable=invalid-name - def fetch_current_incremental_key_pos(self, fully_qualified_table_name: str, replication_key: str): + def fetch_current_incremental_key_pos( + self, fully_qualified_table_name: str, replication_key: str + ): """ No Implemented Args: @@ -337,12 +377,11 @@ def map_column_types_to_target(self): mapped_columns = [] for column_name, column_type in self._get_collection_columns(): - mapped_columns.append(f'{column_name} {self.tap_type_to_target_type(column_type)}') + mapped_columns.append( + f'{column_name} {self.tap_type_to_target_type(column_type)}' + ) - return { - 'columns': mapped_columns, - 'primary_key': ['_ID'] - } + return {'columns': mapped_columns, 'primary_key': ['_ID']} def _export_collection(self, export_dir: str, collection_name) -> str: """ @@ -356,10 +395,12 @@ def _export_collection(self, export_dir: str, collection_name) -> str: """ LOGGER.info('Starting export of table "%s"', collection_name) - url = f'mongodb://{self.connection_config["user"]}:{self.connection_config["password"]}' \ - f'@{self.connection_config["host"]}:{self.connection_config["port"]}/' \ - f'{self.connection_config["database"]}?authSource={self.connection_config["auth_database"]}' \ - f'&readPreference=secondaryPreferred' + url = ( + f'mongodb://{self.connection_config["user"]}:{self.connection_config["password"]}' + f'@{self.connection_config["host"]}:{self.connection_config["port"]}/' + f'{self.connection_config["database"]}?authSource={self.connection_config["auth_database"]}' + f'&readPreference=secondaryPreferred' + ) if self.connection_config.get('replica_set', None) is not None: url += f'&replicaSet={self.connection_config["replica_set"]}' @@ -367,14 +408,19 @@ def _export_collection(self, export_dir: str, collection_name) -> str: if self.connection_config.get('ssl', None) is not None: url += f'&ssl={self.connection_config["ssl"]}' - return_code = subprocess.call([ - 'mongodump', - '--uri', f'"{url}"', - '--forceTableScan', - '--gzip', - '-c', collection_name, - '-o', export_dir - ]) + return_code = subprocess.call( + [ + 'mongodump', + '--uri', + f'"{url}"', + '--forceTableScan', + '--gzip', + '-c', + collection_name, + '-o', + export_dir, + ] + ) LOGGER.debug('Export command return code %s', return_code) @@ -383,5 +429,13 @@ def _export_collection(self, export_dir: str, collection_name) -> str: # mongodump creates two files "{collection_name}.metadata.json.gz" & "{collection_name}.bson.gz" # we are only interested in the latter so we delete the former. 
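transform_value, reformatted earlier in this file, is a dispatch table mapping value classes to serialiser callables and recursing through lists and dicts. The shape of that pattern is easier to see without the BSON handlers; a minimal sketch using only standard-library types (the handlers below are simplified stand-ins, not the ones used above):

# Illustrative type-dispatch sketch (no bson dependency); mirrors the shape of
# transform_value's conversion table with simplified handlers.
import datetime
import uuid
from typing import Any, List


def to_json_safe(value: Any, path: List[Any]) -> Any:
    conversion = {
        list: lambda val, pat: [to_json_safe(v, pat + [i]) for i, v in enumerate(val)],
        dict: lambda val, pat: {k: to_json_safe(v, pat + [k]) for k, v in val.items()},
        uuid.UUID: lambda val, _: str(val),
        datetime.datetime: lambda val, _: val.isoformat(),
        bytes: lambda val, _: val.hex(),
    }
    for klass, convert in conversion.items():
        if isinstance(value, klass):
            return convert(value, path)
    return value  # int, str, float, bool, None pass through unchanged


doc = {'_id': uuid.uuid4(), 'created': datetime.datetime(2021, 7, 23), 'tags': [b'\x01']}
print(to_json_safe(doc, []))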
- os.remove(os.path.join(export_dir, self.connection_config['database'], f'{collection_name}.metadata.json.gz')) - return os.path.join(export_dir, self.connection_config['database'], f'{collection_name}.bson.gz') + os.remove( + os.path.join( + export_dir, + self.connection_config['database'], + f'{collection_name}.metadata.json.gz', + ) + ) + return os.path.join( + export_dir, self.connection_config['database'], f'{collection_name}.bson.gz' + ) diff --git a/pipelinewise/fastsync/commons/tap_mysql.py b/pipelinewise/fastsync/commons/tap_mysql.py index ac9628646..f8c2ab344 100644 --- a/pipelinewise/fastsync/commons/tap_mysql.py +++ b/pipelinewise/fastsync/commons/tap_mysql.py @@ -13,10 +13,12 @@ DEFAULT_CHARSET = 'utf8' DEFAULT_EXPORT_BATCH_ROWS = 50000 -DEFAULT_SESSION_SQLS = ['SET @@session.time_zone="+0:00"', - 'SET @@session.wait_timeout=28800', - 'SET @@session.net_read_timeout=3600', - 'SET @@session.innodb_lock_wait_timeout=3600'] +DEFAULT_SESSION_SQLS = [ + 'SET @@session.time_zone="+0:00"', + 'SET @@session.wait_timeout=28800', + 'SET @@session.net_read_timeout=3600', + 'SET @@session.innodb_lock_wait_timeout=3600', +] class FastSyncTapMySql: @@ -26,10 +28,15 @@ class FastSyncTapMySql: def __init__(self, connection_config, tap_type_to_target_type, target_quote=None): self.connection_config = connection_config - self.connection_config['charset'] = connection_config.get('charset', DEFAULT_CHARSET) - self.connection_config['export_batch_rows'] = connection_config.get('export_batch_rows', - DEFAULT_EXPORT_BATCH_ROWS) - self.connection_config['session_sqls'] = connection_config.get('session_sqls', DEFAULT_SESSION_SQLS) + self.connection_config['charset'] = connection_config.get( + 'charset', DEFAULT_CHARSET + ) + self.connection_config['export_batch_rows'] = connection_config.get( + 'export_batch_rows', DEFAULT_EXPORT_BATCH_ROWS + ) + self.connection_config['session_sqls'] = connection_config.get( + 'session_sqls', DEFAULT_SESSION_SQLS + ) self.tap_type_to_target_type = tap_type_to_target_type self.target_quote = target_quote self.conn = None @@ -45,24 +52,46 @@ def open_connections(self): # # If bulk_sync_{host|port|user|password} values are not defined in the config then it's # using the normal credentials to connect - host=self.connection_config.get('bulk_sync_host', self.connection_config['host']), - port=int(self.connection_config.get('bulk_sync_port', self.connection_config['port'])), - user=self.connection_config.get('bulk_sync_user', self.connection_config['user']), - password=self.connection_config.get('bulk_sync_password', self.connection_config['password']), + host=self.connection_config.get( + 'bulk_sync_host', self.connection_config['host'] + ), + port=int( + self.connection_config.get( + 'bulk_sync_port', self.connection_config['port'] + ) + ), + user=self.connection_config.get( + 'bulk_sync_user', self.connection_config['user'] + ), + password=self.connection_config.get( + 'bulk_sync_password', self.connection_config['password'] + ), charset=self.connection_config['charset'], - cursorclass=pymysql.cursors.DictCursor) + cursorclass=pymysql.cursors.DictCursor, + ) self.conn_unbuffered = pymysql.connect( # Fastsync is using bulk_sync_{host|port|user|password} values from the config by default # to avoid making heavy load on the primary source database when syncing large tables # # If bulk_sync_{host|port|user|password} values are not defined in the config then it's # using the normal credentials to connect - host=self.connection_config.get('bulk_sync_host', 
self.connection_config['host']), - port=int(self.connection_config.get('bulk_sync_port', self.connection_config['port'])), - user=self.connection_config.get('bulk_sync_user', self.connection_config['user']), - password=self.connection_config.get('bulk_sync_password', self.connection_config['password']), + host=self.connection_config.get( + 'bulk_sync_host', self.connection_config['host'] + ), + port=int( + self.connection_config.get( + 'bulk_sync_port', self.connection_config['port'] + ) + ), + user=self.connection_config.get( + 'bulk_sync_user', self.connection_config['user'] + ), + password=self.connection_config.get( + 'bulk_sync_password', self.connection_config['password'] + ), charset=self.connection_config['charset'], - cursorclass=pymysql.cursors.SSCursor) + cursorclass=pymysql.cursors.SSCursor, + ) # Set session variables by running a list of SQLs which is defined # in the optional session_sqls connection parameters @@ -84,7 +113,9 @@ def run_session_sqls(self): warnings.append(f'Could not set session variable: {sql}') if warnings: - LOGGER.warning('Encountered non-fatal errors when configuring session that could impact performance:') + LOGGER.warning( + 'Encountered non-fatal errors when configuring session that could impact performance:' + ) for warning in warnings: LOGGER.warning(warning) @@ -121,16 +152,22 @@ def query(self, query, conn=None, params=None, return_as_cursor=False, n_retry=1 return [] except (InterfaceError, OperationalError) as exc: - LOGGER.exception('Exception happened during running a query. Number of retries: %s. %s', n_retry, exc) + LOGGER.exception( + 'Exception happened during running a query. Number of retries: %s. %s', + n_retry, + exc, + ) if n_retry > 0: LOGGER.info('Reopening the connections.') self.close_connections(silent=True) self.open_connections() LOGGER.info('Retrying to run a query.') - return self.query(query, - params=params, - return_as_cursor=return_as_cursor, - n_retry=n_retry - 1) + return self.query( + query, + params=params, + return_as_cursor=return_as_cursor, + n_retry=n_retry - 1, + ) raise exc @@ -147,7 +184,7 @@ def fetch_current_log_pos(self): return { 'log_file': binlog_pos.get('File'), 'log_pos': binlog_pos.get('Position'), - 'version': binlog_pos.get('version', 1) + 'version': binlog_pos.get('version', 1), } # pylint: disable=invalid-name @@ -155,9 +192,13 @@ def fetch_current_incremental_key_pos(self, table, replication_key): """ Get the actual incremental key position in the table """ - result = self.query('SELECT MAX({}) AS key_value FROM {}'.format(replication_key, table)) + result = self.query( + 'SELECT MAX({}) AS key_value FROM {}'.format(replication_key, table) + ) if len(result) == 0: - raise Exception('Cannot get replication key value for table: {}'.format(table)) + raise Exception( + 'Cannot get replication key value for table: {}'.format(table) + ) mysql_key_value = result[0].get('key_value') key_value = mysql_key_value @@ -175,7 +216,7 @@ def fetch_current_incremental_key_pos(self, table, replication_key): return { 'replication_key': replication_key, 'replication_key_value': key_value, - 'version': 1 + 'version': 1, } def get_primary_keys(self, table_name): @@ -183,11 +224,15 @@ def get_primary_keys(self, table_name): Get the primary key of a table """ table_dict = utils.tablename_to_dict(table_name) - sql = "SHOW KEYS FROM `{}`.`{}` WHERE Key_name = 'PRIMARY'".format(table_dict['schema_name'], - table_dict['table_name']) + sql = "SHOW KEYS FROM `{}`.`{}` WHERE Key_name = 'PRIMARY'".format( + 
table_dict['schema_name'], table_dict['table_name'] + ) pk_specs = self.query(sql) if len(pk_specs) > 0: - return [safe_column_name(k.get('Column_name'), self.target_quote) for k in pk_specs] + return [ + safe_column_name(k.get('Column_name'), self.target_quote) + for k in pk_specs + ] return None @@ -251,7 +296,7 @@ def get_table_columns(self, table_name, max_num=None, date_type='date'): AND table_name = '{table_name}') x ORDER BY ordinal_position - """ + """ # noqa: E501 return self.query(sql) def map_column_types_to_target(self, table_name): @@ -260,26 +305,32 @@ def map_column_types_to_target(self, table_name): """ mysql_columns = self.get_table_columns(table_name) mapped_columns = [ - '{} {}'.format(safe_column_name(pc.get('column_name'), self.target_quote), - self.tap_type_to_target_type(pc.get('data_type'), pc.get('column_type'))) - for pc in mysql_columns] + '{} {}'.format( + safe_column_name(pc.get('column_name'), self.target_quote), + self.tap_type_to_target_type( + pc.get('data_type'), pc.get('column_type') + ), + ) + for pc in mysql_columns + ] return { 'columns': mapped_columns, - 'primary_key': self.get_primary_keys(table_name) + 'primary_key': self.get_primary_keys(table_name), } # pylint: disable=too-many-locals - def copy_table(self, - table_name, - path, - max_num=None, - date_type='date', - split_large_files=False, - split_file_chunk_size_mb=1000, - split_file_max_chunks=20, - compress=True, - ): + def copy_table( + self, + table_name, + path, + max_num=None, + date_type='date', + split_large_files=False, + split_file_chunk_size_mb=1000, + split_file_max_chunks=20, + compress=True, + ): """ Export data from table to a zipped csv Args: @@ -303,24 +354,30 @@ def copy_table(self, ,CONVERT_TZ( NOW(),@@session.time_zone,'+00:00') AS _SDC_BATCHED_AT ,null AS _SDC_DELETED_AT FROM `{}`.`{}` - """.format(','.join(column_safe_sql_values), - table_dict['schema_name'], - table_dict['table_name']) + """.format( + ','.join(column_safe_sql_values), + table_dict['schema_name'], + table_dict['table_name'], + ) export_batch_rows = self.connection_config['export_batch_rows'] exported_rows = 0 with self.conn_unbuffered as cur: cur.execute(sql) - gzip_splitter = split_gzip.open(path, - mode='wt', - chunk_size_mb=split_file_chunk_size_mb, - max_chunks=split_file_max_chunks if split_large_files else 0, - compress=compress) + gzip_splitter = split_gzip.open( + path, + mode='wt', + chunk_size_mb=split_file_chunk_size_mb, + max_chunks=split_file_max_chunks if split_large_files else 0, + compress=compress, + ) with gzip_splitter as split_gzip_files: - writer = csv.writer(split_gzip_files, - delimiter=',', - quotechar='"', - quoting=csv.QUOTE_MINIMAL) + writer = csv.writer( + split_gzip_files, + delimiter=',', + quotechar='"', + quoting=csv.QUOTE_MINIMAL, + ) while True: rows = cur.fetchmany(export_batch_rows) @@ -335,9 +392,14 @@ def copy_table(self, # Then we believe this to be just an interim batch and not the final one so report on progress LOGGER.info( - 'Exporting batch from %s to %s rows from %s...', (exported_rows - export_batch_rows), - exported_rows, table_name) + 'Exporting batch from %s to %s rows from %s...', + (exported_rows - export_batch_rows), + exported_rows, + table_name, + ) # Write rows to file in one go writer.writerows(rows) - LOGGER.info('Exported total of %s rows from %s...', exported_rows, table_name) + LOGGER.info( + 'Exported total of %s rows from %s...', exported_rows, table_name + ) diff --git a/pipelinewise/fastsync/commons/tap_postgres.py 
b/pipelinewise/fastsync/commons/tap_postgres.py index b5e7285d1..ddfcc40af 100644 --- a/pipelinewise/fastsync/commons/tap_postgres.py +++ b/pipelinewise/fastsync/commons/tap_postgres.py @@ -52,7 +52,12 @@ def generate_replication_slot_name(dbname, tap_id=None, prefix='pipelinewise'): return re.sub('[^a-z0-9_]', '_', slot_name) @classmethod - def __get_slot_name(cls, connection, dbname: str, tap_id: str,) -> str: + def __get_slot_name( + cls, + connection, + dbname: str, + tap_id: str, + ) -> str: """ Finds the right slot name to use and returns it @@ -74,7 +79,9 @@ def __get_slot_name(cls, connection, dbname: str, tap_id: str,) -> str: try: # Backward compatibility: try to locate existing v15 slot first. PPW <= 0.15.0 with connection.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur: - cur.execute(f"SELECT * FROM pg_replication_slots WHERE slot_name = '{slot_name_v15}';") + cur.execute( + f"SELECT * FROM pg_replication_slots WHERE slot_name = '{slot_name_v15}';" + ) v15_slots_count = cur.rowcount except psycopg2.Error: @@ -102,13 +109,17 @@ def drop_slot(cls, connection_config: Dict) -> None: LOGGER.debug('Connection to Primary server created.') try: - slot_name = cls.__get_slot_name(connection, connection_config['dbname'], connection_config['tap_id']) + slot_name = cls.__get_slot_name( + connection, connection_config['dbname'], connection_config['tap_id'] + ) LOGGER.info('Dropping the slot "%s"', slot_name) # drop the replication host with connection.cursor() as cur: - cur.execute(f'SELECT pg_drop_replication_slot(slot_name) ' - f"FROM pg_replication_slots WHERE slot_name = '{slot_name}';") + cur.execute( + f'SELECT pg_drop_replication_slot(slot_name) ' + f"FROM pg_replication_slots WHERE slot_name = '{slot_name}';" + ) LOGGER.info('Number of dropped slots: %s', cur.rowcount) finally: @@ -135,7 +146,8 @@ def get_connection(cls, connection_config: Dict, prioritize_primary: bool = Fals connection_config['port'], connection_config['user'], connection_config['password'], - connection_config['dbname']) + connection_config['dbname'], + ) else: LOGGER.info('Connecting to replica') conn_string = template.format( @@ -147,8 +159,11 @@ def get_connection(cls, connection_config: Dict, prioritize_primary: bool = Fals connection_config.get('replica_host', connection_config['host']), connection_config.get('replica_port', connection_config['port']), connection_config.get('replica_user', connection_config['user']), - connection_config.get('replica_password', connection_config['password']), - connection_config['dbname']) + connection_config.get( + 'replica_password', connection_config['password'] + ), + connection_config['dbname'], + ) if 'ssl' in connection_config and connection_config['ssl'] == 'true': conn_string += " sslmode='require'" @@ -166,7 +181,9 @@ def open_connection(self): """ Open connection """ - self.conn = self.get_connection(self.connection_config, prioritize_primary=False) + self.conn = self.get_connection( + self.connection_config, prioritize_primary=False + ) self.curr = self.conn.cursor() def close_connection(self): @@ -221,12 +238,16 @@ def create_replication_slot(self): replication slot and full-resync the new taps. 
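__get_slot_name and drop_slot above reduce to two statements against pg_replication_slots. A small psycopg2 sketch of the same check-then-drop sequence; the DSN and slot name are placeholders, and the real code resolves the slot name first (see __get_slot_name above):

# Illustrative sketch of the slot check/drop queries used above; connection
# parameters and slot name are placeholders, not values from this patch.
import psycopg2
import psycopg2.extras

slot_name = 'pipelinewise_mydb_mytap'  # example name, already sanitised to [a-z0-9_]

with psycopg2.connect('host=localhost dbname=mydb user=replicator password=secret') as conn:
    with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
        # Same existence check as the backward-compatibility probe in __get_slot_name
        cur.execute(
            f"SELECT * FROM pg_replication_slots WHERE slot_name = '{slot_name}';"
        )
        if cur.rowcount > 0:
            # Same statement drop_slot issues once the slot name is resolved
            cur.execute(
                f'SELECT pg_drop_replication_slot(slot_name) '
                f"FROM pg_replication_slots WHERE slot_name = '{slot_name}';"
            )
            print('Number of dropped slots:', cur.rowcount)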
""" try: - slot_name = self.__get_slot_name(self.primary_host_conn, - self.connection_config['dbname'], - self.connection_config['tap_id']) + slot_name = self.__get_slot_name( + self.primary_host_conn, + self.connection_config['dbname'], + self.connection_config['tap_id'], + ) # Create the replication host - self.primary_host_query(f"SELECT * FROM pg_create_logical_replication_slot('{slot_name}', 'wal2json')") + self.primary_host_query( + f"SELECT * FROM pg_create_logical_replication_slot('{slot_name}', 'wal2json')" + ) except Exception as exc: # ERROR: replication slot already exists SQL state: 42710 if hasattr(exc, 'pgcode') and exc.pgcode == '42710': @@ -241,12 +262,15 @@ def fetch_current_log_pos(self): """ # Create replication slot dedicated connection # Always use Primary server for creating replication_slot - self.primary_host_conn = self.get_connection(self.connection_config, prioritize_primary=True) + self.primary_host_conn = self.get_connection( + self.connection_config, prioritize_primary=True + ) self.primary_host_curr = self.primary_host_conn.cursor() # Make sure PostgreSQL version is 9.4 or higher result = self.primary_host_query( - "SELECT setting::int AS version FROM pg_settings WHERE name='server_version_num'") + "SELECT setting::int AS version FROM pg_settings WHERE name='server_version_num'" + ) version = result[0].get('version') # Do not allow minor versions with PostgreSQL BUG #15114 @@ -275,9 +299,13 @@ def fetch_current_log_pos(self): if version >= 100000: result = self.query('SELECT pg_last_wal_replay_lsn() AS current_lsn') elif version >= 90400: - result = self.query('SELECT pg_last_xlog_replay_location() AS current_lsn') + result = self.query( + 'SELECT pg_last_xlog_replay_location() AS current_lsn' + ) else: - raise Exception('Logical replication not supported before PostgreSQL 9.4') + raise Exception( + 'Logical replication not supported before PostgreSQL 9.4' + ) else: # Get current lsn from primary host if version >= 100000: @@ -285,16 +313,15 @@ def fetch_current_log_pos(self): elif version >= 90400: result = self.query('SELECT pg_current_xlog_location() AS current_lsn') else: - raise Exception('Logical replication not supported before PostgreSQL 9.4') + raise Exception( + 'Logical replication not supported before PostgreSQL 9.4' + ) current_lsn = result[0].get('current_lsn') file, index = current_lsn.split('/') lsn = (int(file, 16) << 32) + int(index, 16) - return { - 'lsn': lsn, - 'version': 1 - } + return {'lsn': lsn, 'version': 1} # pylint: disable=invalid-name def fetch_current_incremental_key_pos(self, table, replication_key): @@ -302,9 +329,13 @@ def fetch_current_incremental_key_pos(self, table, replication_key): Get the actual incremental key position in the table """ schema_name, table_name = table.split('.') - result = self.query(f'SELECT MAX({replication_key}) AS key_value FROM {schema_name}."{table_name}"') + result = self.query( + f'SELECT MAX({replication_key}) AS key_value FROM {schema_name}."{table_name}"' + ) if len(result) == 0: - raise Exception('Cannot get replication key value for table: {}'.format(table)) + raise Exception( + 'Cannot get replication key value for table: {}'.format(table) + ) postgres_key_value = result[0].get('key_value') key_value = postgres_key_value @@ -322,7 +353,7 @@ def fetch_current_incremental_key_pos(self, table, replication_key): return { 'replication_key': replication_key, 'replication_key_value': key_value, - 'version': 1 + 'version': 1, } def get_primary_keys(self, table): @@ -339,7 +370,9 @@ def 
get_primary_keys(self, table): pg_class.relnamespace = pg_namespace.oid AND pg_attribute.attrelid = pg_class.oid AND pg_attribute.attnum = any(pg_index.indkey) - AND indisprimary""".format(schema_name, table_name) + AND indisprimary""".format( + schema_name, table_name + ) pk_specs = self.query(sql) if len(pk_specs) > 0: return [safe_column_name(k[0], self.target_quote) for k in pk_specs] @@ -355,7 +388,7 @@ def get_table_columns(self, table_name, max_num=None, date_type='date'): if max_num: decimals = len(max_num.split('.')[1]) if '.' in max_num else 0 decimal_format = f""" - 'CASE WHEN "' || column_name || '" IS NULL THEN NULL ELSE GREATEST(LEAST({max_num}, ROUND("' || column_name || '"::numeric , {decimals})), -{max_num}) END' + 'CASE WHEN "' || column_name || '" IS NULL THEN NULL ELSE GREATEST(LEAST({max_num}, ROUND("' || column_name || '"::numeric , {decimals})), -{max_num}) END' # noqa E501 """ integer_format = """ '"' || column_name || '"' @@ -397,7 +430,7 @@ def get_table_columns(self, table_name, max_num=None, date_type='date'): AND table_name = '{table_name}' ORDER BY ordinal_position ) AS x - """ + """ # noqa: E501 return self.query(sql) def map_column_types_to_target(self, table_name): @@ -412,25 +445,28 @@ def map_column_types_to_target(self, table_name): # most targets would want to map length 1 to boolean and the rest to number if isinstance(column_type, list): column_type = column_type[1 if pc[3] > 1 else 0] - mapping = '{} {}'.format(safe_column_name(pc[0], self.target_quote), column_type) + mapping = '{} {}'.format( + safe_column_name(pc[0], self.target_quote), column_type + ) mapped_columns.append(mapping) return { 'columns': mapped_columns, - 'primary_key': self.get_primary_keys(table_name) + 'primary_key': self.get_primary_keys(table_name), } # pylint: disable=too-many-arguments - def copy_table(self, - table_name, - path, - max_num=None, - date_type='date', - split_large_files=False, - split_file_chunk_size_mb=1000, - split_file_max_chunks=20, - compress=True - ): + def copy_table( + self, + table_name, + path, + max_num=None, + date_type='date', + split_large_files=False, + split_file_chunk_size_mb=1000, + split_file_max_chunks=20, + compress=True, + ): """ Export data from table to a zipped csv Args: @@ -455,14 +491,18 @@ def copy_table(self, ,now() AT TIME ZONE 'UTC' ,null FROM {}."{}") TO STDOUT with CSV DELIMITER ',' - """.format(','.join(column_safe_sql_values), schema_name, table_name) + """.format( + ','.join(column_safe_sql_values), schema_name, table_name + ) LOGGER.info('Exporting data: %s', sql) - gzip_splitter = split_gzip.open(path, - mode='wb', - chunk_size_mb=split_file_chunk_size_mb, - max_chunks=split_file_max_chunks if split_large_files else 0, - compress=compress) + gzip_splitter = split_gzip.open( + path, + mode='wb', + chunk_size_mb=split_file_chunk_size_mb, + max_chunks=split_file_max_chunks if split_large_files else 0, + compress=compress, + ) with gzip_splitter as split_gzip_files: self.curr.copy_expert(sql, split_gzip_files, size=131072) diff --git a/pipelinewise/fastsync/commons/tap_s3_csv.py b/pipelinewise/fastsync/commons/tap_s3_csv.py index e23f60e42..772982a17 100644 --- a/pipelinewise/fastsync/commons/tap_s3_csv.py +++ b/pipelinewise/fastsync/commons/tap_s3_csv.py @@ -9,7 +9,14 @@ from datetime import datetime from time import struct_time from typing import Callable, Dict, List, Optional, Set -from messytables import (CSVTableSet, headers_guess, headers_processor, jts, offset_processor, type_guess) +from messytables import ( + 
CSVTableSet, + headers_guess, + headers_processor, + jts, + offset_processor, + type_guess, +) from singer.utils import strptime_with_tz from singer_encodings import csv as singer_encodings_csv @@ -26,7 +33,12 @@ class FastSyncTapS3Csv: """ # pylint: disable=bare-except - def __init__(self, connection_config: Dict, tap_type_to_target_type: Callable, target_quote=None): + def __init__( + self, + connection_config: Dict, + tap_type_to_target_type: Callable, + target_quote=None, + ): """ Constructor :param connection_config: tap connection config @@ -34,10 +46,16 @@ def __init__(self, connection_config: Dict, tap_type_to_target_type: Callable, t """ try: # Check if bucket can be accessed without credentials/assuming role - list(S3Helper.list_files_in_bucket(connection_config['bucket'], - connection_config.get('aws_endpoint_url', None))) - LOGGER.info('I have direct access to the bucket without assuming the configured role.') - except: + list( + S3Helper.list_files_in_bucket( + connection_config['bucket'], + connection_config.get('aws_endpoint_url', None), + ) + ) + LOGGER.info( + 'I have direct access to the bucket without assuming the configured role.' + ) + except Exception: # Setup AWS session S3Helper.setup_aws_client(connection_config) @@ -48,7 +66,12 @@ def __init__(self, connection_config: Dict, tap_type_to_target_type: Callable, t def _find_table_spec_by_name(self, table_name: str) -> Dict: # look in tables array for the full specs dict of given table - return next(filter(lambda x: x['table_name'] == table_name, self.connection_config['tables'])) + return next( + filter( + lambda x: x['table_name'] == table_name, + self.connection_config['tables'], + ) + ) def copy_table(self, table_name: str, file_path: str) -> None: """ @@ -67,7 +90,9 @@ def copy_table(self, table_name: str, file_path: str) -> None: modified_since = strptime_with_tz(self.connection_config['start_date']) # get all the files in the bucket that match the criteria and were modified after start date - s3_files = S3Helper.get_input_files_for_table(self.connection_config, table_spec, modified_since) + s3_files = S3Helper.get_input_files_for_table( + self.connection_config, table_spec, modified_since + ) # variable to hold all the records from all matching files records = [] @@ -84,7 +109,10 @@ def copy_table(self, table_name: str, file_path: str) -> None: self._get_file_records(s3_file['key'], table_spec, records, headers) # check if the current file has the most recent modification date - if max_last_modified is None or max_last_modified < s3_file['last_modified']: + if ( + max_last_modified is None + or max_last_modified < s3_file['last_modified'] + ): max_last_modified = s3_file['last_modified'] # add the found last modified date to the dictionary @@ -93,19 +121,23 @@ def copy_table(self, table_name: str, file_path: str) -> None: # write to the given compressed csv file with gzip.open(file_path, 'wt') as gzfile: - writer = csv.DictWriter(gzfile, - fieldnames=sorted(list(headers)), - # we need to sort the headers so that copying into snowflake works - delimiter=',', - quotechar='"', - quoting=csv.QUOTE_MINIMAL) + writer = csv.DictWriter( + gzfile, + fieldnames=sorted(list(headers)), + # we need to sort the headers so that copying into snowflake works + delimiter=',', + quotechar='"', + quoting=csv.QUOTE_MINIMAL, + ) # write the header writer.writeheader() # write all records at once writer.writerows(records) # pylint: disable=too-many-locals - def _get_file_records(self, s3_path: str, table_spec: Dict, records: 
List[Dict], headers: Set) -> None: + def _get_file_records( + self, s3_path: str, table_spec: Dict, records: List[Dict], headers: Set + ) -> None: """ Reads the file in s3_path and inserts the rows in records :param config: tap connection configuration @@ -129,7 +161,9 @@ def _get_file_records(self, s3_path: str, table_spec: Dict, records: List[Dict], csv.field_size_limit(sys.maxsize) # pylint:disable=protected-access - iterator = singer_encodings_csv.get_row_iterator(s3_file_handle._raw_stream, table_spec) + iterator = singer_encodings_csv.get_row_iterator( + s3_file_handle._raw_stream, table_spec + ) records_copied = len(records) @@ -141,7 +175,7 @@ def _get_file_records(self, s3_path: str, table_spec: Dict, records: List[Dict], S3Helper.SDC_SOURCE_LINENO_COLUMN: records_copied + 1, '_SDC_EXTRACTED_AT': now_datetime, '_SDC_BATCHED_AT': now_datetime, - '_SDC_DELETED_AT': None + '_SDC_DELETED_AT': None, } new_row = {} @@ -171,20 +205,26 @@ def map_column_types_to_target(self, filepath: str, table: str): # use timestamp as a type instead if column is set in date_overrides configuration mapped_columns = [] - date_overrides = None if 'date_overrides' not in specs \ - else {safe_column_name(c, self.target_quote) for c in specs['date_overrides']} + date_overrides = ( + None + if 'date_overrides' not in specs + else { + safe_column_name(c, self.target_quote) for c in specs['date_overrides'] + } + ) for column_name, column_type in csv_columns: if date_overrides and column_name in date_overrides: - mapped_columns.append(f'{column_name} {self.tap_type_to_target_type("date_override")}') + mapped_columns.append( + f'{column_name} {self.tap_type_to_target_type("date_override")}' + ) else: - mapped_columns.append(f'{column_name} {self.tap_type_to_target_type(column_type)}') + mapped_columns.append( + f'{column_name} {self.tap_type_to_target_type(column_type)}' + ) - return { - 'columns': mapped_columns, - 'primary_key': self._get_primary_keys(specs) - } + return {'columns': mapped_columns, 'primary_key': self._get_primary_keys(specs)} def _get_table_columns(self, csv_file_path: str) -> zip: """ @@ -203,12 +243,15 @@ def _get_table_columns(self, csv_file_path: str) -> zip: row_set.register_processor(offset_processor(offset + 1)) - types = list(map(jts.celltype_as_string, type_guess(row_set.sample, strict=True))) + types = list( + map(jts.celltype_as_string, type_guess(row_set.sample, strict=True)) + ) return zip(headers, types) # pylint: disable=invalid-name - def fetch_current_incremental_key_pos(self, table: str, - replication_key: Optional[str] = 'modified_since') -> Optional[Dict]: + def fetch_current_incremental_key_pos( + self, table: str, replication_key: Optional[str] = 'modified_since' + ) -> Optional[Dict]: """ Returns the last time a the table has been modified in ISO format. 
:param table: table name @@ -217,9 +260,11 @@ def fetch_current_incremental_key_pos(self, table: str, """ replication_key = 'modified_since' - return { - replication_key: self.tables_last_modified[table].isoformat() - } if table in self.tables_last_modified else {} + return ( + {replication_key: self.tables_last_modified[table].isoformat()} + if table in self.tables_last_modified + else {} + ) def _get_primary_keys(self, table_specs: Dict) -> Optional[List]: """ @@ -229,7 +274,10 @@ def _get_primary_keys(self, table_specs: Dict) -> Optional[List]: :return: the keys concatenated and separated by comma if keys are given, otherwise None """ if table_specs.get('key_properties', False): - return [safe_column_name(k, self.target_quote) for k in table_specs['key_properties']] + return [ + safe_column_name(k, self.target_quote) + for k in table_specs['key_properties'] + ] return None @@ -240,6 +288,7 @@ class S3Helper: """ S3 helper methods """ + SDC_SOURCE_BUCKET_COLUMN = '_sdc_source_bucket' SDC_SOURCE_FILE_COLUMN = '_sdc_source_file' SDC_SOURCE_LINENO_COLUMN = '_sdc_source_lineno' @@ -254,9 +303,15 @@ def setup_aws_client(cls, config: Dict) -> None: LOGGER.info('Attempting to create AWS session') # Get the required parameters from config file and/or environment variables - aws_access_key_id = config.get('aws_access_key_id') or os.environ.get('AWS_ACCESS_KEY_ID') - aws_secret_access_key = config.get('aws_secret_access_key') or os.environ.get('AWS_SECRET_ACCESS_KEY') - aws_session_token = config.get('aws_session_token') or os.environ.get('AWS_SESSION_TOKEN') + aws_access_key_id = config.get('aws_access_key_id') or os.environ.get( + 'AWS_ACCESS_KEY_ID' + ) + aws_secret_access_key = config.get('aws_secret_access_key') or os.environ.get( + 'AWS_SECRET_ACCESS_KEY' + ) + aws_session_token = config.get('aws_session_token') or os.environ.get( + 'AWS_SESSION_TOKEN' + ) aws_profile = config.get('aws_profile') or os.environ.get('AWS_PROFILE') # AWS credentials based authentication @@ -264,14 +319,16 @@ def setup_aws_client(cls, config: Dict) -> None: boto3.setup_default_session( aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key, - aws_session_token=aws_session_token + aws_session_token=aws_session_token, ) # AWS Profile based authentication, will use IAM role if no profile is found else: boto3.setup_default_session(profile_name=aws_profile) @classmethod - def get_input_files_for_table(cls, config: Dict, table_spec: Dict, modified_since: struct_time = None): + def get_input_files_for_table( + cls, config: Dict, table_spec: Dict, modified_since: struct_time = None + ): bucket = config['bucket'] prefix = table_spec.get('search_prefix') pattern = table_spec['search_pattern'] @@ -279,10 +336,14 @@ def get_input_files_for_table(cls, config: Dict, table_spec: Dict, modified_sinc try: matcher = re.compile(pattern) except re.error as exc: - raise ValueError((f'search_pattern for table `{table_spec["table_name"]}` is not a valid regular ' - 'expression. See ' - 'https://docs.python.org/3.5/library/re.html#regular-expression-syntax'), - pattern) from exc + raise ValueError( + ( + f'search_pattern for table `{table_spec["table_name"]}` is not a valid regular ' + 'expression. 
See ' + 'https://docs.python.org/3.5/library/re.html#regular-expression-syntax' + ), + pattern, + ) from exc LOGGER.info('Checking bucket "%s" for keys matching "%s"', bucket, pattern) @@ -290,7 +351,9 @@ def get_input_files_for_table(cls, config: Dict, table_spec: Dict, modified_sinc unmatched_files_count = 0 max_files_before_log = 30000 - for s3_object in cls.list_files_in_bucket(bucket, prefix, aws_endpoint_url=config.get('aws_endpoint_url')): + for s3_object in cls.list_files_in_bucket( + bucket, prefix, aws_endpoint_url=config.get('aws_endpoint_url') + ): key = s3_object['Key'] last_modified = s3_object['LastModified'] @@ -302,29 +365,48 @@ def get_input_files_for_table(cls, config: Dict, table_spec: Dict, modified_sinc if matcher.search(key): matched_files_count += 1 if modified_since is None or modified_since < last_modified: - LOGGER.info('Will download key "%s" as it was last modified %s', key, last_modified) + LOGGER.info( + 'Will download key "%s" as it was last modified %s', + key, + last_modified, + ) yield {'key': key, 'last_modified': last_modified} else: unmatched_files_count += 1 - if (unmatched_files_count + matched_files_count) % max_files_before_log == 0: + if ( + unmatched_files_count + matched_files_count + ) % max_files_before_log == 0: # Are we skipping greater than 50% of the files? # pylint: disable=old-division - if (unmatched_files_count / (matched_files_count + unmatched_files_count)) > 0.5: - LOGGER.info('Found %s matching files and %s non-matching files. ' - 'You should consider adding a `search_prefix` to the config ' - 'or removing non-matching files from the bucket.', - matched_files_count, unmatched_files_count) + if ( + unmatched_files_count + / (matched_files_count + unmatched_files_count) + ) > 0.5: + LOGGER.info( + 'Found %s matching files and %s non-matching files. 
' + 'You should consider adding a `search_prefix` to the config ' + 'or removing non-matching files from the bucket.', + matched_files_count, + unmatched_files_count, + ) else: - LOGGER.info('Found %s matching files and %s non-matching files', - matched_files_count, unmatched_files_count) + LOGGER.info( + 'Found %s matching files and %s non-matching files', + matched_files_count, + unmatched_files_count, + ) if matched_files_count == 0: if prefix: - raise Exception(f'No files found in bucket "{bucket}" ' - f'that matches prefix "{prefix}" and pattern "{pattern}"') + raise Exception( + f'No files found in bucket "{bucket}" ' + f'that matches prefix "{prefix}" and pattern "{pattern}"' + ) - raise Exception(f'No files found in bucket "{bucket}" that matches pattern "{pattern}"') + raise Exception( + f'No files found in bucket "{bucket}" that matches pattern "{pattern}"' + ) @classmethod @retry_pattern() @@ -356,7 +438,11 @@ def list_files_in_bucket(cls, bucket, search_prefix=None, aws_endpoint_url=None) if s3_object_count > 0: LOGGER.info('Found %s files.', s3_object_count) else: - LOGGER.info('Found no files for bucket "%s" that match prefix "%s"', bucket, search_prefix) + LOGGER.info( + 'Found no files for bucket "%s" that match prefix "%s"', + bucket, + search_prefix, + ) @classmethod @retry_pattern() diff --git a/pipelinewise/fastsync/commons/target_bigquery.py b/pipelinewise/fastsync/commons/target_bigquery.py index 60ceb226d..aa5a10a1e 100644 --- a/pipelinewise/fastsync/commons/target_bigquery.py +++ b/pipelinewise/fastsync/commons/target_bigquery.py @@ -33,6 +33,7 @@ class FastSyncTargetBigquery: """ Common functions for fastsync to BigQuery """ + def __init__(self, connection_config, transformation_config=None): self.connection_config = connection_config self.transformation_config = transformation_config @@ -48,7 +49,7 @@ def to_query_parameter(value): value_type = 'INT64' elif isinstance(value, float): value_type = 'NUMERIC' - #TODO: repeated float here and in target + # TODO: repeated float here and in target elif isinstance(value, float): value_type = 'FLOAT64' elif isinstance(value, bool): @@ -91,27 +92,45 @@ def create_schema(self, schema_name): def drop_table(self, target_schema, table_name, is_temporary=False): table_dict = utils.tablename_to_dict(table_name) - target_table = safe_name(table_dict.get('table_name' if not is_temporary else 'temp_table_name')) + target_table = safe_name( + table_dict.get('table_name' if not is_temporary else 'temp_table_name') + ) sql = 'DROP TABLE IF EXISTS {}.{}'.format(target_schema, target_table.lower()) self.query(sql) - def create_table(self, target_schema: str, table_name: str, columns: List[str], - is_temporary: bool = False, sort_columns=False): + def create_table( + self, + target_schema: str, + table_name: str, + columns: List[str], + is_temporary: bool = False, + sort_columns=False, + ): table_dict = utils.tablename_to_dict(table_name) - target_table = safe_name(table_dict.get('table_name' if not is_temporary else 'temp_table_name').lower()) + target_table = safe_name( + table_dict.get( + 'table_name' if not is_temporary else 'temp_table_name' + ).lower() + ) # skip the EXTRACTED, BATCHED and DELETED columns in case they exist because they gonna be added later - columns = [c for c in columns if not ( - c.upper().startswith(utils.SDC_EXTRACTED_AT.upper()) or - c.upper().startswith(utils.SDC_BATCHED_AT.upper()) or - c.upper().startswith(utils.SDC_DELETED_AT.upper()))] - - columns += [f'{utils.SDC_EXTRACTED_AT} TIMESTAMP', - 
f'{utils.SDC_BATCHED_AT} TIMESTAMP', - f'{utils.SDC_DELETED_AT} TIMESTAMP' - ] + columns = [ + c + for c in columns + if not ( + c.upper().startswith(utils.SDC_EXTRACTED_AT.upper()) + or c.upper().startswith(utils.SDC_BATCHED_AT.upper()) + or c.upper().startswith(utils.SDC_DELETED_AT.upper()) + ) + ] + + columns += [ + f'{utils.SDC_EXTRACTED_AT} TIMESTAMP', + f'{utils.SDC_BATCHED_AT} TIMESTAMP', + f'{utils.SDC_DELETED_AT} TIMESTAMP', + ] # We need the sort the columns for some taps( for now tap-s3-csv) # because later on when copying a csv file into Snowflake @@ -121,18 +140,33 @@ def create_table(self, target_schema: str, table_name: str, columns: List[str], columns = [c.lower() for c in columns] - sql = f'CREATE OR REPLACE TABLE {target_schema}.{target_table} (' \ - f'{",".join(columns)})' + sql = ( + f'CREATE OR REPLACE TABLE {target_schema}.{target_table} (' + f'{",".join(columns)})' + ) self.query(sql) # pylint: disable=R0913,R0914 - def copy_to_table(self, filepath, target_schema, table_name, size_bytes, is_temporary, - skip_csv_header=False, allow_quoted_newlines=True, write_truncate=True): + def copy_to_table( + self, + filepath, + target_schema, + table_name, + size_bytes, + is_temporary, + skip_csv_header=False, + allow_quoted_newlines=True, + write_truncate=True, + ): LOGGER.info('BIGQUERY - Loading %s into Bigquery...', filepath) table_dict = utils.tablename_to_dict(table_name) - target_table = safe_name(table_dict.get('table_name' if not is_temporary else 'temp_table_name').lower(), - quotes=False) + target_table = safe_name( + table_dict.get( + 'table_name' if not is_temporary else 'temp_table_name' + ).lower(), + quotes=False, + ) client = self.open_connection() dataset_ref = client.dataset(target_schema) @@ -141,11 +175,15 @@ def copy_to_table(self, filepath, target_schema, table_name, size_bytes, is_temp job_config = bigquery.LoadJobConfig() job_config.source_format = bigquery.SourceFormat.CSV job_config.schema = table_schema - job_config.write_disposition = 'WRITE_TRUNCATE' if write_truncate else 'WRITE_APPEND' + job_config.write_disposition = ( + 'WRITE_TRUNCATE' if write_truncate else 'WRITE_APPEND' + ) job_config.allow_quoted_newlines = allow_quoted_newlines job_config.skip_leading_rows = 1 if skip_csv_header else 0 with open(filepath, 'rb') as exported_data: - job = client.load_table_from_file(exported_data, table_ref, job_config=job_config) + job = client.load_table_from_file( + exported_data, table_ref, job_config=job_config + ) try: job.result() except exceptions.BadRequest as exc: @@ -156,10 +194,12 @@ def copy_to_table(self, filepath, target_schema, table_name, size_bytes, is_temp LOGGER.info('Job %s', job) LOGGER.info('Job.output_rows %s', job.output_rows) inserts = job.output_rows - LOGGER.info('Loading into %s."%s": %s', - target_schema, - target_table, - json.dumps({'inserts': inserts, 'updates': 0, 'size_bytes': size_bytes})) + LOGGER.info( + 'Loading into %s."%s": %s', + target_schema, + target_table, + json.dumps({'inserts': inserts, 'updates': 0, 'size_bytes': size_bytes}), + ) LOGGER.info(job.errors) @@ -167,12 +207,18 @@ def copy_to_table(self, filepath, target_schema, table_name, size_bytes, is_temp # "to_group" is not used here but exists for compatibility reasons with other database types # "to_group" is for databases that can grant to users and groups separately like Amazon Redshift # pylint: disable=unused-argument - def grant_select_on_table(self, target_schema, table_name, role, is_temporary, to_group=False): + def grant_select_on_table( + self, 
target_schema, table_name, role, is_temporary, to_group=False + ): # Grant role is not mandatory parameter, do nothing if not specified if role: table_dict = utils.tablename_to_dict(table_name) - target_table = safe_name(table_dict.get('table_name' if not is_temporary else 'temp_table_name')) - sql = 'GRANT SELECT ON {}.{} TO ROLE {}'.format(target_schema, target_table, role) + target_table = safe_name( + table_dict.get('table_name' if not is_temporary else 'temp_table_name') + ) + sql = 'GRANT SELECT ON {}.{} TO ROLE {}'.format( + target_schema, target_table, role + ) self.query(sql) # pylint: disable=unused-argument @@ -186,7 +232,9 @@ def grant_usage_on_schema(self, target_schema, role, to_group=False): def grant_select_on_schema(self, target_schema, role, to_group=False): # Grant role is not mandatory parameter, do nothing if not specified if role: - sql = 'GRANT SELECT ON ALL TABLES IN SCHEMA {} TO ROLE {}'.format(target_schema, role) + sql = 'GRANT SELECT ON ALL TABLES IN SCHEMA {} TO ROLE {}'.format( + target_schema, role + ) self.query(sql) def obfuscate_columns(self, target_schema: str, table_name: str): @@ -207,21 +255,22 @@ def obfuscate_columns(self, target_schema: str, table_name: str): # # We need to convert to the same format to find the transformation # has that has to be applied - tap_stream_name_by_table_name = '{}-{}'.format(table_dict['schema_name'], table_dict['table_name']) \ - if table_dict['schema_name'] is not None else table_dict['table_name'] + tap_stream_name_by_table_name = ( + '{}-{}'.format(table_dict['schema_name'], table_dict['table_name']) + if table_dict['schema_name'] is not None + else table_dict['table_name'] + ) # Find obfuscation rules for the current table # trans_map = self.__get_stream_transformation_map(tap_stream_name_by_table_name, transformations) trans_map = TransformationHelper.get_trans_in_sql_flavor( - tap_stream_name_by_table_name, - transformations, - SQLFlavor('bigquery')) + tap_stream_name_by_table_name, transformations, SQLFlavor('bigquery') + ) self.__apply_transformations(trans_map, target_schema, temp_table) LOGGER.info('Obfuscation rules applied.') - def swap_tables(self, schema, table_name): project_id = self.connection_config['project_id'] table_dict = utils.tablename_to_dict(table_name) @@ -243,7 +292,9 @@ def swap_tables(self, schema, table_name): # delete the temp table client.delete_table(temp_table_id) - def __apply_transformations(self, transformations: List[Dict], target_schema: str, table_name: str) -> None: + def __apply_transformations( + self, transformations: List[Dict], target_schema: str, table_name: str + ) -> None: """ Generate and execute the SQL queries based on the given transformations. 
Args: @@ -251,7 +302,9 @@ def __apply_transformations(self, transformations: List[Dict], target_schema: st target_schema: name of the target schema where the table lives table_name: the table name on which we want to apply the transformations """ - full_qual_table_name = '{}.{}'.format(safe_name(target_schema), safe_name(table_name)) + full_qual_table_name = '{}.{}'.format( + safe_name(target_schema), safe_name(table_name) + ) if transformations: all_cols_update_sql = '' @@ -263,8 +316,10 @@ def __apply_transformations(self, transformations: List[Dict], target_schema: st # If we have conditions, then we need to construct the query and execute it to transform the # single column conditionally if trans_item['conditions']: - sql = f'UPDATE {full_qual_table_name} ' \ - f'SET {trans_item["trans"]} WHERE {trans_item["conditions"]};' + sql = ( + f'UPDATE {full_qual_table_name} ' + f'SET {trans_item["trans"]} WHERE {trans_item["conditions"]};' + ) self.query(sql) @@ -276,7 +331,9 @@ def __apply_transformations(self, transformations: List[Dict], target_schema: st if not all_cols_update_sql: all_cols_update_sql = trans_item['trans'] else: - all_cols_update_sql = f'{all_cols_update_sql}, {trans_item["trans"]}' + all_cols_update_sql = ( + f'{all_cols_update_sql}, {trans_item["trans"]}' + ) # If we have some non-conditional transformations then construct and execute a query if all_cols_update_sql: diff --git a/pipelinewise/fastsync/commons/target_postgres.py b/pipelinewise/fastsync/commons/target_postgres.py index 0b7f201ae..c8b025b87 100644 --- a/pipelinewise/fastsync/commons/target_postgres.py +++ b/pipelinewise/fastsync/commons/target_postgres.py @@ -32,7 +32,7 @@ def open_connection(self): self.connection_config['dbname'], self.connection_config['user'], self.connection_config['password'], - self.connection_config['port'] + self.connection_config['port'], ) if 'ssl' in self.connection_config and self.connection_config['ssl'] == 'true': @@ -62,25 +62,48 @@ def create_schemas(self, tables): def drop_table(self, target_schema, table_name, is_temporary=False): table_dict = utils.tablename_to_dict(table_name) - target_table = table_dict.get('table_name') if not is_temporary else table_dict.get('temp_table_name') + target_table = ( + table_dict.get('table_name') + if not is_temporary + else table_dict.get('temp_table_name') + ) sql = 'DROP TABLE IF EXISTS {}."{}"'.format(target_schema, target_table.lower()) self.query(sql) - def create_table(self, target_schema: str, table_name: str, columns: List[str], primary_key: List[str], - is_temporary: bool = False, sort_columns=False): + def create_table( + self, + target_schema: str, + table_name: str, + columns: List[str], + primary_key: List[str], + is_temporary: bool = False, + sort_columns=False, + ): table_dict = utils.tablename_to_dict(table_name) - target_table = table_dict.get('table_name') if not is_temporary else table_dict.get('temp_table_name') + target_table = ( + table_dict.get('table_name') + if not is_temporary + else table_dict.get('temp_table_name') + ) # skip the EXTRACTED, BATCHED and DELETED columns in case they exist because they gonna be added later - columns = [c for c in columns if not (c.startswith(self.EXTRACTED_AT_COLUMN) or - c.startswith(self.BATCHED_AT_COLUMN) or - c.startswith(self.DELETED_AT_COLUMN))] - - columns += [f'{self.EXTRACTED_AT_COLUMN} TIMESTAMP WITHOUT TIME ZONE', - f'{self.BATCHED_AT_COLUMN} TIMESTAMP WITHOUT TIME ZONE', - f'{self.DELETED_AT_COLUMN} CHARACTER VARYING'] + columns = [ + c + for c in columns + if not ( + 
c.startswith(self.EXTRACTED_AT_COLUMN) + or c.startswith(self.BATCHED_AT_COLUMN) + or c.startswith(self.DELETED_AT_COLUMN) + ) + ] + + columns += [ + f'{self.EXTRACTED_AT_COLUMN} TIMESTAMP WITHOUT TIME ZONE', + f'{self.BATCHED_AT_COLUMN} TIMESTAMP WITHOUT TIME ZONE', + f'{self.DELETED_AT_COLUMN} CHARACTER VARYING', + ] # We need the sort the columns for some taps( for now tap-s3-csv) # because later on when copying a csv file into Snowflake @@ -90,17 +113,30 @@ def create_table(self, target_schema: str, table_name: str, columns: List[str], sql_columns = ','.join(columns).lower() sql_primary_keys = ','.join(primary_key).lower() if primary_key else None - sql = f'CREATE TABLE IF NOT EXISTS {target_schema}."{target_table.lower()}" (' \ - f'{sql_columns}' \ - f'{f", PRIMARY KEY ({sql_primary_keys}))" if primary_key else ")"}' + sql = ( + f'CREATE TABLE IF NOT EXISTS {target_schema}."{target_table.lower()}" (' + f'{sql_columns}' + f'{f", PRIMARY KEY ({sql_primary_keys}))" if primary_key else ")"}' + ) self.query(sql) - def copy_to_table(self, filepath, target_schema: str, table_name: str, size_bytes: int, - is_temporary: bool = False, skip_csv_header: bool = False): + def copy_to_table( + self, + filepath, + target_schema: str, + table_name: str, + size_bytes: int, + is_temporary: bool = False, + skip_csv_header: bool = False, + ): LOGGER.info('Loading %s into Postgres...', filepath) table_dict = utils.tablename_to_dict(table_name) - target_table = table_dict.get('table_name') if not is_temporary else table_dict.get('temp_table_name') + target_table = ( + table_dict.get('table_name') + if not is_temporary + else table_dict.get('temp_table_name') + ) with self.open_connection() as connection: with connection.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur: @@ -114,21 +150,33 @@ def copy_to_table(self, filepath, target_schema: str, table_name: str, size_byte cur.copy_expert(copy_sql, file) inserts = cur.rowcount - LOGGER.info('Loading into %s."%s": %s', - target_schema, - target_table.lower(), - json.dumps({'inserts': inserts, 'updates': 0, 'size_bytes': size_bytes})) + LOGGER.info( + 'Loading into %s."%s": %s', + target_schema, + target_table.lower(), + json.dumps( + {'inserts': inserts, 'updates': 0, 'size_bytes': size_bytes} + ), + ) # grant_... 
functions are common functions called by utils.py: grant_privilege function # "to_group" is not used here but exists for compatibility reasons with other database types # "to_group" is for databases that can grant to users and groups separately like Amazon Redshift # pylint: disable=unused-argument - def grant_select_on_table(self, target_schema, table_name, role, is_temporary, to_group=False): + def grant_select_on_table( + self, target_schema, table_name, role, is_temporary, to_group=False + ): # Grant role is not mandatory parameter, do nothing if not specified if role: table_dict = utils.tablename_to_dict(table_name) - target_table = table_dict.get('table_name') if not is_temporary else table_dict.get('temp_table_name') - sql = 'GRANT SELECT ON {}."{}" TO GROUP {}'.format(target_schema, target_table.lower(), role) + target_table = ( + table_dict.get('table_name') + if not is_temporary + else table_dict.get('temp_table_name') + ) + sql = 'GRANT SELECT ON {}."{}" TO GROUP {}'.format( + target_schema, target_table.lower(), role + ) self.query(sql) # pylint: disable=unused-argument @@ -142,10 +190,14 @@ def grant_usage_on_schema(self, target_schema, role, to_group=False): def grant_select_on_schema(self, target_schema, role, to_group=False): # Grant role is not mandatory parameter, do nothing if not specified if role: - sql = 'GRANT SELECT ON ALL TABLES IN SCHEMA {} TO GROUP {}'.format(target_schema, role) + sql = 'GRANT SELECT ON ALL TABLES IN SCHEMA {} TO GROUP {}'.format( + target_schema, role + ) self.query(sql) - def obfuscate_columns(self, target_schema: str, table_name: str, is_temporary: bool = False): + def obfuscate_columns( + self, target_schema: str, table_name: str, is_temporary: bool = False + ): """ Apply any configured transformations to the given table Args: @@ -155,7 +207,11 @@ def obfuscate_columns(self, target_schema: str, table_name: str, is_temporary: b LOGGER.info('Starting obfuscation rules...') table_dict = utils.tablename_to_dict(table_name) - target_table = table_dict.get('table_name') if not is_temporary else table_dict.get('temp_table_name') + target_table = ( + table_dict.get('table_name') + if not is_temporary + else table_dict.get('temp_table_name') + ) transformations = self.transformation_config.get('transformations', []) # Input table_name is formatted as {{schema}}.{{table}} @@ -163,12 +219,13 @@ def obfuscate_columns(self, target_schema: str, table_name: str, is_temporary: b # # We need to convert to the same format to find the transformation # has that has to be applied - tap_stream_name_by_table_name = '{}-{}'.format(table_dict.get('schema_name'), table_dict.get('table_name')) + tap_stream_name_by_table_name = '{}-{}'.format( + table_dict.get('schema_name'), table_dict.get('table_name') + ) trans_cols = TransformationHelper.get_trans_in_sql_flavor( - tap_stream_name_by_table_name, - transformations, - SQLFlavor('postgres')) + tap_stream_name_by_table_name, transformations, SQLFlavor('postgres') + ) self.__apply_transformations(trans_cols, target_schema, target_table) @@ -181,7 +238,11 @@ def swap_tables(self, schema, table_name): # Swap tables and drop the temp tamp self.query('DROP TABLE IF EXISTS {}."{}"'.format(schema, target_table.lower())) - self.query('ALTER TABLE {}."{}" RENAME TO "{}"'.format(schema, temp_table.lower(), target_table.lower())) + self.query( + 'ALTER TABLE {}."{}" RENAME TO "{}"'.format( + schema, temp_table.lower(), target_table.lower() + ) + ) def __apply_transformations(self, transformations, target_schema, table_name): """ 
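The hunks just above and below reformat FastSyncTargetPostgres.__apply_transformations without changing its behaviour: transformation rules that carry a condition are executed as individual UPDATE ... WHERE statements, while unconditional rules are folded into a single UPDATE with a comma-separated SET list. A minimal standalone sketch of that batching pattern follows; the rule values and table name are hypothetical, and the rule dict shape ({'trans': ..., 'conditions': ...}) mirrors what TransformationHelper.get_trans_in_sql_flavor appears to emit.

    from typing import Dict, List

    def build_transformation_sql(full_qual_table_name: str, transformations: List[Dict]) -> List[str]:
        """Return the UPDATE statements the batching logic would run, in order."""
        statements = []
        all_cols_update_sql = ''
        for trans_item in transformations:
            if trans_item['conditions']:
                # Conditional rules each get their own UPDATE with a WHERE clause
                statements.append(
                    f'UPDATE {full_qual_table_name} '
                    f'SET {trans_item["trans"]} WHERE {trans_item["conditions"]};'
                )
            else:
                # Unconditional rules are accumulated into one comma-separated SET list
                if not all_cols_update_sql:
                    all_cols_update_sql = trans_item['trans']
                else:
                    all_cols_update_sql = f'{all_cols_update_sql}, {trans_item["trans"]}'
        if all_cols_update_sql:
            statements.append(f'UPDATE {full_qual_table_name} SET {all_cols_update_sql};')
        return statements

    # Hypothetical rules: two unconditional, one conditional
    rules = [
        {'trans': '"email" = NULL', 'conditions': None},
        {'trans': '"last_name" = MD5("last_name")', 'conditions': None},
        {'trans': '"phone" = NULL', 'conditions': '"country" = \'GB\''},
    ]
    for sql in build_transformation_sql('my_schema."users_temp"', rules):
        print(sql)

With these hypothetical rules the sketch prints one conditional UPDATE for "phone" and a single batched UPDATE covering "email" and "last_name", which is the same ordering the method produces.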
@@ -203,8 +264,10 @@ def __apply_transformations(self, transformations, target_schema, table_name): # If we have conditions, then we need to construct the query and execute it to transform the # single column conditionally if trans_item['conditions']: - sql = f'UPDATE {full_qual_table_name} ' \ - f'SET {trans_item["trans"]} WHERE {trans_item["conditions"]};' + sql = ( + f'UPDATE {full_qual_table_name} ' + f'SET {trans_item["trans"]} WHERE {trans_item["conditions"]};' + ) self.query(sql) @@ -216,10 +279,14 @@ def __apply_transformations(self, transformations, target_schema, table_name): if not all_cols_update_sql: all_cols_update_sql = trans_item['trans'] else: - all_cols_update_sql = f'{all_cols_update_sql}, {trans_item["trans"]}' + all_cols_update_sql = ( + f'{all_cols_update_sql}, {trans_item["trans"]}' + ) # If we have some non-conditional transformations then construct and execute a query if all_cols_update_sql: - all_cols_update_sql = f'UPDATE {full_qual_table_name} SET {all_cols_update_sql};' + all_cols_update_sql = ( + f'UPDATE {full_qual_table_name} SET {all_cols_update_sql};' + ) self.query(all_cols_update_sql) diff --git a/pipelinewise/fastsync/commons/target_redshift.py b/pipelinewise/fastsync/commons/target_redshift.py index ff71fb58a..f4190a41b 100644 --- a/pipelinewise/fastsync/commons/target_redshift.py +++ b/pipelinewise/fastsync/commons/target_redshift.py @@ -28,11 +28,18 @@ def __init__(self, connection_config, transformation_config=None): self.transformation_config = transformation_config # Get the required parameters from config file and/or environment variables - aws_profile = self.connection_config.get('aws_profile') or os.environ.get('AWS_PROFILE') - aws_access_key_id = self.connection_config.get('aws_access_key_id') or os.environ.get('AWS_ACCESS_KEY_ID') - aws_secret_access_key = self.connection_config.get('aws_secret_access_key') or \ - os.environ.get('AWS_SECRET_ACCESS_KEY') - aws_session_token = self.connection_config.get('aws_session_token') or os.environ.get('AWS_SESSION_TOKEN') + aws_profile = self.connection_config.get('aws_profile') or os.environ.get( + 'AWS_PROFILE' + ) + aws_access_key_id = self.connection_config.get( + 'aws_access_key_id' + ) or os.environ.get('AWS_ACCESS_KEY_ID') + aws_secret_access_key = self.connection_config.get( + 'aws_secret_access_key' + ) or os.environ.get('AWS_SECRET_ACCESS_KEY') + aws_session_token = self.connection_config.get( + 'aws_session_token' + ) or os.environ.get('AWS_SESSION_TOKEN') # Init S3 client # Conditionally pass keys as this seems to affect whether instance credentials @@ -41,7 +48,7 @@ def __init__(self, connection_config, transformation_config=None): aws_session = boto3.session.Session( aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key, - aws_session_token=aws_session_token + aws_session_token=aws_session_token, ) credentials = aws_session.get_credentials().get_frozen_credentials() @@ -60,7 +67,7 @@ def open_connection(self): self.connection_config['dbname'], self.connection_config['user'], self.connection_config['password'], - self.connection_config['port'] + self.connection_config['port'], ) return psycopg2.connect(conn_string) @@ -84,7 +91,12 @@ def upload_to_s3(self, file): extra_args = {'ACL': s3_acl} if s3_acl else None - LOGGER.info('Uploading to S3 bucket: %s, local file: %s, S3 key: %s', bucket, file, s3_key) + LOGGER.info( + 'Uploading to S3 bucket: %s, local file: %s, S3 key: %s', + bucket, + file, + s3_key, + ) self.s3.upload_file(file, bucket, s3_key, 
ExtraArgs=extra_args) @@ -101,25 +113,48 @@ def create_schemas(self, tables): def drop_table(self, target_schema, table_name, is_temporary=False): table_dict = utils.tablename_to_dict(table_name) - target_table = table_dict.get('table_name') if not is_temporary else table_dict.get('temp_table_name') + target_table = ( + table_dict.get('table_name') + if not is_temporary + else table_dict.get('temp_table_name') + ) sql = 'DROP TABLE IF EXISTS {}."{}"'.format(target_schema, target_table.upper()) self.query(sql) - def create_table(self, target_schema: str, table_name: str, columns: List[str], primary_key: List[str], - is_temporary: bool = False, sort_columns=False): + def create_table( + self, + target_schema: str, + table_name: str, + columns: List[str], + primary_key: List[str], + is_temporary: bool = False, + sort_columns=False, + ): table_dict = utils.tablename_to_dict(table_name) - target_table = table_dict.get('table_name') if not is_temporary else table_dict.get('temp_table_name') + target_table = ( + table_dict.get('table_name') + if not is_temporary + else table_dict.get('temp_table_name') + ) # skip the EXTRACTED, BATCHED and DELETED columns in case they exist because they gonna be added later - columns = [c for c in columns if not (c.startswith(self.EXTRACTED_AT_COLUMN) or - c.startswith(self.BATCHED_AT_COLUMN) or - c.startswith(self.DELETED_AT_COLUMN))] + columns = [ + c + for c in columns + if not ( + c.startswith(self.EXTRACTED_AT_COLUMN) + or c.startswith(self.BATCHED_AT_COLUMN) + or c.startswith(self.DELETED_AT_COLUMN) + ) + ] - columns += [f'{self.EXTRACTED_AT_COLUMN} TIMESTAMP WITHOUT TIME ZONE', - f'{self.BATCHED_AT_COLUMN} TIMESTAMP WITHOUT TIME ZONE', - f'{self.DELETED_AT_COLUMN} CHARACTER VARYING'] + columns += [ + f'{self.EXTRACTED_AT_COLUMN} TIMESTAMP WITHOUT TIME ZONE', + f'{self.BATCHED_AT_COLUMN} TIMESTAMP WITHOUT TIME ZONE', + f'{self.DELETED_AT_COLUMN} CHARACTER VARYING', + ] # We need the sort the columns for some taps( for now tap-s3-csv) # because later on when copying a csv file into Snowflake @@ -129,80 +164,121 @@ def create_table(self, target_schema: str, table_name: str, columns: List[str], sql_columns = ','.join(columns) sql_primary_keys = ','.join(primary_key) if primary_key else None - sql = f'CREATE TABLE IF NOT EXISTS {target_schema}."{target_table.upper()}" (' \ - f'{sql_columns}' \ - f'{f", PRIMARY KEY ({sql_primary_keys}))" if primary_key else ")"}' + sql = ( + f'CREATE TABLE IF NOT EXISTS {target_schema}."{target_table.upper()}" (' + f'{sql_columns}' + f'{f", PRIMARY KEY ({sql_primary_keys}))" if primary_key else ")"}' + ) self.query(sql) - def copy_to_table(self, s3_key, target_schema, table_name, size_bytes, is_temporary, skip_csv_header=False): + def copy_to_table( + self, + s3_key, + target_schema, + table_name, + size_bytes, + is_temporary, + skip_csv_header=False, + ): LOGGER.info('Loading %s into Redshift...', s3_key) table_dict = utils.tablename_to_dict(table_name) - target_table = table_dict.get('table_name') if not is_temporary else table_dict.get('temp_table_name') + target_table = ( + table_dict.get('table_name') + if not is_temporary + else table_dict.get('temp_table_name') + ) inserts = 0 bucket = self.connection_config['s3_bucket'] # Step 1: Generate copy credentials - prefer role if provided, otherwise use access and secret keys - copy_credentials = """ + copy_credentials = ( + """ iam_role '{aws_role_arn}' - """.format(aws_role_arn=self.connection_config['aws_redshift_copy_role_arn']) \ - if 
self.connection_config.get('aws_redshift_copy_role_arn') else """ + """.format( + aws_role_arn=self.connection_config['aws_redshift_copy_role_arn'] + ) + if self.connection_config.get('aws_redshift_copy_role_arn') + else """ ACCESS_KEY_ID '{aws_access_key_id}' SECRET_ACCESS_KEY '{aws_secret_access_key}' {aws_session_token} """.format( - aws_access_key_id=self.connection_config['aws_access_key_id'], - aws_secret_access_key=self.connection_config['aws_secret_access_key'], - aws_session_token="SESSION_TOKEN '{}'".format(self.connection_config['aws_session_token']) \ - if self.connection_config.get('aws_session_token') else '', + aws_access_key_id=self.connection_config['aws_access_key_id'], + aws_secret_access_key=self.connection_config['aws_secret_access_key'], + aws_session_token="SESSION_TOKEN '{}'".format( + self.connection_config['aws_session_token'] + ) + if self.connection_config.get('aws_session_token') + else '', + ) ) # Step 2: Generate copy options - Override defaults from config.json if defined - copy_options = self.connection_config.get('copy_options', f""" + copy_options = self.connection_config.get( + 'copy_options', + f""" EMPTYASNULL BLANKSASNULL TRIMBLANKS TRUNCATECOLUMNS IGNOREHEADER {int(skip_csv_header)} TIMEFORMAT 'auto' - """) + """, + ) # Step3: Using the built-in CSV COPY option to load - copy_sql = f'COPY {target_schema}."{target_table.upper()}" FROM \'s3://{bucket}/{s3_key}\'' \ - f'{copy_credentials}' \ - f'{copy_options}' \ - f'CSV GZIP' + copy_sql = ( + f'COPY {target_schema}."{target_table.upper()}" FROM \'s3://{bucket}/{s3_key}\'' + f'{copy_credentials}' + f'{copy_options}' + f'CSV GZIP' + ) # Get number of inserted records - COPY does insert only results = self.query(copy_sql) if len(results) > 0: inserts = results[0].get('rows_loaded', 0) - LOGGER.info('Loading into %s."%s": %s', - target_schema, - target_table.upper(), - json.dumps({'inserts': inserts, 'updates': 0, 'size_bytes': size_bytes})) + LOGGER.info( + 'Loading into %s."%s": %s', + target_schema, + target_table.upper(), + json.dumps({'inserts': inserts, 'updates': 0, 'size_bytes': size_bytes}), + ) LOGGER.info('Deleting %s from S3...', s3_key) self.s3.delete_object(Bucket=bucket, Key=s3_key) - def grant_select_on_table(self, target_schema, table_name, grantee, is_temporary, to_group=False): + def grant_select_on_table( + self, target_schema, table_name, grantee, is_temporary, to_group=False + ): # Grant role is not mandatory parameter, do nothing if not specified if grantee: table_dict = utils.tablename_to_dict(table_name) - target_table = table_dict.get('table_name') if not is_temporary else table_dict.get('temp_table_name') - sql = 'GRANT SELECT ON {}."{}" TO {} {}'.format(target_schema, - target_table.upper(), 'GROUP' if to_group else '', - grantee) + target_table = ( + table_dict.get('table_name') + if not is_temporary + else table_dict.get('temp_table_name') + ) + sql = 'GRANT SELECT ON {}."{}" TO {} {}'.format( + target_schema, + target_table.upper(), + 'GROUP' if to_group else '', + grantee, + ) self.query(sql) def grant_usage_on_schema(self, target_schema, grantee, to_group=False): # Grant role is not mandatory parameter, do nothing if not specified if grantee: - sql = 'GRANT USAGE ON SCHEMA {} TO {} {}'.format(target_schema, 'GROUP' if to_group else '', grantee) + sql = 'GRANT USAGE ON SCHEMA {} TO {} {}'.format( + target_schema, 'GROUP' if to_group else '', grantee + ) self.query(sql) def grant_select_on_schema(self, target_schema, grantee, to_group=False): # Grant role is not mandatory 
parameter, do nothing if not specified if grantee: - sql = 'GRANT SELECT ON ALL TABLES IN SCHEMA {} TO {} {}'.format(target_schema, 'GROUP' if to_group else '', - grantee) + sql = 'GRANT SELECT ON ALL TABLES IN SCHEMA {} TO {} {}'.format( + target_schema, 'GROUP' if to_group else '', grantee + ) self.query(sql) # pylint: disable=duplicate-string-formatting-argument @@ -220,7 +296,9 @@ def obfuscate_columns(self, target_schema, table_name): # # We need to convert to the same format to find the transformation # has that has to be applied - tap_stream_name_by_table_name = '{}-{}'.format(table_dict.get('schema_name'), table_dict.get('table_name')) + tap_stream_name_by_table_name = '{}-{}'.format( + table_dict.get('schema_name'), table_dict.get('table_name') + ) if trans.get('tap_stream_name') == tap_stream_name_by_table_name: column = trans.get('field_id') transform_type = trans.get('type') @@ -230,10 +308,17 @@ def obfuscate_columns(self, target_schema, table_name): trans_cols.append('"{}" = FUNC_SHA1("{}")'.format(column, column)) elif 'HASH-SKIP-FIRST' in transform_type: skip_first_n = transform_type[-1] - trans_cols.append('"{}" = CONCAT(SUBSTRING("{}", 1, {}), FUNC_SHA1(SUBSTRING("{}", {} + 1)))' - .format(column, column, skip_first_n, column, skip_first_n)) + trans_cols.append( + '"{}" = CONCAT(SUBSTRING("{}", 1, {}), FUNC_SHA1(SUBSTRING("{}", {} + 1)))'.format( + column, column, skip_first_n, column, skip_first_n + ) + ) elif transform_type == 'MASK-DATE': - trans_cols.append('"{}" = TO_CHAR("{}"::DATE, \'YYYY-01-01\')::DATE'.format(column, column)) + trans_cols.append( + '"{}" = TO_CHAR("{}"::DATE, \'YYYY-01-01\')::DATE'.format( + column, column + ) + ) elif transform_type == 'MASK-NUMBER': trans_cols.append('"{}" = 0'.format(column)) @@ -252,4 +337,8 @@ def swap_tables(self, schema, table_name): # Swap tables and drop the temp tamp self.query('DROP TABLE IF EXISTS {}."{}"'.format(schema, target_table.upper())) - self.query('ALTER TABLE {}."{}" RENAME TO "{}"'.format(schema, temp_table.upper(), target_table.upper())) + self.query( + 'ALTER TABLE {}."{}" RENAME TO "{}"'.format( + schema, temp_table.upper(), target_table.upper() + ) + ) diff --git a/pipelinewise/fastsync/commons/target_snowflake.py b/pipelinewise/fastsync/commons/target_snowflake.py index 685490b46..d1f495087 100644 --- a/pipelinewise/fastsync/commons/target_snowflake.py +++ b/pipelinewise/fastsync/commons/target_snowflake.py @@ -6,8 +6,7 @@ from typing import List, Dict from snowflake.connector.encryption_util import SnowflakeEncryptionUtil -from snowflake.connector.remote_storage_util import \ - SnowflakeFileEncryptionMaterial +from snowflake.connector.remote_storage_util import SnowflakeFileEncryptionMaterial from . 
import utils from .transform_utils import TransformationHelper, SQLFlavor @@ -30,27 +29,36 @@ def __init__(self, connection_config, transformation_config=None): self.transformation_config = transformation_config # Get the required parameters from config file and/or environment variables - aws_profile = self.connection_config.get('aws_profile') or os.environ.get('AWS_PROFILE') - aws_access_key_id = self.connection_config.get('aws_access_key_id') or os.environ.get('AWS_ACCESS_KEY_ID') - aws_secret_access_key = self.connection_config.get('aws_secret_access_key') or \ - os.environ.get('AWS_SECRET_ACCESS_KEY') - aws_session_token = self.connection_config.get('aws_session_token') or os.environ.get('AWS_SESSION_TOKEN') + aws_profile = self.connection_config.get('aws_profile') or os.environ.get( + 'AWS_PROFILE' + ) + aws_access_key_id = self.connection_config.get( + 'aws_access_key_id' + ) or os.environ.get('AWS_ACCESS_KEY_ID') + aws_secret_access_key = self.connection_config.get( + 'aws_secret_access_key' + ) or os.environ.get('AWS_SECRET_ACCESS_KEY') + aws_session_token = self.connection_config.get( + 'aws_session_token' + ) or os.environ.get('AWS_SESSION_TOKEN') # AWS credentials based authentication if aws_access_key_id and aws_secret_access_key: aws_session = boto3.session.Session( aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key, - aws_session_token=aws_session_token + aws_session_token=aws_session_token, ) # AWS Profile based authentication else: aws_session = boto3.session.Session(profile_name=aws_profile) # Create the s3 client - self.s3 = aws_session.client('s3', - region_name=self.connection_config.get('s3_region_name'), - endpoint_url=self.connection_config.get('s3_endpoint_url')) + self.s3 = aws_session.client( + 's3', + region_name=self.connection_config.get('s3_region_name'), + endpoint_url=self.connection_config.get('s3_endpoint_url'), + ) def create_query_tag(self, query_tag_props: dict = None) -> str: schema = None @@ -60,24 +68,30 @@ def create_query_tag(self, query_tag_props: dict = None) -> str: schema = query_tag_props.get('schema') table = query_tag_props.get('table') - return json.dumps({'ppw_component': 'fastsync', - 'tap_id': self.connection_config.get('tap_id'), - 'database': self.connection_config['dbname'], - 'schema': schema, - 'table': table}) + return json.dumps( + { + 'ppw_component': 'fastsync', + 'tap_id': self.connection_config.get('tap_id'), + 'database': self.connection_config['dbname'], + 'schema': schema, + 'table': table, + } + ) def open_connection(self, query_tag_props=None): - return snowflake.connector.connect(user=self.connection_config['user'], - password=self.connection_config['password'], - account=self.connection_config['account'], - database=self.connection_config['dbname'], - warehouse=self.connection_config['warehouse'], - autocommit=True, - session_parameters={ - # Quoted identifiers should be case sensitive - 'QUOTED_IDENTIFIERS_IGNORE_CASE': 'FALSE', - 'QUERY_TAG': self.create_query_tag(query_tag_props) - }) + return snowflake.connector.connect( + user=self.connection_config['user'], + password=self.connection_config['password'], + account=self.connection_config['account'], + database=self.connection_config['dbname'], + warehouse=self.connection_config['warehouse'], + autocommit=True, + session_parameters={ + # Quoted identifiers should be case sensitive + 'QUOTED_IDENTIFIERS_IGNORE_CASE': 'FALSE', + 'QUERY_TAG': self.create_query_tag(query_tag_props), + }, + ) def query(self, query, params=None, 
query_tag_props=None): LOGGER.debug('Running query: %s', query) @@ -96,7 +110,12 @@ def upload_to_s3(self, file, tmp_dir=None): s3_key_prefix = self.connection_config.get('s3_key_prefix', '') s3_key = '{}{}'.format(s3_key_prefix, os.path.basename(file)) - LOGGER.info('Uploading to S3 bucket: %s, local file: %s, S3 key: %s', bucket, file, s3_key) + LOGGER.info( + 'Uploading to S3 bucket: %s, local file: %s, S3 key: %s', + bucket, + file, + s3_key, + ) # Encrypt csv if client side encryption enabled master_key = self.connection_config.get('client_side_encryption_master_key', '') @@ -104,14 +123,10 @@ def upload_to_s3(self, file, tmp_dir=None): # Encrypt the file LOGGER.info('Encrypting file %s...', file) encryption_material = SnowflakeFileEncryptionMaterial( - query_stage_master_key=master_key, - query_id='', - smk_id=0 + query_stage_master_key=master_key, query_id='', smk_id=0 ) encryption_metadata, encrypted_file = SnowflakeEncryptionUtil.encrypt_file( - encryption_material, - file, - tmp_dir=tmp_dir + encryption_material, file, tmp_dir=tmp_dir ) # Upload to s3 @@ -120,7 +135,7 @@ def upload_to_s3(self, file, tmp_dir=None): # Send key and iv in the metadata, that will be required to decrypt and upload the encrypted file extra_args['Metadata'] = { 'x-amz-key': encryption_metadata.key, - 'x-amz-iv': encryption_metadata.iv + 'x-amz-iv': encryption_metadata.iv, } self.s3.upload_file(encrypted_file, bucket, s3_key, ExtraArgs=extra_args) @@ -144,28 +159,43 @@ def copy_to_archive(self, source_s3_key, tap_id, table): archive_file_basename = os.path.basename(source_s3_key) # Get archive s3 prefix from config, defaulting to 'archive' if not specified - archive_s3_prefix = self.connection_config.get('archive_load_files_s3_prefix', 'archive') + archive_s3_prefix = self.connection_config.get( + 'archive_load_files_s3_prefix', 'archive' + ) source_s3_bucket = self.connection_config.get('s3_bucket') # Combine existing metadata with archive related headers - metadata = self.s3.head_object(Bucket=source_s3_bucket, Key=source_s3_key).get('Metadata', {}) - metadata.update({ - 'tap': tap_id, - 'schema': archive_schema, - 'table': archive_table, - 'archived-by': 'pipelinewise_fastsync_postgres_to_snowflake' - }) + metadata = self.s3.head_object(Bucket=source_s3_bucket, Key=source_s3_key).get( + 'Metadata', {} + ) + metadata.update( + { + 'tap': tap_id, + 'schema': archive_schema, + 'table': archive_table, + 'archived-by': 'pipelinewise_fastsync_postgres_to_snowflake', + } + ) # Get archive s3 bucket from config, defaulting to same bucket used for Snowflake imports if not specified - archive_s3_bucket = self.connection_config.get('archive_load_files_s3_bucket', source_s3_bucket) + archive_s3_bucket = self.connection_config.get( + 'archive_load_files_s3_bucket', source_s3_bucket + ) - archive_key = '{}/{}/{}/{}'.format(archive_s3_prefix, tap_id, archive_table, archive_file_basename) + archive_key = '{}/{}/{}/{}'.format( + archive_s3_prefix, tap_id, archive_table, archive_file_basename + ) copy_source = '{}/{}'.format(source_s3_bucket, source_s3_key) LOGGER.info('Archiving %s to %s', copy_source, archive_key) - self.s3.copy_object(CopySource=copy_source, Bucket=archive_s3_bucket, Key=archive_key, - Metadata=metadata, MetadataDirective='REPLACE') + self.s3.copy_object( + CopySource=copy_source, + Bucket=archive_s3_bucket, + Key=archive_key, + Metadata=metadata, + MetadataDirective='REPLACE', + ) def create_schema(self, schema): sql = 'CREATE SCHEMA IF NOT EXISTS {}'.format(schema) @@ -173,25 +203,48 @@ def 
create_schema(self, schema): def drop_table(self, target_schema, table_name, is_temporary=False): table_dict = utils.tablename_to_dict(table_name) - target_table = table_dict.get('table_name') if not is_temporary else table_dict.get('temp_table_name') + target_table = ( + table_dict.get('table_name') + if not is_temporary + else table_dict.get('temp_table_name') + ) sql = 'DROP TABLE IF EXISTS {}."{}"'.format(target_schema, target_table.upper()) self.query(sql, query_tag_props={'schema': target_schema, 'table': table_name}) - def create_table(self, target_schema: str, table_name: str, columns: List[str], primary_key: List[str], - is_temporary: bool = False, sort_columns=False): + def create_table( + self, + target_schema: str, + table_name: str, + columns: List[str], + primary_key: List[str], + is_temporary: bool = False, + sort_columns=False, + ): table_dict = utils.tablename_to_dict(table_name) - target_table = table_dict.get('table_name') if not is_temporary else table_dict.get('temp_table_name') + target_table = ( + table_dict.get('table_name') + if not is_temporary + else table_dict.get('temp_table_name') + ) # skip the EXTRACTED, BATCHED and DELETED columns in case they exist because they gonna be added later - columns = [c for c in columns if not (c.startswith(utils.SDC_EXTRACTED_AT) or - c.startswith(utils.SDC_BATCHED_AT) or - c.startswith(utils.SDC_DELETED_AT))] + columns = [ + c + for c in columns + if not ( + c.startswith(utils.SDC_EXTRACTED_AT) + or c.startswith(utils.SDC_BATCHED_AT) + or c.startswith(utils.SDC_DELETED_AT) + ) + ] - columns += [f'{utils.SDC_EXTRACTED_AT} TIMESTAMP_NTZ', - f'{utils.SDC_BATCHED_AT} TIMESTAMP_NTZ', - f'{utils.SDC_DELETED_AT} VARCHAR'] + columns += [ + f'{utils.SDC_EXTRACTED_AT} TIMESTAMP_NTZ', + f'{utils.SDC_BATCHED_AT} TIMESTAMP_NTZ', + f'{utils.SDC_DELETED_AT} VARCHAR', + ] # We need the sort the columns for some taps( for now tap-s3-csv) # because later on when copying a csv file into Snowflake @@ -201,49 +254,85 @@ def create_table(self, target_schema: str, table_name: str, columns: List[str], sql_columns = ','.join(columns) sql_primary_keys = ','.join(primary_key) if primary_key else None - sql = f'CREATE OR REPLACE TABLE {target_schema}."{target_table.upper()}" (' \ - f'{sql_columns}' \ - f'{f", PRIMARY KEY ({sql_primary_keys}))" if primary_key else ")"}' + sql = ( + f'CREATE OR REPLACE TABLE {target_schema}."{target_table.upper()}" (' + f'{sql_columns}' + f'{f", PRIMARY KEY ({sql_primary_keys}))" if primary_key else ")"}' + ) - self.query(sql, query_tag_props={'schema': target_schema, 'table': target_table}) + self.query( + sql, query_tag_props={'schema': target_schema, 'table': target_table} + ) # pylint: disable=too-many-locals - def copy_to_table(self, s3_key, target_schema, table_name, size_bytes, is_temporary, skip_csv_header=False): + def copy_to_table( + self, + s3_key, + target_schema, + table_name, + size_bytes, + is_temporary, + skip_csv_header=False, + ): LOGGER.info('Loading %s into Snowflake...', s3_key) table_dict = utils.tablename_to_dict(table_name) - target_table = table_dict.get('table_name') if not is_temporary else table_dict.get('temp_table_name') + target_table = ( + table_dict.get('table_name') + if not is_temporary + else table_dict.get('temp_table_name') + ) inserts = 0 stage = self.connection_config['stage'] - sql = f'COPY INTO {target_schema}."{target_table.upper()}" FROM \'@{stage}/{s3_key}\'' \ - f' FILE_FORMAT = (type=CSV escape=\'\\x1e\' escape_unenclosed_field=\'\\x1e\'' \ - f' 
field_optionally_enclosed_by=\'\"\' skip_header={int(skip_csv_header)}' \ - f' compression=GZIP binary_format=HEX)' + sql = ( + f'COPY INTO {target_schema}."{target_table.upper()}" FROM \'@{stage}/{s3_key}\'' + f' FILE_FORMAT = (type=CSV escape=\'\\x1e\' escape_unenclosed_field=\'\\x1e\'' + f' field_optionally_enclosed_by=\'\"\' skip_header={int(skip_csv_header)}' + f' compression=GZIP binary_format=HEX)' + ) # Get number of inserted records - COPY does insert only - results = self.query(sql, query_tag_props={'schema': target_schema, 'table': target_table}) + results = self.query( + sql, query_tag_props={'schema': target_schema, 'table': target_table} + ) if len(results) > 0: inserts = sum([file_part.get('rows_loaded', 0) for file_part in results]) - LOGGER.info('Loading into %s."%s": %s', - target_schema, - target_table.upper(), - json.dumps({'inserts': inserts, - 'updates': 0, - 'file_parts': len(results), - 'size_bytes': size_bytes})) + LOGGER.info( + 'Loading into %s."%s": %s', + target_schema, + target_table.upper(), + json.dumps( + { + 'inserts': inserts, + 'updates': 0, + 'file_parts': len(results), + 'size_bytes': size_bytes, + } + ), + ) # grant_... functions are common functions called by utils.py: grant_privilege function # "to_group" is not used here but exists for compatibility reasons with other database types # "to_group" is for databases that can grant to users and groups separately like Amazon Redshift # pylint: disable=unused-argument - def grant_select_on_table(self, target_schema, table_name, role, is_temporary, to_group=False): + def grant_select_on_table( + self, target_schema, table_name, role, is_temporary, to_group=False + ): # Grant role is not mandatory parameter, do nothing if not specified if role: table_dict = utils.tablename_to_dict(table_name) - target_table = table_dict.get('table_name') if not is_temporary else table_dict.get('temp_table_name') - sql = 'GRANT SELECT ON {}."{}" TO ROLE {}'.format(target_schema, target_table.upper(), role) - self.query(sql, query_tag_props={'schema': target_schema, 'table': table_name}) + target_table = ( + table_dict.get('table_name') + if not is_temporary + else table_dict.get('temp_table_name') + ) + sql = 'GRANT SELECT ON {}."{}" TO ROLE {}'.format( + target_schema, target_table.upper(), role + ) + self.query( + sql, query_tag_props={'schema': target_schema, 'table': table_name} + ) # pylint: disable=unused-argument def grant_usage_on_schema(self, target_schema, role, to_group=False): @@ -256,7 +345,9 @@ def grant_usage_on_schema(self, target_schema, role, to_group=False): def grant_select_on_schema(self, target_schema, role, to_group=False): # Grant role is not mandatory parameter, do nothing if not specified if role: - sql = 'GRANT SELECT ON ALL TABLES IN SCHEMA {} TO ROLE {}'.format(target_schema, role) + sql = 'GRANT SELECT ON ALL TABLES IN SCHEMA {} TO ROLE {}'.format( + target_schema, role + ) self.query(sql, query_tag_props={'schema': target_schema}) def obfuscate_columns(self, target_schema: str, table_name: str): @@ -277,15 +368,17 @@ def obfuscate_columns(self, target_schema: str, table_name: str): # # We need to convert to the same format to find the transformation # has that has to be applied - tap_stream_name_by_table_name = '{}-{}'.format(table_dict['schema_name'], table_dict['table_name']) \ - if table_dict['schema_name'] is not None else table_dict['table_name'] + tap_stream_name_by_table_name = ( + '{}-{}'.format(table_dict['schema_name'], table_dict['table_name']) + if table_dict['schema_name'] is not 
None + else table_dict['table_name'] + ) # Find obfuscation rules for the current table # trans_map = self.__get_stream_transformation_map(tap_stream_name_by_table_name, transformations) trans_map = TransformationHelper.get_trans_in_sql_flavor( - tap_stream_name_by_table_name, - transformations, - SQLFlavor('snowflake')) + tap_stream_name_by_table_name, transformations, SQLFlavor('snowflake') + ) self.__apply_transformations(trans_map, target_schema, temp_table) @@ -304,13 +397,19 @@ def swap_tables(self, schema, table_name) -> None: temp_table = table_dict.get('temp_table_name') # Swap tables and drop the temp tamp - self.query(f'ALTER TABLE {schema}."{temp_table.upper()}" SWAP WITH {schema}."{target_table.upper()}"', - query_tag_props={'schema': schema, 'table': target_table}) - - self.query(f'DROP TABLE IF EXISTS {schema}."{temp_table.upper()}"', - query_tag_props={'schema': schema, 'table': temp_table}) - - def __apply_transformations(self, transformations: List[Dict], target_schema: str, table_name: str) -> None: + self.query( + f'ALTER TABLE {schema}."{temp_table.upper()}" SWAP WITH {schema}."{target_table.upper()}"', + query_tag_props={'schema': schema, 'table': target_table}, + ) + + self.query( + f'DROP TABLE IF EXISTS {schema}."{temp_table.upper()}"', + query_tag_props={'schema': schema, 'table': temp_table}, + ) + + def __apply_transformations( + self, transformations: List[Dict], target_schema: str, table_name: str + ) -> None: """ Generate and execute the SQL queries based on the given transformations. Args: @@ -330,10 +429,15 @@ def __apply_transformations(self, transformations: List[Dict], target_schema: st # If we have conditions, then we need to construct the query and execute it to transform the # single column conditionally if trans_item['conditions']: - sql = f'UPDATE {full_qual_table_name} ' \ - f'SET {trans_item["trans"]} WHERE {trans_item["conditions"]};' + sql = ( + f'UPDATE {full_qual_table_name} ' + f'SET {trans_item["trans"]} WHERE {trans_item["conditions"]};' + ) - self.query(sql, query_tag_props={'schema': target_schema, 'table': table_name}) + self.query( + sql, + query_tag_props={'schema': target_schema, 'table': table_name}, + ) # Otherwise, we can add this column to a general UPDATE query with no predicates else: @@ -343,10 +447,17 @@ def __apply_transformations(self, transformations: List[Dict], target_schema: st if not all_cols_update_sql: all_cols_update_sql = trans_item['trans'] else: - all_cols_update_sql = f'{all_cols_update_sql}, {trans_item["trans"]}' + all_cols_update_sql = ( + f'{all_cols_update_sql}, {trans_item["trans"]}' + ) # If we have some non-conditional transformations then construct and execute a query if all_cols_update_sql: - all_cols_update_sql = f'UPDATE {full_qual_table_name} SET {all_cols_update_sql};' - - self.query(all_cols_update_sql, query_tag_props={'schema': target_schema, 'table': table_name}) + all_cols_update_sql = ( + f'UPDATE {full_qual_table_name} SET {all_cols_update_sql};' + ) + + self.query( + all_cols_update_sql, + query_tag_props={'schema': target_schema, 'table': table_name}, + ) diff --git a/pipelinewise/fastsync/commons/transform_utils.py b/pipelinewise/fastsync/commons/transform_utils.py index 6dc9a300c..ef831f8a5 100644 --- a/pipelinewise/fastsync/commons/transform_utils.py +++ b/pipelinewise/fastsync/commons/transform_utils.py @@ -7,6 +7,7 @@ class TransformationType(Enum): """ List of supported transformation types """ + SET_NULL = 'SET-NULL' MASK_HIDDEN = 'MASK-HIDDEN' MASK_DATE = 'MASK-DATE' @@ -28,6 +29,7 
@@ class SQLFlavor(Enum): """ List of supported sql flavors """ + SNOWFLAKE = 'snowflake' POSTGRES = 'postgres' BIGQUERY = 'bigquery' @@ -41,10 +43,8 @@ class TransformationHelper: @classmethod def get_trans_in_sql_flavor( - cls, - stream_name: str, - transformations: List[Dict], - sql_flavor: SQLFlavor) -> List[Dict]: + cls, stream_name: str, transformations: List[Dict], sql_flavor: SQLFlavor + ) -> List[Dict]: """ Find the transformations to apply to the given stream and does proper formatting and mapping @@ -78,52 +78,56 @@ def get_trans_in_sql_flavor( conditions = cls.__conditions_to_sql(transform_conditions, sql_flavor) if transform_type == TransformationType.SET_NULL: - trans_map.append({ - 'trans': f'{column} = NULL', - 'conditions': conditions - }) + trans_map.append( + {'trans': f'{column} = NULL', 'conditions': conditions} + ) elif transform_type == TransformationType.HASH: - trans_map.append({ - 'trans': cls.__hash_to_sql(column, sql_flavor), - 'conditions': conditions - }) + trans_map.append( + { + 'trans': cls.__hash_to_sql(column, sql_flavor), + 'conditions': conditions, + } + ) elif transform_type.value.startswith('HASH-SKIP-FIRST-'): - trans_map.append({ - 'trans': cls.__hash_skip_first_to_sql(transform_type, column, sql_flavor), - 'conditions': conditions - }) + trans_map.append( + { + 'trans': cls.__hash_skip_first_to_sql( + transform_type, column, sql_flavor + ), + 'conditions': conditions, + } + ) elif transform_type == TransformationType.MASK_DATE: - trans_map.append({ - 'trans': cls.__mask_date_to_sql(column, sql_flavor), - 'conditions': conditions - }) + trans_map.append( + { + 'trans': cls.__mask_date_to_sql(column, sql_flavor), + 'conditions': conditions, + } + ) elif transform_type == TransformationType.MASK_NUMBER: - trans_map.append({ - 'trans': f'{column} = 0', - 'conditions': conditions - }) + trans_map.append( + {'trans': f'{column} = 0', 'conditions': conditions} + ) elif transform_type == TransformationType.MASK_HIDDEN: - trans_map.append({ - 'trans': f"{column} = 'hidden'", - 'conditions': conditions - }) + trans_map.append( + {'trans': f"{column} = 'hidden'", 'conditions': conditions} + ) return trans_map @classmethod # pylint: disable=W0238 # False positive when it is used by another classmethod def __conditions_to_sql( - cls, - transform_conditions: List[Dict], - sql_flavor: SQLFlavor) -> Optional[str]: + cls, transform_conditions: List[Dict], sql_flavor: SQLFlavor + ) -> Optional[str]: """ Convert the conditional transformations into equivalent form in SF SQL. Args: @@ -160,7 +164,11 @@ def __conditions_to_sql( else: operator = '=' - value = f"'{condition['equals']}'" if isinstance(condition['equals'], str) else condition['equals'] + value = ( + f"'{condition['equals']}'" + if isinstance(condition['equals'], str) + else condition['equals'] + ) elif 'regex_match' in condition: @@ -173,17 +181,23 @@ def __conditions_to_sql( operator = '~' elif sql_flavor == SQLFlavor.BIGQUERY: - conditions.append(f"REGEXP_CONTAINS({cls.__safe_column(condition['column'], sql_flavor)}, {value})") + conditions.append( + f"REGEXP_CONTAINS({cls.__safe_column(condition['column'], sql_flavor)}, {value})" + ) continue else: - raise NotImplementedError(f'regex_match conditional transformation in {sql_flavor.value} SQL ' - f'flavor not implemented!') + raise NotImplementedError( + f'regex_match conditional transformation in {sql_flavor.value} SQL ' + f'flavor not implemented!' 
+ ) else: continue - conditions.append(f"({cls.__safe_column(condition['column'], sql_flavor)} {operator} {value})") + conditions.append( + f"({cls.__safe_column(condition['column'], sql_flavor)} {operator} {value})" + ) return ' AND '.join(conditions) @@ -229,13 +243,16 @@ def __hash_to_sql(cls, column: str, sql_flavor: SQLFlavor) -> str: else: raise NotImplementedError( - f'HASH transformation in {sql_flavor.value} SQL flavor not implemented!') + f'HASH transformation in {sql_flavor.value} SQL flavor not implemented!' + ) return trans @classmethod # pylint: disable=W0238 # False positive when it is used by another classmethod - def __hash_skip_first_to_sql(cls, transform_type: TransformationType, column: str, sql_flavor: SQLFlavor) -> str: + def __hash_skip_first_to_sql( + cls, transform_type: TransformationType, column: str, sql_flavor: SQLFlavor + ) -> str: """ convert HASH-SKIP-FIRST-n transformation into the right sql string Args: @@ -251,16 +268,22 @@ def __hash_skip_first_to_sql(cls, transform_type: TransformationType, column: st if sql_flavor == SQLFlavor.SNOWFLAKE: trans = '{0} = CONCAT(SUBSTRING({0}, 1, {1}), SHA2(SUBSTRING({0}, {1} + 1), 256))'.format( - column, skip_first_n) + column, skip_first_n + ) elif sql_flavor == SQLFlavor.POSTGRES: - trans = '{0} = CONCAT(SUBSTRING({0}, 1, {1}), ENCODE(DIGEST(SUBSTRING({0}, {1} + 1), ' \ - '\'sha256\'), \'hex\'))'.format(column, skip_first_n) + trans = ( + '{0} = CONCAT(SUBSTRING({0}, 1, {1}), ENCODE(DIGEST(SUBSTRING({0}, {1} + 1), ' + '\'sha256\'), \'hex\'))'.format(column, skip_first_n) + ) elif sql_flavor == SQLFlavor.BIGQUERY: trans = '{0} = CONCAT(SUBSTRING({0}, 1, {1}), TO_BASE64(SHA256(SUBSTRING({0}, {1} + 1))))'.format( - column, skip_first_n) + column, skip_first_n + ) else: - raise NotImplementedError(f'HASH-SKIP-FIRST-{skip_first_n} transformation in {sql_flavor.value} SQL flavor ' - f'not implemented!') + raise NotImplementedError( + f'HASH-SKIP-FIRST-{skip_first_n} transformation in {sql_flavor.value} SQL flavor ' + f'not implemented!' + ) return trans @@ -278,24 +301,32 @@ def __mask_date_to_sql(cls, column: str, sql_flavor: SQLFlavor) -> str: Returns: sql string equivalent of the mask date """ if sql_flavor == SQLFlavor.SNOWFLAKE: - trans = f'{column} = TIMESTAMP_NTZ_FROM_PARTS(' \ - f'DATE_FROM_PARTS(YEAR({column}), 1, 1),' \ - f'TO_TIME({column}))' + trans = ( + f'{column} = TIMESTAMP_NTZ_FROM_PARTS(' + f'DATE_FROM_PARTS(YEAR({column}), 1, 1),' + f'TO_TIME({column}))' + ) elif sql_flavor == SQLFlavor.POSTGRES: - trans = '{0} = MAKE_TIMESTAMP(' \ - 'DATE_PART(\'year\', {0})::int, ' \ - '1, ' \ - '1, ' \ - 'DATE_PART(\'hour\', {0})::int, ' \ - 'DATE_PART(\'minute\', {0})::int, ' \ - 'DATE_PART(\'second\', {0})::double precision)'.format(column) + trans = ( + '{0} = MAKE_TIMESTAMP(' + 'DATE_PART(\'year\', {0})::int, ' + '1, ' + '1, ' + 'DATE_PART(\'hour\', {0})::int, ' + 'DATE_PART(\'minute\', {0})::int, ' + 'DATE_PART(\'second\', {0})::double precision)'.format(column) + ) elif sql_flavor == SQLFlavor.BIGQUERY: - trans = f'{column} = TIMESTAMP(DATETIME(' \ - f'DATE(EXTRACT(YEAR FROM {column}), 1, 1),' \ - f'TIME({column})))' + trans = ( + f'{column} = TIMESTAMP(DATETIME(' + f'DATE(EXTRACT(YEAR FROM {column}), 1, 1),' + f'TIME({column})))' + ) else: - raise NotImplementedError(f'MASK-DATE transformation in {sql_flavor.value} SQL flavor ' - f'not implemented!') + raise NotImplementedError( + f'MASK-DATE transformation in {sql_flavor.value} SQL flavor ' + f'not implemented!' 
+ ) return trans diff --git a/pipelinewise/fastsync/commons/utils.py b/pipelinewise/fastsync/commons/utils.py index 3beb3efdb..761101c97 100644 --- a/pipelinewise/fastsync/commons/utils.py +++ b/pipelinewise/fastsync/commons/utils.py @@ -19,6 +19,7 @@ class NotSelectedTableException(Exception): """ Exception to raise when a table is not selected for resync """ + def __init__(self, table_name, selected_tables): self.message = f'Cannot Resync unselected table "{table_name}"! Selected tables are: {selected_tables}' super().__init__(self, self.message) @@ -70,7 +71,7 @@ def tablename_to_dict(table, separator='.'): 'catalog_name': catalog_name, 'schema_name': schema_name, 'table_name': table_name, - 'temp_table_name': '{}_temp'.format(table_name) + 'temp_table_name': '{}_temp'.format(table_name), } @@ -84,8 +85,14 @@ def get_tables_from_properties(properties: Dict) -> set: metadata = stream.get('metadata', []) table_name = stream.get('table_name', stream['stream']) - table_meta = next((i for i in metadata if isinstance(i, dict) and len(i.get('breadcrumb', [])) == 0), - {}).get('metadata') + table_meta = next( + ( + i + for i in metadata + if isinstance(i, dict) and len(i.get('breadcrumb', [])) == 0 + ), + {}, + ).get('metadata') selected = table_meta.get('selected', False) schema_name = table_meta.get('schema-name') db_name = table_meta.get('database-name') @@ -100,11 +107,7 @@ def get_tables_from_properties(properties: Dict) -> set: return tables -def get_bookmark_for_table( - table, - properties, - db_engine, - dbname=None): +def get_bookmark_for_table(table, properties, db_engine, dbname=None): """Get actual bookmark for a specific table used for LOG_BASED or INCREMENTAL replications """ @@ -116,24 +119,37 @@ def get_bookmark_for_table( table_name = stream.get('table_name', stream['stream']) # Get table specific metadata i.e. replication method, replication key, etc. 
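# --- Illustrative sketch (editor's note, not part of the upstream diff) ---
# The next(...) lookup used in these helpers selects the table-level metadata
# entry, i.e. the catalog item whose 'breadcrumb' is an empty list. The catalog
# fragment below is hypothetical, but the selection logic mirrors the code.
example_metadata = [
    {
        'breadcrumb': [],
        'metadata': {
            'selected': True,
            'schema-name': 'public',
            'replication-method': 'INCREMENTAL',
            'replication-key': 'updated_at',
        },
    },
    {'breadcrumb': ['properties', 'id'], 'metadata': {'inclusion': 'automatic'}},
]
example_table_meta = next(
    (
        i
        for i in example_metadata
        if isinstance(i, dict) and len(i.get('breadcrumb', [])) == 0
    ),
    {},
).get('metadata')
# example_table_meta now holds the table-level settings, e.g.
# example_table_meta['replication-method'] == 'INCREMENTAL'
# ---------------------------------------------------------------------------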
- table_meta = next((i for i in metadata if isinstance(i, dict) and len(i.get('breadcrumb', [])) == 0), - {}).get('metadata') + table_meta = next( + ( + i + for i in metadata + if isinstance(i, dict) and len(i.get('breadcrumb', [])) == 0 + ), + {}, + ).get('metadata') db_name = table_meta.get('database-name') schema_name = table_meta.get('schema-name') replication_method = table_meta.get('replication-method') replication_key = table_meta.get('replication-key') - fully_qualified_table_name = '{}.{}'.format(schema_name or db_name, table_name) \ - if schema_name is not None or db_name is not None else table_name + fully_qualified_table_name = ( + '{}.{}'.format(schema_name or db_name, table_name) + if schema_name is not None or db_name is not None + else table_name + ) - if (dbname is None or db_name == dbname) and fully_qualified_table_name == table: + if ( + dbname is None or db_name == dbname + ) and fully_qualified_table_name == table: # Log based replication: get mysql binlog position if replication_method == 'LOG_BASED': bookmark = db_engine.fetch_current_log_pos() # Key based incremental replication: Get max replication key from source elif replication_method == 'INCREMENTAL': - bookmark = db_engine.fetch_current_incremental_key_pos(fully_qualified_table_name, replication_key) + bookmark = db_engine.fetch_current_incremental_key_pos( + fully_qualified_table_name, replication_key + ) break @@ -155,7 +171,9 @@ def get_target_schema(target_config, table): } """ target_schema = None - config_default_target_schema = target_config.get('default_target_schema', '').strip() + config_default_target_schema = target_config.get( + 'default_target_schema', '' + ).strip() config_schema_mapping = target_config.get('schema_mapping', {}) table_dict = tablename_to_dict(table) @@ -168,7 +186,8 @@ def get_target_schema(target_config, table): if not target_schema: raise Exception( "Target schema name not defined in config. Neither 'default_target_schema' (string) nor 'schema_mapping' " - '(object) defines target schema for {} stream. '.format(table)) + '(object) defines target schema for {} stream. 
'.format(table) + ) return target_schema @@ -202,13 +221,17 @@ def get_grantees(target_config, table): } """ grantees = [] - config_default_target_schema_select_permissions = target_config.get('default_target_schema_select_permissions', []) + config_default_target_schema_select_permissions = target_config.get( + 'default_target_schema_select_permissions', [] + ) config_schema_mapping = target_config.get('schema_mapping', {}) table_dict = tablename_to_dict(table) table_schema = table_dict['schema_name'] if config_schema_mapping and table_schema in config_schema_mapping: - grantees = config_schema_mapping[table_schema].get('target_schema_select_permissions', []) + grantees = config_schema_mapping[table_schema].get( + 'target_schema_select_permissions', [] + ) elif config_default_target_schema_select_permissions: grantees = config_default_target_schema_select_permissions @@ -248,9 +271,13 @@ def grant_privilege(schema, grantees, grant_method, to_group=False): def save_state_file(path, table, bookmark, dbname=None): table_dict = tablename_to_dict(table) if dbname: - stream_id = '{}-{}-{}'.format(dbname, table_dict.get('schema_name'), table_dict.get('table_name')) + stream_id = '{}-{}-{}'.format( + dbname, table_dict.get('schema_name'), table_dict.get('table_name') + ) elif table_dict['schema_name']: - stream_id = '{}-{}'.format(table_dict['schema_name'], table_dict.get('table_name')) + stream_id = '{}-{}'.format( + table_dict['schema_name'], table_dict.get('table_name') + ) else: stream_id = table_dict['table_name'] @@ -275,7 +302,6 @@ def save_state_file(path, table, bookmark, dbname=None): save_dict_to_json(path, state) - def parse_args(required_config_keys: Dict) -> argparse.Namespace: """Parse standard command-line args. @@ -299,8 +325,14 @@ def parse_args(required_config_keys: Dict) -> argparse.Namespace: parser.add_argument('--target', help='Target Config file', required=True) parser.add_argument('--transform', help='Transformations Config file') parser.add_argument('--tables', help='Sync only specific tables') - parser.add_argument('--temp_dir', help='Temporary directory required for CSV exports') - parser.add_argument('--drop_pg_slot', help='Drop pg replication slot before starting resync', action='store_true') + parser.add_argument( + '--temp_dir', help='Temporary directory required for CSV exports' + ) + parser.add_argument( + '--drop_pg_slot', + help='Drop pg replication slot before starting resync', + action='store_true', + ) args: argparse.Namespace = parser.parse_args() @@ -348,15 +380,20 @@ def retry_pattern(): import backoff from botocore.exceptions import ClientError - return backoff.on_exception(backoff.expo, - ClientError, - max_tries=5, - on_backoff=log_backoff_attempt, - factor=10) + return backoff.on_exception( + backoff.expo, + ClientError, + max_tries=5, + on_backoff=log_backoff_attempt, + factor=10, + ) def log_backoff_attempt(details): - LOGGER.error('Error detected communicating with Amazon, triggering backoff: %s try', details.get('tries')) + LOGGER.error( + 'Error detected communicating with Amazon, triggering backoff: %s try', + details.get('tries'), + ) def get_pool_size(tap: Dict) -> int: @@ -377,11 +414,9 @@ def get_pool_size(tap: Dict) -> int: return min(fastsync_parallelism, cpu_cores) -def gen_export_filename(tap_id: str, - table: str, - suffix: str = None, - postfix: str = None, - ext: str = None) -> str: +def gen_export_filename( + tap_id: str, table: str, suffix: str = None, postfix: str = None, ext: str = None +) -> str: """ Generates a unique filename 
used for exported fastsync data that avoids file name collision @@ -407,8 +442,6 @@ def gen_export_filename(tap_id: str, if not ext: ext = 'csv.gz' - return 'pipelinewise_{}_{}_{}_fastsync_{}.{}'.format(tap_id, - table, - suffix, - postfix, - ext) + return 'pipelinewise_{}_{}_{}_fastsync_{}.{}'.format( + tap_id, table, suffix, postfix, ext + ) diff --git a/pipelinewise/fastsync/mongodb_to_bigquery.py b/pipelinewise/fastsync/mongodb_to_bigquery.py index 9fc88b343..4e7b724ad 100644 --- a/pipelinewise/fastsync/mongodb_to_bigquery.py +++ b/pipelinewise/fastsync/mongodb_to_bigquery.py @@ -25,9 +25,7 @@ 'auth_database', 'dbname', ], - 'target': [ - 'project_id' - ] + 'target': ['project_id'], } LOCK = multiprocessing.Lock() @@ -53,7 +51,9 @@ def sync_table(table: str, args: Namespace) -> Union[bool, str]: try: dbname = args.tap.get('dbname') - filename = 'pipelinewise_fastsync_{}_{}_{}.csv'.format(dbname, table, time.strftime('%Y%m%d-%H%M%S')) + filename = 'pipelinewise_fastsync_{}_{}_{}.csv'.format( + dbname, table, time.strftime('%Y%m%d-%H%M%S') + ) filepath = os.path.join(args.temp_dir, filename) target_schema = utils.get_target_schema(args.target, table) @@ -61,7 +61,9 @@ def sync_table(table: str, args: Namespace) -> Union[bool, str]: mongodb.open_connection() # Get bookmark - LSN position or Incremental Key value - bookmark = utils.get_bookmark_for_table(table, args.properties, mongodb, dbname=dbname) + bookmark = utils.get_bookmark_for_table( + table, args.properties, mongodb, dbname=dbname + ) # Exporting table data, get table definitions and close connection to avoid timeouts mongodb.copy_table(table, filepath, args.temp_dir) @@ -75,7 +77,14 @@ def sync_table(table: str, args: Namespace) -> Union[bool, str]: bigquery.create_table(target_schema, table, bigquery_columns, is_temporary=True) # Load into Bigquery table - bigquery.copy_to_table(filepath, target_schema, table, size_bytes, is_temporary=True, skip_csv_header=True) + bigquery.copy_to_table( + filepath, + target_schema, + table, + size_bytes, + is_temporary=True, + skip_csv_header=True, + ) os.remove(filepath) # Obfuscate columns @@ -113,7 +122,8 @@ def main_impl(): table_sync_excs = [] # Log start info - LOGGER.info(""" + LOGGER.info( + """ ------------------------------------------------------- STARTING SYNC ------------------------------------------------------- @@ -121,16 +131,25 @@ def main_impl(): Total tables selected to sync : %s Pool size : %s ------------------------------------------------------- - """, args.tables, len(args.tables), pool_size) + """, + args.tables, + len(args.tables), + pool_size, + ) # Start loading tables in parallel in spawning processes with multiprocessing.Pool(pool_size) as proc: table_sync_excs = list( - filter(lambda x: not isinstance(x, bool), proc.map(partial(sync_table, args=args), args.tables))) + filter( + lambda x: not isinstance(x, bool), + proc.map(partial(sync_table, args=args), args.tables), + ) + ) # Log summary end_time = datetime.now() - LOGGER.info(""" + LOGGER.info( + """ ------------------------------------------------------- SYNC FINISHED - SUMMARY ------------------------------------------------------- @@ -141,8 +160,13 @@ def main_impl(): Pool size : %s Runtime : %s ------------------------------------------------------- - """, len(args.tables), len(args.tables) - len(table_sync_excs), str(table_sync_excs), - pool_size, end_time - start_time) + """, + len(args.tables), + len(args.tables) - len(table_sync_excs), + str(table_sync_excs), + pool_size, + end_time - start_time, + ) 
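# --- Illustrative sketch (editor's note, not part of the upstream diff) ---
# The pattern repeated across the fastsync entry points: map sync_table over
# the selected tables in a process pool via functools.partial, then keep only
# the non-boolean results (error strings) to decide the exit status.
# Names below (fake_sync_table, the 'demo' args value) are hypothetical.
import multiprocessing
from functools import partial


def fake_sync_table(table, args):
    # Return True on success, or an error string on failure.
    return True if table != 'bad_table' else f'{table}: failed ({args})'


if __name__ == '__main__':
    tables = ['good_table', 'bad_table']
    with multiprocessing.Pool(2) as proc:
        errors = list(
            filter(
                lambda x: not isinstance(x, bool),
                proc.map(partial(fake_sync_table, args='demo'), tables),
            )
        )
    # errors == ['bad_table: failed (demo)']; a non-empty list is what makes
    # main_impl() call sys.exit(1) after logging the summary.
# ---------------------------------------------------------------------------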
if len(table_sync_excs) > 0: sys.exit(1) diff --git a/pipelinewise/fastsync/mongodb_to_postgres.py b/pipelinewise/fastsync/mongodb_to_postgres.py index 80003dde3..fe4643afb 100644 --- a/pipelinewise/fastsync/mongodb_to_postgres.py +++ b/pipelinewise/fastsync/mongodb_to_postgres.py @@ -24,12 +24,7 @@ 'auth_database', 'dbname', ], - 'target': [ - 'host', - 'port', - 'user', - 'password' - ] + 'target': ['host', 'port', 'user', 'password'], } LOCK = multiprocessing.Lock() @@ -54,7 +49,9 @@ def sync_table(table: str, args: Namespace) -> Union[bool, str]: postgres = FastSyncTargetPostgres(args.target, args.transform) try: - filename = utils.gen_export_filename(tap_id=args.target.get('tap_id'), table=table) + filename = utils.gen_export_filename( + tap_id=args.target.get('tap_id'), table=table + ) filepath = os.path.join(args.temp_dir, filename) target_schema = utils.get_target_schema(args.target, table) @@ -62,7 +59,9 @@ def sync_table(table: str, args: Namespace) -> Union[bool, str]: mongodb.open_connection() # Get bookmark - token of the most recent ChangeStream for logbased - bookmark = utils.get_bookmark_for_table(table, args.properties, mongodb, dbname=args.tap.get('dbname')) + bookmark = utils.get_bookmark_for_table( + table, args.properties, mongodb, dbname=args.tap.get('dbname') + ) # Exporting table data, get table definitions and close connection to avoid timeouts mongodb.copy_table(table, filepath, args.temp_dir) @@ -74,10 +73,19 @@ def sync_table(table: str, args: Namespace) -> Union[bool, str]: # Creating temp table in Postgres postgres.drop_table(target_schema, table, is_temporary=True) - postgres.create_table(target_schema, table, postgres_columns, primary_key, is_temporary=True) + postgres.create_table( + target_schema, table, postgres_columns, primary_key, is_temporary=True + ) # Load into Postgres table - postgres.copy_to_table(filepath, target_schema, table, size_bytes, is_temporary=True, skip_csv_header=True) + postgres.copy_to_table( + filepath, + target_schema, + table, + size_bytes, + is_temporary=True, + skip_csv_header=True, + ) os.remove(filepath) # Obfuscate columns @@ -114,7 +122,8 @@ def main_impl(): table_sync_excs = [] # Log start info - LOGGER.info(""" + LOGGER.info( + """ ------------------------------------------------------- STARTING SYNC ------------------------------------------------------- @@ -122,7 +131,11 @@ def main_impl(): Total tables selected to sync : %s Pool size : %s ------------------------------------------------------- - """, args.tables, len(args.tables), pool_size) + """, + args.tables, + len(args.tables), + pool_size, + ) # Create target schemas sequentially, Postgres doesn't like it running in parallel postgres_target = FastSyncTargetPostgres(args.target, args.transform) @@ -131,11 +144,16 @@ def main_impl(): # Start loading tables in parallel in spawning processes with multiprocessing.Pool(pool_size) as proc: table_sync_excs = list( - filter(lambda x: not isinstance(x, bool), proc.map(partial(sync_table, args=args), args.tables))) + filter( + lambda x: not isinstance(x, bool), + proc.map(partial(sync_table, args=args), args.tables), + ) + ) # Log summary end_time = datetime.now() - LOGGER.info(""" + LOGGER.info( + """ ------------------------------------------------------- SYNC FINISHED - SUMMARY ------------------------------------------------------- @@ -146,8 +164,13 @@ def main_impl(): Pool size : %s Runtime : %s ------------------------------------------------------- - """, len(args.tables), len(args.tables) - len(table_sync_excs), 
str(table_sync_excs), - pool_size, end_time - start_time) + """, + len(args.tables), + len(args.tables) - len(table_sync_excs), + str(table_sync_excs), + pool_size, + end_time - start_time, + ) if len(table_sync_excs) > 0: sys.exit(1) diff --git a/pipelinewise/fastsync/mongodb_to_snowflake.py b/pipelinewise/fastsync/mongodb_to_snowflake.py index 2ffe06cc0..96fa606c1 100644 --- a/pipelinewise/fastsync/mongodb_to_snowflake.py +++ b/pipelinewise/fastsync/mongodb_to_snowflake.py @@ -32,8 +32,8 @@ 'warehouse', 's3_bucket', 'stage', - 'file_format' - ] + 'file_format', + ], } LOCK = multiprocessing.Lock() @@ -69,7 +69,9 @@ def sync_table(table: str, args: Namespace) -> Union[bool, str]: mongodb.open_connection() # Get bookmark - LSN position or Incremental Key value - bookmark = utils.get_bookmark_for_table(table, args.properties, mongodb, dbname=dbname) + bookmark = utils.get_bookmark_for_table( + table, args.properties, mongodb, dbname=dbname + ) # Exporting table data, get table definitions and close connection to avoid timeouts mongodb.copy_table(table, filepath, args.temp_dir) @@ -85,10 +87,19 @@ def sync_table(table: str, args: Namespace) -> Union[bool, str]: # Creating temp table in Snowflake snowflake.create_schema(target_schema) - snowflake.create_table(target_schema, table, snowflake_columns, primary_key, is_temporary=True) + snowflake.create_table( + target_schema, table, snowflake_columns, primary_key, is_temporary=True + ) # Load into Snowflake table - snowflake.copy_to_table(s3_key, target_schema, table, size_bytes, is_temporary=True, skip_csv_header=True) + snowflake.copy_to_table( + s3_key, + target_schema, + table, + size_bytes, + is_temporary=True, + skip_csv_header=True, + ) if archive_load_files: # Copy load file to archive @@ -132,7 +143,8 @@ def main_impl(): table_sync_excs = [] # Log start info - LOGGER.info(""" + LOGGER.info( + """ ------------------------------------------------------- STARTING SYNC ------------------------------------------------------- @@ -140,16 +152,25 @@ def main_impl(): Total tables selected to sync : %s Pool size : %s ------------------------------------------------------- - """, args.tables, len(args.tables), pool_size) + """, + args.tables, + len(args.tables), + pool_size, + ) # Start loading tables in parallel in spawning processes with multiprocessing.Pool(pool_size) as proc: table_sync_excs = list( - filter(lambda x: not isinstance(x, bool), proc.map(partial(sync_table, args=args), args.tables))) + filter( + lambda x: not isinstance(x, bool), + proc.map(partial(sync_table, args=args), args.tables), + ) + ) # Log summary end_time = datetime.now() - LOGGER.info(""" + LOGGER.info( + """ ------------------------------------------------------- SYNC FINISHED - SUMMARY ------------------------------------------------------- @@ -160,8 +181,13 @@ def main_impl(): Pool size : %s Runtime : %s ------------------------------------------------------- - """, len(args.tables), len(args.tables) - len(table_sync_excs), str(table_sync_excs), - pool_size, end_time - start_time) + """, + len(args.tables), + len(args.tables) - len(table_sync_excs), + str(table_sync_excs), + pool_size, + end_time - start_time, + ) if len(table_sync_excs) > 0: sys.exit(1) diff --git a/pipelinewise/fastsync/mysql_to_bigquery.py b/pipelinewise/fastsync/mysql_to_bigquery.py index 4fa2e92ad..b8529f39d 100644 --- a/pipelinewise/fastsync/mysql_to_bigquery.py +++ b/pipelinewise/fastsync/mysql_to_bigquery.py @@ -14,20 +14,15 @@ from .commons.tap_mysql import FastSyncTapMySql from 
.commons.target_bigquery import FastSyncTargetBigquery -MAX_NUM='99999999999999999999999999999.999999999' +MAX_NUM = '99999999999999999999999999999.999999999' LOGGER = logging.getLogger(__name__) REQUIRED_CONFIG_KEYS = { - 'tap': [ - 'host', - 'port', - 'user', - 'password' - ], + 'tap': ['host', 'port', 'user', 'password'], 'target': [ 'project_id', - ] + ], } LOCK = multiprocessing.Lock() @@ -36,35 +31,35 @@ def tap_type_to_target_type(mysql_type, mysql_column_type): """Data type mapping from MySQL to Bigquery""" return { - 'char':'STRING', - 'varchar':'STRING', - 'binary':'STRING', - 'varbinary':'STRING', - 'blob':'STRING', - 'tinyblob':'STRING', - 'mediumblob':'STRING', - 'longblob':'STRING', - 'geometry':'STRING', - 'text':'STRING', - 'tinytext':'STRING', - 'mediumtext':'STRING', - 'longtext':'STRING', - 'enum':'STRING', - 'int':'INT64', - 'tinyint':'BOOL' if mysql_column_type == 'tinyint(1)' else 'INT64', - 'smallint':'INT64', - 'mediumint':'INT64', - 'bigint':'INT64', - 'bit':'BOOL', - 'decimal':'NUMERIC', - 'double':'NUMERIC', - 'float':'NUMERIC', - 'bool':'BOOL', - 'boolean':'BOOL', - 'date':'TIMESTAMP', - 'datetime':'TIMESTAMP', - 'timestamp':'TIMESTAMP', - 'time':'TIME' + 'char': 'STRING', + 'varchar': 'STRING', + 'binary': 'STRING', + 'varbinary': 'STRING', + 'blob': 'STRING', + 'tinyblob': 'STRING', + 'mediumblob': 'STRING', + 'longblob': 'STRING', + 'geometry': 'STRING', + 'text': 'STRING', + 'tinytext': 'STRING', + 'mediumtext': 'STRING', + 'longtext': 'STRING', + 'enum': 'STRING', + 'int': 'INT64', + 'tinyint': 'BOOL' if mysql_column_type == 'tinyint(1)' else 'INT64', + 'smallint': 'INT64', + 'mediumint': 'INT64', + 'bigint': 'INT64', + 'bit': 'BOOL', + 'decimal': 'NUMERIC', + 'double': 'NUMERIC', + 'float': 'NUMERIC', + 'bool': 'BOOL', + 'boolean': 'BOOL', + 'date': 'TIMESTAMP', + 'datetime': 'TIMESTAMP', + 'timestamp': 'TIMESTAMP', + 'time': 'TIME', }.get(mysql_type, 'STRING') @@ -75,7 +70,9 @@ def sync_table(table: str, args: Namespace) -> Union[bool, str]: bigquery = FastSyncTargetBigquery(args.target, args.transform) try: - filename = 'pipelinewise_fastsync_{}_{}.csv'.format(table, time.strftime('%Y%m%d-%H%M%S')) + filename = 'pipelinewise_fastsync_{}_{}.csv'.format( + table, time.strftime('%Y%m%d-%H%M%S') + ) filepath = os.path.join(args.temp_dir, filename) target_schema = utils.get_target_schema(args.target, table) @@ -86,11 +83,9 @@ def sync_table(table: str, args: Namespace) -> Union[bool, str]: bookmark = utils.get_bookmark_for_table(table, args.properties, mysql) # Exporting table data, get table definitions and close connection to avoid timeouts - mysql.copy_table(table, - filepath, - compress=False, - max_num=MAX_NUM, - date_type='datetime') + mysql.copy_table( + table, filepath, compress=False, max_num=MAX_NUM, date_type='datetime' + ) file_parts = glob.glob(f'{filepath}*') size_bytes = sum([os.path.getsize(file_part) for file_part in file_parts]) bigquery_types = mysql.map_column_types_to_target(table) @@ -110,7 +105,8 @@ def sync_table(table: str, args: Namespace) -> Union[bool, str]: table, size_bytes, is_temporary=True, - write_truncate=write_truncate) + write_truncate=write_truncate, + ) os.remove(file_part) # Obfuscate columns @@ -148,7 +144,8 @@ def main_impl(): table_sync_excs = [] # Log start info - LOGGER.info(""" + LOGGER.info( + """ ------------------------------------------------------- STARTING SYNC ------------------------------------------------------- @@ -156,16 +153,25 @@ def main_impl(): Total tables selected to sync : %s Pool size : %s 
------------------------------------------------------- - """, args.tables, len(args.tables), pool_size) + """, + args.tables, + len(args.tables), + pool_size, + ) # Start loading tables in parallel in spawning processes with multiprocessing.Pool(pool_size) as proc: table_sync_excs = list( - filter(lambda x: not isinstance(x, bool), proc.map(partial(sync_table, args=args), args.tables))) + filter( + lambda x: not isinstance(x, bool), + proc.map(partial(sync_table, args=args), args.tables), + ) + ) # Log summary end_time = datetime.now() - LOGGER.info(""" + LOGGER.info( + """ ------------------------------------------------------- SYNC FINISHED - SUMMARY ------------------------------------------------------- @@ -176,8 +182,13 @@ def main_impl(): Pool size : %s Runtime : %s ------------------------------------------------------- - """, len(args.tables), len(args.tables) - len(table_sync_excs), str(table_sync_excs), - pool_size, end_time - start_time) + """, + len(args.tables), + len(args.tables) - len(table_sync_excs), + str(table_sync_excs), + pool_size, + end_time - start_time, + ) if len(table_sync_excs) > 0: sys.exit(1) diff --git a/pipelinewise/fastsync/mysql_to_postgres.py b/pipelinewise/fastsync/mysql_to_postgres.py index 0821c98a6..20d8d2ecb 100644 --- a/pipelinewise/fastsync/mysql_to_postgres.py +++ b/pipelinewise/fastsync/mysql_to_postgres.py @@ -15,18 +15,8 @@ LOGGER = Logger().get_logger(__name__) REQUIRED_CONFIG_KEYS = { - 'tap': [ - 'host', - 'port', - 'user', - 'password' - ], - 'target': [ - 'host', - 'port', - 'user', - 'password' - ] + 'tap': ['host', 'port', 'user', 'password'], + 'target': ['host', 'port', 'user', 'password'], } LOCK = multiprocessing.Lock() @@ -57,7 +47,9 @@ def tap_type_to_target_type(mysql_type, mysql_column_type): 'longtext': 'CHARACTER VARYING', 'enum': 'CHARACTER VARYING', 'int': 'INTEGER NULL', - 'tinyint': 'BOOLEAN' if mysql_column_type and mysql_column_type.startswith('tinyint(1)') else 'SMALLINT NULL', + 'tinyint': 'BOOLEAN' + if mysql_column_type and mysql_column_type.startswith('tinyint(1)') + else 'SMALLINT NULL', 'smallint': 'SMALLINT NULL', 'mediumint': 'INTEGER NULL', 'bigint': 'BIGINT NULL', @@ -84,7 +76,9 @@ def sync_table(table: str, args: Namespace) -> Union[bool, str]: postgres = FastSyncTargetPostgres(args.target, args.transform) try: - filename = utils.gen_export_filename(tap_id=args.target.get('tap_id'), table=table) + filename = utils.gen_export_filename( + tap_id=args.target.get('tap_id'), table=table + ) filepath = os.path.join(args.temp_dir, filename) target_schema = utils.get_target_schema(args.target, table) @@ -104,10 +98,14 @@ def sync_table(table: str, args: Namespace) -> Union[bool, str]: # Creating temp table in Postgres postgres.drop_table(target_schema, table, is_temporary=True) - postgres.create_table(target_schema, table, postgres_columns, primary_key, is_temporary=True) + postgres.create_table( + target_schema, table, postgres_columns, primary_key, is_temporary=True + ) # Load into Postgres table - postgres.copy_to_table(filepath, target_schema, table, size_bytes, is_temporary=True) + postgres.copy_to_table( + filepath, target_schema, table, size_bytes, is_temporary=True + ) os.remove(filepath) # Obfuscate columns @@ -144,7 +142,8 @@ def main_impl(): table_sync_excs = [] # Log start info - LOGGER.info(""" + LOGGER.info( + """ ------------------------------------------------------- STARTING SYNC ------------------------------------------------------- @@ -152,7 +151,11 @@ def main_impl(): Total tables selected to 
sync : %s Pool size : %s ------------------------------------------------------- - """, args.tables, len(args.tables), pool_size) + """, + args.tables, + len(args.tables), + pool_size, + ) # Create target schemas sequentially, Postgres doesn't like it running in parallel postgres_target = FastSyncTargetPostgres(args.target, args.transform) @@ -161,11 +164,16 @@ def main_impl(): # Start loading tables in parallel in spawning processes with multiprocessing.Pool(pool_size) as proc: table_sync_excs = list( - filter(lambda x: not isinstance(x, bool), proc.map(partial(sync_table, args=args), args.tables))) + filter( + lambda x: not isinstance(x, bool), + proc.map(partial(sync_table, args=args), args.tables), + ) + ) # Log summary end_time = datetime.now() - LOGGER.info(""" + LOGGER.info( + """ ------------------------------------------------------- SYNC FINISHED - SUMMARY ------------------------------------------------------- @@ -176,8 +184,13 @@ def main_impl(): Pool size : %s Runtime : %s ------------------------------------------------------- - """, len(args.tables), len(args.tables) - len(table_sync_excs), str(table_sync_excs), - pool_size, end_time - start_time) + """, + len(args.tables), + len(args.tables) - len(table_sync_excs), + str(table_sync_excs), + pool_size, + end_time - start_time, + ) if len(table_sync_excs) > 0: sys.exit(1) diff --git a/pipelinewise/fastsync/mysql_to_redshift.py b/pipelinewise/fastsync/mysql_to_redshift.py index 8bb948f32..94ac6afbd 100644 --- a/pipelinewise/fastsync/mysql_to_redshift.py +++ b/pipelinewise/fastsync/mysql_to_redshift.py @@ -15,20 +15,8 @@ LOGGER = Logger().get_logger(__name__) REQUIRED_CONFIG_KEYS = { - 'tap': [ - 'host', - 'port', - 'user', - 'password' - ], - 'target': [ - 'host', - 'port', - 'user', - 'password', - 'dbname', - 's3_bucket' - ] + 'tap': ['host', 'port', 'user', 'password'], + 'target': ['host', 'port', 'user', 'password', 'dbname', 's3_bucket'], } DEFAULT_VARCHAR_LENGTH = 10000 @@ -63,7 +51,9 @@ def tap_type_to_target_type(mysql_type, mysql_column_type): 'longtext': 'CHARACTER VARYING({})'.format(LONG_VARCHAR_LENGTH), 'enum': 'CHARACTER VARYING({})'.format(DEFAULT_VARCHAR_LENGTH), 'int': 'NUMERIC NULL', - 'tinyint': 'BOOLEAN' if mysql_column_type and mysql_column_type.startswith('tinyint(1)') else 'NUMERIC NULL', + 'tinyint': 'BOOLEAN' + if mysql_column_type and mysql_column_type.startswith('tinyint(1)') + else 'NUMERIC NULL', 'smallint': 'NUMERIC NULL', 'mediumint': 'NUMERIC NULL', 'bigint': 'NUMERIC NULL', @@ -76,7 +66,7 @@ def tap_type_to_target_type(mysql_type, mysql_column_type): 'date': 'TIMESTAMP WITHOUT TIME ZONE', 'datetime': 'TIMESTAMP WITHOUT TIME ZONE', 'timestamp': 'TIMESTAMP WITHOUT TIME ZONE', - 'json': 'CHARACTER VARYING({})'.format(LONG_VARCHAR_LENGTH) + 'json': 'CHARACTER VARYING({})'.format(LONG_VARCHAR_LENGTH), }.get( mysql_type, 'CHARACTER VARYING({})'.format(DEFAULT_VARCHAR_LENGTH), @@ -89,7 +79,9 @@ def sync_table(table: str, args: Namespace) -> Union[bool, str]: redshift = FastSyncTargetRedshift(args.target, args.transform) try: - filename = utils.gen_export_filename(tap_id=args.target.get('tap_id'), table=table) + filename = utils.gen_export_filename( + tap_id=args.target.get('tap_id'), table=table + ) filepath = os.path.join(args.temp_dir, filename) target_schema = utils.get_target_schema(args.target, table) @@ -113,10 +105,14 @@ def sync_table(table: str, args: Namespace) -> Union[bool, str]: # Creating temp table in Redshift redshift.drop_table(target_schema, table, is_temporary=True) - 
redshift.create_table(target_schema, table, redshift_columns, primary_key, is_temporary=True) + redshift.create_table( + target_schema, table, redshift_columns, primary_key, is_temporary=True + ) # Load into Redshift table - redshift.copy_to_table(s3_key, target_schema, table, size_bytes, is_temporary=True) + redshift.copy_to_table( + s3_key, target_schema, table, size_bytes, is_temporary=True + ) # Obfuscate columns redshift.obfuscate_columns(target_schema, table) @@ -152,7 +148,8 @@ def main_impl(): table_sync_excs = [] # Log start info - LOGGER.info(""" + LOGGER.info( + """ ------------------------------------------------------- STARTING SYNC ------------------------------------------------------- @@ -160,7 +157,11 @@ def main_impl(): Total tables selected to sync : %s Pool size : %s ------------------------------------------------------- - """, args.tables, len(args.tables), pool_size) + """, + args.tables, + len(args.tables), + pool_size, + ) # Create target schemas sequentially, Redshift doesn't like it running in parallel redshift = FastSyncTargetRedshift(args.target, args.transform) @@ -169,11 +170,16 @@ def main_impl(): # Start loading tables in parallel in spawning processes with multiprocessing.Pool(pool_size) as proc: table_sync_excs = list( - filter(lambda x: not isinstance(x, bool), proc.map(partial(sync_table, args=args), args.tables))) + filter( + lambda x: not isinstance(x, bool), + proc.map(partial(sync_table, args=args), args.tables), + ) + ) # Log summary end_time = datetime.now() - LOGGER.info(""" + LOGGER.info( + """ ------------------------------------------------------- SYNC FINISHED - SUMMARY ------------------------------------------------------- @@ -184,8 +190,13 @@ def main_impl(): Pool size : %s Runtime : %s ------------------------------------------------------- - """, len(args.tables), len(args.tables) - len(table_sync_excs), str(table_sync_excs), - pool_size, end_time - start_time) + """, + len(args.tables), + len(args.tables) - len(table_sync_excs), + str(table_sync_excs), + pool_size, + end_time - start_time, + ) if len(table_sync_excs) > 0: sys.exit(1) diff --git a/pipelinewise/fastsync/mysql_to_snowflake.py b/pipelinewise/fastsync/mysql_to_snowflake.py index 2b37ffd4e..2eb722e17 100644 --- a/pipelinewise/fastsync/mysql_to_snowflake.py +++ b/pipelinewise/fastsync/mysql_to_snowflake.py @@ -17,12 +17,7 @@ LOGGER = Logger().get_logger(__name__) REQUIRED_CONFIG_KEYS = { - 'tap': [ - 'host', - 'port', - 'user', - 'password' - ], + 'tap': ['host', 'port', 'user', 'password'], 'target': [ 'account', 'dbname', @@ -31,8 +26,8 @@ 'warehouse', 's3_bucket', 'stage', - 'file_format' - ] + 'file_format', + ], } LOCK = multiprocessing.Lock() @@ -63,7 +58,9 @@ def tap_type_to_target_type(mysql_type, mysql_column_type): 'longtext': 'VARCHAR', 'enum': 'VARCHAR', 'int': 'NUMBER', - 'tinyint': 'BOOLEAN' if mysql_column_type and mysql_column_type.startswith('tinyint(1)') else 'NUMBER', + 'tinyint': 'BOOLEAN' + if mysql_column_type and mysql_column_type.startswith('tinyint(1)') + else 'NUMBER', 'smallint': 'NUMBER', 'mediumint': 'NUMBER', 'bigint': 'NUMBER', @@ -77,7 +74,7 @@ def tap_type_to_target_type(mysql_type, mysql_column_type): 'datetime': 'TIMESTAMP_NTZ', 'timestamp': 'TIMESTAMP_NTZ', 'time': 'TIME', - 'json': 'VARIANT' + 'json': 'VARIANT', }.get(mysql_type, 'VARCHAR') @@ -89,7 +86,6 @@ def sync_table(table: str, args: Namespace) -> Union[bool, str]: tap_id = args.target.get('tap_id') archive_load_files = args.target.get('archive_load_files', False) - try: filename = 
utils.gen_export_filename(tap_id=tap_id, table=table) filepath = os.path.join(args.temp_dir, filename) @@ -102,11 +98,13 @@ def sync_table(table: str, args: Namespace) -> Union[bool, str]: bookmark = utils.get_bookmark_for_table(table, args.properties, mysql) # Exporting table data, get table definitions and close connection to avoid timeouts - mysql.copy_table(table, - filepath, - split_large_files=args.target.get('split_large_files'), - split_file_chunk_size_mb=args.target.get('split_file_chunk_size_mb'), - split_file_max_chunks=args.target.get('split_file_max_chunks')) + mysql.copy_table( + table, + filepath, + split_large_files=args.target.get('split_large_files'), + split_file_chunk_size_mb=args.target.get('split_file_chunk_size_mb'), + split_file_max_chunks=args.target.get('split_file_max_chunks'), + ) file_parts = glob.glob(f'{filepath}*') size_bytes = sum([os.path.getsize(file_part) for file_part in file_parts]) snowflake_types = mysql.map_column_types_to_target(table) @@ -121,14 +119,22 @@ def sync_table(table: str, args: Namespace) -> Union[bool, str]: os.remove(file_part) # Create a pattern that match all file parts by removing multipart suffix - s3_key_pattern = re.sub(r'\.part\d*$', '', s3_keys[0]) if len(s3_keys) > 0 else 'NO_FILES_TO_LOAD' + s3_key_pattern = ( + re.sub(r'\.part\d*$', '', s3_keys[0]) + if len(s3_keys) > 0 + else 'NO_FILES_TO_LOAD' + ) # Creating temp table in Snowflake snowflake.create_schema(target_schema) - snowflake.create_table(target_schema, table, snowflake_columns, primary_key, is_temporary=True) + snowflake.create_table( + target_schema, table, snowflake_columns, primary_key, is_temporary=True + ) # Load into Snowflake table - snowflake.copy_to_table(s3_key_pattern, target_schema, table, size_bytes, is_temporary=True) + snowflake.copy_to_table( + s3_key_pattern, target_schema, table, size_bytes, is_temporary=True + ) for s3_key in s3_keys: if archive_load_files: @@ -173,7 +179,8 @@ def main_impl(): table_sync_excs = [] # Log start info - LOGGER.info(""" + LOGGER.info( + """ ------------------------------------------------------- STARTING SYNC ------------------------------------------------------- @@ -181,16 +188,25 @@ def main_impl(): Total tables selected to sync : %s Pool size : %s ------------------------------------------------------- - """, args.tables, len(args.tables), pool_size) + """, + args.tables, + len(args.tables), + pool_size, + ) # Start loading tables in parallel in spawning processes with multiprocessing.Pool(pool_size) as proc: table_sync_excs = list( - filter(lambda x: not isinstance(x, bool), proc.map(partial(sync_table, args=args), args.tables))) + filter( + lambda x: not isinstance(x, bool), + proc.map(partial(sync_table, args=args), args.tables), + ) + ) # Log summary end_time = datetime.now() - LOGGER.info(""" + LOGGER.info( + """ ------------------------------------------------------- SYNC FINISHED - SUMMARY ------------------------------------------------------- @@ -201,8 +217,13 @@ def main_impl(): Pool size : %s Runtime : %s ------------------------------------------------------- - """, len(args.tables), len(args.tables) - len(table_sync_excs), str(table_sync_excs), - pool_size, end_time - start_time) + """, + len(args.tables), + len(args.tables) - len(table_sync_excs), + str(table_sync_excs), + pool_size, + end_time - start_time, + ) if len(table_sync_excs) > 0: sys.exit(1) diff --git a/pipelinewise/fastsync/postgres_to_bigquery.py b/pipelinewise/fastsync/postgres_to_bigquery.py index 3e75ea5e5..8bc207886 100644 --- 
a/pipelinewise/fastsync/postgres_to_bigquery.py +++ b/pipelinewise/fastsync/postgres_to_bigquery.py @@ -14,58 +14,52 @@ from .commons.tap_postgres import FastSyncTapPostgres from .commons.target_bigquery import FastSyncTargetBigquery -MAX_NUM='99999999999999999999999999999.999999999' +MAX_NUM = '99999999999999999999999999999.999999999' LOGGER = logging.getLogger(__name__) REQUIRED_CONFIG_KEYS = { - 'tap': [ - 'host', - 'port', - 'user', - 'password' - ], - 'target': [ - 'project_id' - ] + 'tap': ['host', 'port', 'user', 'password'], + 'target': ['project_id'], } LOCK = multiprocessing.Lock() + def tap_type_to_target_type(pg_type): """Data type mapping from Postgres to Bigquery""" return { - 'char':'STRING', - 'character':'STRING', - 'varchar':'STRING', - 'character varying':'STRING', - 'text':'STRING', + 'char': 'STRING', + 'character': 'STRING', + 'varchar': 'STRING', + 'character varying': 'STRING', + 'text': 'STRING', 'bit': ['BOOL', 'NUMERIC'], - 'varbit':'NUMERIC', - 'bit varying':'NUMERIC', - 'smallint':'INT64', - 'int':'INT64', - 'integer':'INT64', - 'bigint':'INT64', - 'smallserial':'INT64', - 'serial':'INT64', - 'bigserial':'INT64', - 'numeric':'NUMERIC', - 'double precision':'NUMERIC', - 'real':'NUMERIC', - 'bool':'BOOL', - 'boolean':'BOOL', - 'date':'TIMESTAMP', - 'timestamp':'TIMESTAMP', - 'timestamp without time zone':'TIMESTAMP', - 'timestamp with time zone':'TIMESTAMP', - 'time':'TIME', - 'time without time zone':'TIME', - 'time with time zone':'TIME', + 'varbit': 'NUMERIC', + 'bit varying': 'NUMERIC', + 'smallint': 'INT64', + 'int': 'INT64', + 'integer': 'INT64', + 'bigint': 'INT64', + 'smallserial': 'INT64', + 'serial': 'INT64', + 'bigserial': 'INT64', + 'numeric': 'NUMERIC', + 'double precision': 'NUMERIC', + 'real': 'NUMERIC', + 'bool': 'BOOL', + 'boolean': 'BOOL', + 'date': 'TIMESTAMP', + 'timestamp': 'TIMESTAMP', + 'timestamp without time zone': 'TIMESTAMP', + 'timestamp with time zone': 'TIMESTAMP', + 'time': 'TIME', + 'time without time zone': 'TIME', + 'time with time zone': 'TIME', # This is all uppercase, because postgres stores it in this format in information_schema.columns.data_type - 'ARRAY':'STRING', - 'json':'STRING', - 'jsonb':'STRING' + 'ARRAY': 'STRING', + 'json': 'STRING', + 'jsonb': 'STRING', }.get(pg_type, 'STRING') @@ -77,7 +71,9 @@ def sync_table(table: str, args: Namespace) -> Union[bool, str]: try: dbname = args.tap.get('dbname') - filename = 'pipelinewise_fastsync_{}_{}_{}.csv'.format(dbname, table, time.strftime('%Y%m%d-%H%M%S')) + filename = 'pipelinewise_fastsync_{}_{}_{}.csv'.format( + dbname, table, time.strftime('%Y%m%d-%H%M%S') + ) filepath = os.path.join(args.temp_dir, filename) target_schema = utils.get_target_schema(args.target, table) @@ -85,14 +81,14 @@ def sync_table(table: str, args: Namespace) -> Union[bool, str]: postgres.open_connection() # Get bookmark - LSN position or Incremental Key value - bookmark = utils.get_bookmark_for_table(table, args.properties, postgres, dbname=dbname) + bookmark = utils.get_bookmark_for_table( + table, args.properties, postgres, dbname=dbname + ) # Exporting table data, get table definitions and close connection to avoid timeouts - postgres.copy_table(table, - filepath, - compress=False, - max_num=MAX_NUM, - date_type='timestamp') + postgres.copy_table( + table, filepath, compress=False, max_num=MAX_NUM, date_type='timestamp' + ) file_parts = glob.glob(f'{filepath}*') size_bytes = sum([os.path.getsize(file_part) for file_part in file_parts]) @@ -113,7 +109,8 @@ def sync_table(table: str, args: 
Namespace) -> Union[bool, str]: table, size_bytes, is_temporary=True, - write_truncate=write_truncate) + write_truncate=write_truncate, + ) os.remove(file_part) # Obfuscate columns @@ -151,7 +148,8 @@ def main_impl(): table_sync_excs = [] # Log start info - LOGGER.info(""" + LOGGER.info( + """ ------------------------------------------------------- STARTING SYNC ------------------------------------------------------- @@ -159,7 +157,11 @@ def main_impl(): Total tables selected to sync : %s Pool size : %s ------------------------------------------------------- - """, args.tables, len(args.tables), pool_size) + """, + args.tables, + len(args.tables), + pool_size, + ) # if internal arg drop_pg_slot is set to True, then we drop the slot before starting resync if args.drop_pg_slot: @@ -168,11 +170,16 @@ def main_impl(): # Start loading tables in parallel in spawning processes with multiprocessing.Pool(pool_size) as proc: table_sync_excs = list( - filter(lambda x: not isinstance(x, bool), proc.map(partial(sync_table, args=args), args.tables))) + filter( + lambda x: not isinstance(x, bool), + proc.map(partial(sync_table, args=args), args.tables), + ) + ) # Log summary end_time = datetime.now() - LOGGER.info(""" + LOGGER.info( + """ ------------------------------------------------------- SYNC FINISHED - SUMMARY ------------------------------------------------------- @@ -183,8 +190,13 @@ def main_impl(): Pool size : %s Runtime : %s ------------------------------------------------------- - """, len(args.tables), len(args.tables) - len(table_sync_excs), str(table_sync_excs), - pool_size, end_time - start_time) + """, + len(args.tables), + len(args.tables) - len(table_sync_excs), + str(table_sync_excs), + pool_size, + end_time - start_time, + ) if len(table_sync_excs) > 0: sys.exit(1) diff --git a/pipelinewise/fastsync/postgres_to_postgres.py b/pipelinewise/fastsync/postgres_to_postgres.py index fd4e1e2a3..728d0c140 100644 --- a/pipelinewise/fastsync/postgres_to_postgres.py +++ b/pipelinewise/fastsync/postgres_to_postgres.py @@ -22,14 +22,9 @@ 'user', 'password', 'dbname', - 'tap_id' # tap_id is required to generate unique replication slot names + 'tap_id', # tap_id is required to generate unique replication slot names ], - 'target': [ - 'host', - 'port', - 'user', - 'password' - ] + 'target': ['host', 'port', 'user', 'password'], } LOCK = multiprocessing.Lock() @@ -68,7 +63,7 @@ def tap_type_to_target_type(pg_type): # ARRAY is uppercase, because postgres stores it in this format in information_schema.columns.data_type 'ARRAY': 'JSONB', 'json': 'JSONB', - 'jsonb': 'JSONB' + 'jsonb': 'JSONB', }.get(pg_type, 'CHARACTER VARYING') @@ -79,7 +74,9 @@ def sync_table(table: str, args: Namespace) -> Union[bool, str]: try: dbname = args.tap.get('dbname') - filename = utils.gen_export_filename(tap_id=args.target.get('tap_id'), table=table) + filename = utils.gen_export_filename( + tap_id=args.target.get('tap_id'), table=table + ) filepath = os.path.join(args.temp_dir, filename) target_schema = utils.get_target_schema(args.target, table) @@ -87,7 +84,9 @@ def sync_table(table: str, args: Namespace) -> Union[bool, str]: postgres.open_connection() # Get bookmark - LSN position or Incremental Key value - bookmark = utils.get_bookmark_for_table(table, args.properties, postgres, dbname=dbname) + bookmark = utils.get_bookmark_for_table( + table, args.properties, postgres, dbname=dbname + ) # Exporting table data, get table definitions and close connection to avoid timeouts postgres.copy_table(table, filepath) @@ -99,10 
+98,18 @@ def sync_table(table: str, args: Namespace) -> Union[bool, str]: # Creating temp table in Postgres postgres_target.drop_table(target_schema, table, is_temporary=True) - postgres_target.create_table(target_schema, table, postgres_target_columns, primary_key, is_temporary=True) + postgres_target.create_table( + target_schema, + table, + postgres_target_columns, + primary_key, + is_temporary=True, + ) # Load into Postgres table - postgres_target.copy_to_table(filepath, target_schema, table, size_bytes, is_temporary=True) + postgres_target.copy_to_table( + filepath, target_schema, table, size_bytes, is_temporary=True + ) os.remove(filepath) # Obfuscate columns @@ -121,8 +128,12 @@ def sync_table(table: str, args: Namespace) -> Union[bool, str]: # Table loaded, grant select on all tables in target schema grantees = utils.get_grantees(args.target, table) - utils.grant_privilege(target_schema, grantees, postgres_target.grant_usage_on_schema) - utils.grant_privilege(target_schema, grantees, postgres_target.grant_select_on_schema) + utils.grant_privilege( + target_schema, grantees, postgres_target.grant_usage_on_schema + ) + utils.grant_privilege( + target_schema, grantees, postgres_target.grant_select_on_schema + ) return True @@ -139,7 +150,8 @@ def main_impl(): table_sync_excs = [] # Log start info - LOGGER.info(""" + LOGGER.info( + """ ------------------------------------------------------- STARTING SYNC ------------------------------------------------------- @@ -147,7 +159,11 @@ def main_impl(): Total tables selected to sync : %s Pool size : %s ------------------------------------------------------- - """, args.tables, len(args.tables), pool_size) + """, + args.tables, + len(args.tables), + pool_size, + ) # if internal arg drop_pg_slot is set to True, then we drop the slot before starting resync if args.drop_pg_slot: @@ -160,11 +176,16 @@ def main_impl(): # Start loading tables in parallel in spawning processes with multiprocessing.Pool(pool_size) as proc: table_sync_excs = list( - filter(lambda x: not isinstance(x, bool), proc.map(partial(sync_table, args=args), args.tables))) + filter( + lambda x: not isinstance(x, bool), + proc.map(partial(sync_table, args=args), args.tables), + ) + ) # Log summary end_time = datetime.now() - LOGGER.info(""" + LOGGER.info( + """ ------------------------------------------------------- SYNC FINISHED - SUMMARY ------------------------------------------------------- @@ -175,8 +196,13 @@ def main_impl(): Pool size : %s Runtime : %s ------------------------------------------------------- - """, len(args.tables), len(args.tables) - len(table_sync_excs), str(table_sync_excs), - pool_size, end_time - start_time) + """, + len(args.tables), + len(args.tables) - len(table_sync_excs), + str(table_sync_excs), + pool_size, + end_time - start_time, + ) if len(table_sync_excs) > 0: sys.exit(1) diff --git a/pipelinewise/fastsync/postgres_to_redshift.py b/pipelinewise/fastsync/postgres_to_redshift.py index f47a1dcba..ab45de889 100644 --- a/pipelinewise/fastsync/postgres_to_redshift.py +++ b/pipelinewise/fastsync/postgres_to_redshift.py @@ -16,20 +16,8 @@ LOGGER = Logger().get_logger(__name__) REQUIRED_CONFIG_KEYS = { - 'tap': [ - 'host', - 'port', - 'user', - 'password' - ], - 'target': [ - 'host', - 'port', - 'user', - 'password', - 'dbname', - 's3_bucket' - ] + 'tap': ['host', 'port', 'user', 'password'], + 'target': ['host', 'port', 'user', 'password', 'dbname', 's3_bucket'], } DEFAULT_VARCHAR_LENGTH = 10000 @@ -84,7 +72,9 @@ def sync_table(table: str, args: 
Namespace) -> Union[bool, str]: try: dbname = args.tap.get('dbname') - filename = utils.gen_export_filename(tap_id=args.target.get('tap_id'), table=table) + filename = utils.gen_export_filename( + tap_id=args.target.get('tap_id'), table=table + ) filepath = os.path.join(args.temp_dir, filename) target_schema = utils.get_target_schema(args.target, table) @@ -92,7 +82,9 @@ def sync_table(table: str, args: Namespace) -> Union[bool, str]: postgres.open_connection() # Get bookmark - LSN position or Incremental Key value - bookmark = utils.get_bookmark_for_table(table, args.properties, postgres, dbname=dbname) + bookmark = utils.get_bookmark_for_table( + table, args.properties, postgres, dbname=dbname + ) # Exporting table data, get table definitions and close connection to avoid timeouts postgres.copy_table(table, filepath) @@ -108,10 +100,14 @@ def sync_table(table: str, args: Namespace) -> Union[bool, str]: # Creating temp table in Redshift redshift.drop_table(target_schema, table, is_temporary=True) - redshift.create_table(target_schema, table, redshift_columns, primary_key, is_temporary=True) + redshift.create_table( + target_schema, table, redshift_columns, primary_key, is_temporary=True + ) # Load into Redshift table - redshift.copy_to_table(s3_key, target_schema, table, size_bytes, is_temporary=True) + redshift.copy_to_table( + s3_key, target_schema, table, size_bytes, is_temporary=True + ) # Obfuscate columns redshift.obfuscate_columns(target_schema, table) @@ -147,7 +143,8 @@ def main_impl(): table_sync_excs = [] # Log start info - LOGGER.info(""" + LOGGER.info( + """ ------------------------------------------------------- STARTING SYNC ------------------------------------------------------- @@ -155,7 +152,11 @@ def main_impl(): Total tables selected to sync : %s Pool size : %s ------------------------------------------------------- - """, args.tables, len(args.tables), pool_size) + """, + args.tables, + len(args.tables), + pool_size, + ) # if internal arg drop_pg_slot is set to True, then we drop the slot before starting resync if args.drop_pg_slot: @@ -168,11 +169,16 @@ def main_impl(): # Start loading tables in parallel in spawning processes with multiprocessing.Pool(pool_size) as proc: table_sync_excs = list( - filter(lambda x: not isinstance(x, bool), proc.map(partial(sync_table, args=args), args.tables))) + filter( + lambda x: not isinstance(x, bool), + proc.map(partial(sync_table, args=args), args.tables), + ) + ) # Log summary end_time = datetime.now() - LOGGER.info(""" + LOGGER.info( + """ ------------------------------------------------------- SYNC FINISHED - SUMMARY ------------------------------------------------------- @@ -183,8 +189,13 @@ def main_impl(): Pool size : %s Runtime : %s ------------------------------------------------------- - """, len(args.tables), len(args.tables) - len(table_sync_excs), str(table_sync_excs), - pool_size, end_time - start_time) + """, + len(args.tables), + len(args.tables) - len(table_sync_excs), + str(table_sync_excs), + pool_size, + end_time - start_time, + ) if len(table_sync_excs) > 0: sys.exit(1) diff --git a/pipelinewise/fastsync/postgres_to_snowflake.py b/pipelinewise/fastsync/postgres_to_snowflake.py index 3d9d4eca2..867f2b96a 100644 --- a/pipelinewise/fastsync/postgres_to_snowflake.py +++ b/pipelinewise/fastsync/postgres_to_snowflake.py @@ -24,7 +24,7 @@ 'user', 'password', 'dbname', - 'tap_id' # tap_id is required to generate unique replication slot names + 'tap_id', # tap_id is required to generate unique replication slot names 
], 'target': [ 'account', @@ -34,8 +34,8 @@ 'warehouse', 's3_bucket', 'stage', - 'file_format' - ] + 'file_format', + ], } LOCK = multiprocessing.Lock() @@ -74,7 +74,7 @@ def tap_type_to_target_type(pg_type): # ARRAY is uppercase, because postgres stores it in this format in information_schema.columns.data_type 'ARRAY': 'VARIANT', 'json': 'VARIANT', - 'jsonb': 'VARIANT' + 'jsonb': 'VARIANT', }.get(pg_type, 'VARCHAR') @@ -96,14 +96,18 @@ def sync_table(table: str, args: Namespace) -> Union[bool, str]: postgres.open_connection() # Get bookmark - LSN position or Incremental Key value - bookmark = utils.get_bookmark_for_table(table, args.properties, postgres, dbname=dbname) + bookmark = utils.get_bookmark_for_table( + table, args.properties, postgres, dbname=dbname + ) # Exporting table data, get table definitions and close connection to avoid timeouts - postgres.copy_table(table, - filepath, - split_large_files=args.target.get('split_large_files'), - split_file_chunk_size_mb=args.target.get('split_file_chunk_size_mb'), - split_file_max_chunks=args.target.get('split_file_max_chunks')) + postgres.copy_table( + table, + filepath, + split_large_files=args.target.get('split_large_files'), + split_file_chunk_size_mb=args.target.get('split_file_chunk_size_mb'), + split_file_max_chunks=args.target.get('split_file_max_chunks'), + ) file_parts = glob.glob(f'{filepath}*') size_bytes = sum([os.path.getsize(file_part) for file_part in file_parts]) snowflake_types = postgres.map_column_types_to_target(table) @@ -118,14 +122,22 @@ def sync_table(table: str, args: Namespace) -> Union[bool, str]: os.remove(file_part) # Create a pattern that match all file parts by removing multipart suffix - s3_key_pattern = re.sub(r'\.part\d*$', '', s3_keys[0]) if len(s3_keys) > 0 else 'NO_FILES_TO_LOAD' + s3_key_pattern = ( + re.sub(r'\.part\d*$', '', s3_keys[0]) + if len(s3_keys) > 0 + else 'NO_FILES_TO_LOAD' + ) # Creating temp table in Snowflake snowflake.create_schema(target_schema) - snowflake.create_table(target_schema, table, snowflake_columns, primary_key, is_temporary=True) + snowflake.create_table( + target_schema, table, snowflake_columns, primary_key, is_temporary=True + ) # Load into Snowflake table - snowflake.copy_to_table(s3_key_pattern, target_schema, table, size_bytes, is_temporary=True) + snowflake.copy_to_table( + s3_key_pattern, target_schema, table, size_bytes, is_temporary=True + ) for s3_key in s3_keys: if archive_load_files: @@ -170,7 +182,8 @@ def main_impl(): table_sync_excs = [] # Log start info - LOGGER.info(""" + LOGGER.info( + """ ------------------------------------------------------- STARTING SYNC ------------------------------------------------------- @@ -178,7 +191,11 @@ def main_impl(): Total tables selected to sync : %s Pool size : %s ------------------------------------------------------- - """, args.tables, len(args.tables), pool_size) + """, + args.tables, + len(args.tables), + pool_size, + ) # if internal arg drop_pg_slot is set to True, then we drop the slot before starting resync if args.drop_pg_slot: @@ -187,11 +204,16 @@ def main_impl(): # Start loading tables in parallel in spawning processes with multiprocessing.Pool(pool_size) as proc: table_sync_excs = list( - filter(lambda x: not isinstance(x, bool), proc.map(partial(sync_table, args=args), args.tables))) + filter( + lambda x: not isinstance(x, bool), + proc.map(partial(sync_table, args=args), args.tables), + ) + ) # Log summary end_time = datetime.now() - LOGGER.info(""" + LOGGER.info( + """ 
------------------------------------------------------- SYNC FINISHED - SUMMARY ------------------------------------------------------- @@ -202,8 +224,13 @@ def main_impl(): Pool size : %s Runtime : %s ------------------------------------------------------- - """, len(args.tables), len(args.tables) - len(table_sync_excs), str(table_sync_excs), - pool_size, end_time - start_time) + """, + len(args.tables), + len(args.tables) - len(table_sync_excs), + str(table_sync_excs), + pool_size, + end_time - start_time, + ) if len(table_sync_excs) > 0: sys.exit(1) diff --git a/pipelinewise/fastsync/s3_csv_to_bigquery.py b/pipelinewise/fastsync/s3_csv_to_bigquery.py index 859f582d9..7c81c30fe 100644 --- a/pipelinewise/fastsync/s3_csv_to_bigquery.py +++ b/pipelinewise/fastsync/s3_csv_to_bigquery.py @@ -16,14 +16,10 @@ REQUIRED_CONFIG_KEYS = { - 'tap': [ - 'bucket', - 'start_date' - ], + 'tap': ['bucket', 'start_date'], 'target': [ - 'project_id', - ] + ], } LOCK = multiprocessing.Lock() @@ -38,8 +34,7 @@ def tap_type_to_target_type(csv_type): 'string': 'STRING', 'boolean': 'STRING', # The guess sometimes can be wrong, we'll use string for now. 'date': 'STRING', # The guess sometimes can be wrong, we'll use string for now. - - 'date_override': 'TIMESTAMP' # Column type to use when date_override defined in YAML + 'date_override': 'TIMESTAMP', # Column type to use when date_override defined in YAML }.get(csv_type, 'STRING') @@ -49,7 +44,9 @@ def sync_table(table_name: str, args: Namespace) -> Union[bool, str]: bigquery = FastSyncTargetBigquery(args.target, args.transform) try: - filename = utils.gen_export_filename(tap_id=args.target.get('tap_id'), table=table_name) + filename = utils.gen_export_filename( + tap_id=args.target.get('tap_id'), table=table_name + ) filepath = os.path.join(args.temp_dir, filename) target_schema = utils.get_target_schema(args.target, table_name) @@ -62,14 +59,23 @@ def sync_table(table_name: str, args: Namespace) -> Union[bool, str]: # Creating temp table in Bigquery bigquery.create_schema(target_schema) - bigquery.create_table(target_schema, - table_name, - bigquery_columns, - is_temporary=True, - sort_columns=True) + bigquery.create_table( + target_schema, + table_name, + bigquery_columns, + is_temporary=True, + sort_columns=True, + ) # Load into Bigquery table - bigquery.copy_to_table(filepath, target_schema, table_name, size_bytes, is_temporary=True, skip_csv_header=True) + bigquery.copy_to_table( + filepath, + target_schema, + table_name, + size_bytes, + is_temporary=True, + skip_csv_header=True, + ) os.remove(filepath) # Obfuscate columns @@ -109,7 +115,8 @@ def main_impl(): start_time = datetime.now() # Log start info - LOGGER.info(""" + LOGGER.info( + """ ------------------------------------------------------- STARTING SYNC ------------------------------------------------------- @@ -117,17 +124,26 @@ def main_impl(): Total tables selected to sync : %s Pool size : %s ------------------------------------------------------- - """, args.tables, len(args.tables), pool_size) + """, + args.tables, + len(args.tables), + pool_size, + ) # Start loading tables in parallel in spawning processes by # utilising all available Pool size with multiprocessing.Pool(pool_size) as proc: table_sync_excs = list( - filter(lambda x: not isinstance(x, bool), proc.map(partial(sync_table, args=args), args.tables))) + filter( + lambda x: not isinstance(x, bool), + proc.map(partial(sync_table, args=args), args.tables), + ) + ) # Log summary end_time = datetime.now() - LOGGER.info(""" + LOGGER.info( + 
""" ------------------------------------------------------- SYNC FINISHED - SUMMARY ------------------------------------------------------- @@ -138,8 +154,13 @@ def main_impl(): Pool size : %s Runtime : %s ------------------------------------------------------- - """, len(args.tables), len(args.tables) - len(table_sync_excs), - str(table_sync_excs), pool_size, end_time - start_time) + """, + len(args.tables), + len(args.tables) - len(table_sync_excs), + str(table_sync_excs), + pool_size, + end_time - start_time, + ) if len(table_sync_excs) > 0: sys.exit(1) diff --git a/pipelinewise/fastsync/s3_csv_to_postgres.py b/pipelinewise/fastsync/s3_csv_to_postgres.py index 4b8a9ed67..6ac9639f6 100644 --- a/pipelinewise/fastsync/s3_csv_to_postgres.py +++ b/pipelinewise/fastsync/s3_csv_to_postgres.py @@ -15,16 +15,8 @@ LOGGER = Logger().get_logger(__name__) REQUIRED_CONFIG_KEYS = { - 'tap': [ - 'bucket', - 'start_date' - ], - 'target': [ - 'host', - 'port', - 'user', - 'password' - ] + 'tap': ['bucket', 'start_date'], + 'target': ['host', 'port', 'user', 'password'], } LOCK = multiprocessing.Lock() @@ -39,8 +31,7 @@ def tap_type_to_target_type(csv_type): 'string': 'CHARACTER VARYING', 'boolean': 'CHARACTER VARYING', # The guess sometimes can be wrong, we'll use varchar for now. 'date': 'CHARACTER VARYING', # The guess sometimes can be wrong, we'll use varchar for now. - - 'date_override': 'TIMESTAMP WITHOUT TIME ZONE' # Column type to use when date_override defined in YAML + 'date_override': 'TIMESTAMP WITHOUT TIME ZONE', # Column type to use when date_override defined in YAML }.get(csv_type, 'CHARACTER VARYING') @@ -50,7 +41,9 @@ def sync_table(table_name: str, args: Namespace) -> Union[bool, str]: postgres = FastSyncTargetPostgres(args.target, args.transform) try: - filename = utils.gen_export_filename(tap_id=args.target.get('tap_id'), table=table_name) + filename = utils.gen_export_filename( + tap_id=args.target.get('tap_id'), table=table_name + ) filepath = os.path.join(args.temp_dir, filename) target_schema = utils.get_target_schema(args.target, table_name) @@ -64,15 +57,24 @@ def sync_table(table_name: str, args: Namespace) -> Union[bool, str]: # Creating temp table in Postgres postgres.drop_table(target_schema, table_name, is_temporary=True) - postgres.create_table(target_schema, - table_name, - postgres_columns, - primary_key, - is_temporary=True, - sort_columns=True) + postgres.create_table( + target_schema, + table_name, + postgres_columns, + primary_key, + is_temporary=True, + sort_columns=True, + ) # Load into Postgres table - postgres.copy_to_table(filepath, target_schema, table_name, size_bytes, is_temporary=True, skip_csv_header=True) + postgres.copy_to_table( + filepath, + target_schema, + table_name, + size_bytes, + is_temporary=True, + skip_csv_header=True, + ) os.remove(filepath) # Obfuscate columns @@ -111,7 +113,8 @@ def main_impl(): start_time = datetime.now() # Log start info - LOGGER.info(""" + LOGGER.info( + """ ------------------------------------------------------- STARTING SYNC ------------------------------------------------------- @@ -119,7 +122,11 @@ def main_impl(): Total tables selected to sync : %s Pool size : %s ------------------------------------------------------- - """, args.tables, len(args.tables), pool_size) + """, + args.tables, + len(args.tables), + pool_size, + ) # Create target schemas sequentially, Postgres doesn't like it running in parallel postgres_target = FastSyncTargetPostgres(args.target, args.transform) @@ -128,11 +135,16 @@ def main_impl(): # 
Start loading tables in parallel in spawning processes with multiprocessing.Pool(pool_size) as proc: table_sync_excs = list( - filter(lambda x: not isinstance(x, bool), proc.map(partial(sync_table, args=args), args.tables))) + filter( + lambda x: not isinstance(x, bool), + proc.map(partial(sync_table, args=args), args.tables), + ) + ) # Log summary end_time = datetime.now() - LOGGER.info(""" + LOGGER.info( + """ ------------------------------------------------------- SYNC FINISHED - SUMMARY ------------------------------------------------------- @@ -143,8 +155,13 @@ def main_impl(): Pool size : %s Runtime : %s ------------------------------------------------------- - """, len(args.tables), len(args.tables) - len(table_sync_excs), - str(table_sync_excs), pool_size, end_time - start_time) + """, + len(args.tables), + len(args.tables) - len(table_sync_excs), + str(table_sync_excs), + pool_size, + end_time - start_time, + ) if len(table_sync_excs) > 0: sys.exit(1) diff --git a/pipelinewise/fastsync/s3_csv_to_redshift.py b/pipelinewise/fastsync/s3_csv_to_redshift.py index 0e41f7d93..b65ac6fe0 100644 --- a/pipelinewise/fastsync/s3_csv_to_redshift.py +++ b/pipelinewise/fastsync/s3_csv_to_redshift.py @@ -15,18 +15,8 @@ LOGGER = Logger().get_logger(__name__) REQUIRED_CONFIG_KEYS = { - 'tap': [ - 'bucket', - 'start_date' - ], - 'target': [ - 'host', - 'port', - 'user', - 'password', - 'dbname', - 's3_bucket' - ] + 'tap': ['bucket', 'start_date'], + 'target': ['host', 'port', 'user', 'password', 'dbname', 's3_bucket'], } LOCK = multiprocessing.Lock() @@ -41,8 +31,7 @@ def tap_type_to_target_type(csv_type): 'string': 'CHARACTER VARYING', 'boolean': 'CHARACTER VARYING', # The guess sometimes can be wrong, we'll use varchar for now. 'date': 'CHARACTER VARYING', # The guess sometimes can be wrong, we'll use varchar for now. 
- - 'date_override': 'TIMESTAMP WITHOUT TIME ZONE' # Column type to use when date_override defined in YAML + 'date_override': 'TIMESTAMP WITHOUT TIME ZONE', # Column type to use when date_override defined in YAML }.get(csv_type, 'CHARACTER VARYING') @@ -52,7 +41,9 @@ def sync_table(table_name: str, args: Namespace) -> Union[bool, str]: redshift = FastSyncTargetRedshift(args.target, args.transform) try: - filename = utils.gen_export_filename(tap_id=args.target.get('tap_id'), table=table_name) + filename = utils.gen_export_filename( + tap_id=args.target.get('tap_id'), table=table_name + ) filepath = os.path.join(args.temp_dir, filename) target_schema = utils.get_target_schema(args.target, table_name) @@ -70,15 +61,24 @@ def sync_table(table_name: str, args: Namespace) -> Union[bool, str]: # Creating temp table in Redshift redshift.create_schema(target_schema) - redshift.create_table(target_schema, - table_name, - redshift_columns, - primary_key, - is_temporary=True, - sort_columns=True) + redshift.create_table( + target_schema, + table_name, + redshift_columns, + primary_key, + is_temporary=True, + sort_columns=True, + ) # Load into Redshift table - redshift.copy_to_table(s3_key, target_schema, table_name, size_bytes, is_temporary=True, skip_csv_header=True) + redshift.copy_to_table( + s3_key, + target_schema, + table_name, + size_bytes, + is_temporary=True, + skip_csv_header=True, + ) # Obfuscate columns redshift.obfuscate_columns(target_schema, table_name) @@ -116,7 +116,8 @@ def main_impl(): start_time = datetime.now() # Log start info - LOGGER.info(""" + LOGGER.info( + """ ------------------------------------------------------- STARTING SYNC ------------------------------------------------------- @@ -124,7 +125,11 @@ def main_impl(): Total tables selected to sync : %s Pool size : %s ------------------------------------------------------- - """, args.tables, len(args.tables), pool_size) + """, + args.tables, + len(args.tables), + pool_size, + ) # Create target schemas sequentially, Redshift doesn't like it running in parallel redshift = FastSyncTargetRedshift(args.target, args.transform) @@ -133,11 +138,16 @@ def main_impl(): # Start loading tables in parallel in spawning processes with multiprocessing.Pool(pool_size) as proc: table_sync_excs = list( - filter(lambda x: not isinstance(x, bool), proc.map(partial(sync_table, args=args), args.tables))) + filter( + lambda x: not isinstance(x, bool), + proc.map(partial(sync_table, args=args), args.tables), + ) + ) # Log summary end_time = datetime.now() - LOGGER.info(""" + LOGGER.info( + """ ------------------------------------------------------- SYNC FINISHED - SUMMARY ------------------------------------------------------- @@ -148,8 +158,13 @@ def main_impl(): Pool size : %s Runtime : %s ------------------------------------------------------- - """, len(args.tables), len(args.tables) - len(table_sync_excs), - str(table_sync_excs), pool_size, end_time - start_time) + """, + len(args.tables), + len(args.tables) - len(table_sync_excs), + str(table_sync_excs), + pool_size, + end_time - start_time, + ) if len(table_sync_excs) > 0: sys.exit(1) diff --git a/pipelinewise/fastsync/s3_csv_to_snowflake.py b/pipelinewise/fastsync/s3_csv_to_snowflake.py index 04f8a2b95..84c0268e4 100644 --- a/pipelinewise/fastsync/s3_csv_to_snowflake.py +++ b/pipelinewise/fastsync/s3_csv_to_snowflake.py @@ -16,10 +16,7 @@ REQUIRED_CONFIG_KEYS = { - 'tap': [ - 'bucket', - 'start_date' - ], + 'tap': ['bucket', 'start_date'], 'target': [ 'account', 'dbname', @@ -28,8 +25,8 
@@ 'warehouse', 's3_bucket', 'stage', - 'file_format' - ] + 'file_format', + ], } LOCK = multiprocessing.Lock() @@ -44,8 +41,7 @@ def tap_type_to_target_type(csv_type): 'string': 'VARCHAR', 'boolean': 'VARCHAR', # The guess sometimes can be wrong, we'll use varchar for now. 'date': 'VARCHAR', # The guess sometimes can be wrong, we'll use varchar for now. - - 'date_override': 'TIMESTAMP_NTZ' # Column type to use when date_override defined in YAML + 'date_override': 'TIMESTAMP_NTZ', # Column type to use when date_override defined in YAML }.get(csv_type, 'VARCHAR') @@ -55,7 +51,9 @@ def sync_table(table_name: str, args: Namespace) -> Union[bool, str]: snowflake = FastSyncTargetSnowflake(args.target, args.transform) try: - filename = utils.gen_export_filename(tap_id=args.target.get('tap_id'), table=table_name) + filename = utils.gen_export_filename( + tap_id=args.target.get('tap_id'), table=table_name + ) filepath = os.path.join(args.temp_dir, filename) target_schema = utils.get_target_schema(args.target, table_name) @@ -73,21 +71,32 @@ def sync_table(table_name: str, args: Namespace) -> Union[bool, str]: # Creating temp table in Snowflake snowflake.create_schema(target_schema) - snowflake.create_table(target_schema, - table_name, - snowflake_columns, - primary_key, - is_temporary=True, - sort_columns=True) + snowflake.create_table( + target_schema, + table_name, + snowflake_columns, + primary_key, + is_temporary=True, + sort_columns=True, + ) # Load into Snowflake table - snowflake.copy_to_table(s3_key, target_schema, table_name, size_bytes, is_temporary=True, skip_csv_header=True) + snowflake.copy_to_table( + s3_key, + target_schema, + table_name, + size_bytes, + is_temporary=True, + skip_csv_header=True, + ) # Obfuscate columns snowflake.obfuscate_columns(target_schema, table_name) # Create target table and swap with the temp table in Snowflake - snowflake.create_table(target_schema, table_name, snowflake_columns, primary_key, sort_columns=True) + snowflake.create_table( + target_schema, table_name, snowflake_columns, primary_key, sort_columns=True + ) snowflake.swap_tables(target_schema, table_name) # Get bookmark @@ -120,7 +129,8 @@ def main_impl(): start_time = datetime.now() # Log start info - LOGGER.info(""" + LOGGER.info( + """ ------------------------------------------------------- STARTING SYNC ------------------------------------------------------- @@ -128,17 +138,26 @@ def main_impl(): Total tables selected to sync : %s Pool size : %s ------------------------------------------------------- - """, args.tables, len(args.tables), pool_size) + """, + args.tables, + len(args.tables), + pool_size, + ) # Start loading tables in parallel in spawning processes by # utilising all available Pool size with multiprocessing.Pool(pool_size) as proc: table_sync_excs = list( - filter(lambda x: not isinstance(x, bool), proc.map(partial(sync_table, args=args), args.tables))) + filter( + lambda x: not isinstance(x, bool), + proc.map(partial(sync_table, args=args), args.tables), + ) + ) # Log summary end_time = datetime.now() - LOGGER.info(""" + LOGGER.info( + """ ------------------------------------------------------- SYNC FINISHED - SUMMARY ------------------------------------------------------- @@ -149,8 +168,13 @@ def main_impl(): Pool size : %s Runtime : %s ------------------------------------------------------- - """, len(args.tables), len(args.tables) - len(table_sync_excs), - str(table_sync_excs), pool_size, end_time - start_time) + """, + len(args.tables), + len(args.tables) - 
len(table_sync_excs), + str(table_sync_excs), + pool_size, + end_time - start_time, + ) if len(table_sync_excs) > 0: sys.exit(1) diff --git a/pipelinewise/logger.py b/pipelinewise/logger.py index 2a1a95cb6..2dd32e64a 100644 --- a/pipelinewise/logger.py +++ b/pipelinewise/logger.py @@ -4,6 +4,7 @@ from logging.config import fileConfig from pathlib import Path + # pylint: disable=too-few-public-methods class Logger: """PipelineWise logger class""" diff --git a/pipelinewise/utils.py b/pipelinewise/utils.py index 3746d1d09..1f3a8d6e1 100644 --- a/pipelinewise/utils.py +++ b/pipelinewise/utils.py @@ -4,7 +4,9 @@ from typing import Optional -def safe_column_name(name: Optional[str], quote_character: Optional[str]=None) -> Optional[str]: +def safe_column_name( + name: Optional[str], quote_character: Optional[str] = None +) -> Optional[str]: """ Makes column name safe by capitalizing and wrapping it in double quotes Args: diff --git a/setup.py b/setup.py index 9d23d4026..506f4482c 100644 --- a/setup.py +++ b/setup.py @@ -40,6 +40,7 @@ ], extras_require={ 'test': [ + 'flake8==3.9.2', 'pytest==6.2.4', 'pytest-dependency==0.4.0', 'pytest-cov==2.12.1', diff --git a/tests/end_to_end/helpers/assertions.py b/tests/end_to_end/helpers/assertions.py index 810129cde..470bf6a7a 100644 --- a/tests/end_to_end/helpers/assertions.py +++ b/tests/end_to_end/helpers/assertions.py @@ -26,7 +26,9 @@ def assert_run_tap_success(tap, target, sync_engines, profiling=False): assert_state_file_valid(target, tap, log_file) if profiling: - assert_profiling_stats_files_created(stdout, 'run_tap', sync_engines, tap, target) + assert_profiling_stats_files_created( + stdout, 'run_tap', sync_engines, tap, target + ) def assert_resync_tables_success(tap, target, profiling=False): @@ -45,7 +47,9 @@ def assert_resync_tables_success(tap, target, profiling=False): assert_state_file_valid(target, tap, log_file) if profiling: - assert_profiling_stats_files_created(stdout, 'sync_tables', ['fastsync'], tap, target) + assert_profiling_stats_files_created( + stdout, 'sync_tables', ['fastsync'], tap, target + ) def assert_command_success(return_code, stdout, stderr, log_path=None): @@ -74,7 +78,9 @@ def assert_command_success(return_code, stdout, stderr, log_path=None): def assert_state_file_valid(target_name, tap_name, log_path=None): """Assert helper function to check if state file exists for a certain tap for a certain target""" - state_file = Path(f'{Path.home()}/.pipelinewise/{target_name}/{tap_name}/state.json').resolve() + state_file = Path( + f'{Path.home()}/.pipelinewise/{target_name}/{tap_name}/state.json' + ).resolve() assert os.path.isfile(state_file) # Check if state file content equals to last emitted state in log @@ -82,7 +88,10 @@ def assert_state_file_valid(target_name, tap_name, log_path=None): success_log_path = f'{log_path}.success' state_in_log = None with open(success_log_path, 'r', encoding='utf-8') as log_f: - state_log_pattern = re.search(r'\nINFO STATE emitted from target: (.+\n)', '\n'.join(log_f.readlines())) + state_log_pattern = re.search( + r'\nINFO STATE emitted from target: (.+\n)', + '\n'.join(log_f.readlines()), + ) if state_log_pattern: state_in_log = state_log_pattern.groups()[-1] @@ -92,7 +101,9 @@ def assert_state_file_valid(target_name, tap_name, log_path=None): assert state_in_log == ''.join(state_f.readlines()) -def assert_cols_in_table(query_runner_fn: callable, table_schema: str, table_name: str, columns: List[str]): +def assert_cols_in_table( + query_runner_fn: callable, table_schema: str, 
table_name: str, columns: List[str] +): """Fetches the given table's columns from information_schema and tests if every given column is in the result @@ -102,14 +113,20 @@ def assert_cols_in_table(query_runner_fn: callable, table_schema: str, table_nam :param columns: list of columns to check if there are in the table's columns """ funcs = _map_tap_to_target_functions(None, query_runner_fn) - sql_get_columns_for_table_fn = funcs.get('target_sql_get_table_cols_fn', db.sql_get_columns_for_table) + sql_get_columns_for_table_fn = funcs.get( + 'target_sql_get_table_cols_fn', db.sql_get_columns_for_table + ) sql = sql_get_columns_for_table_fn(table_schema, table_name) result = query_runner_fn(sql) cols = [res[0] for res in result] try: assert all(col in cols for col in columns) except AssertionError as ex: - ex.args += ('Error', columns, f'One ore more columns not found in target table {table_name}') + ex.args += ( + 'Error', + columns, + f'One ore more columns not found in target table {table_name}', + ) raise @@ -118,7 +135,9 @@ def _run_sql(query_runner_fn: callable, sql_query: str) -> List: return list(query_runner_fn(sql_query)) -def _map_tap_to_target_functions(tap_query_runner_fn: callable, target_query_runner_fn: callable) -> dict: +def _map_tap_to_target_functions( + tap_query_runner_fn: callable, target_query_runner_fn: callable +) -> dict: """Takes two query runner methods and creates a map with the compatible database specific functions that required to run assertions. @@ -132,19 +151,19 @@ def _map_tap_to_target_functions(tap_query_runner_fn: callable, target_query_run 'source_schemas': ['mysql_source_db'], 'target_schemas': ['ppw_e2e_tap_mysql'], 'source_sql_get_cols_fn': db.sql_get_columns_mysql, - 'source_sql_dynamic_row_count_fn': db.sql_dynamic_row_count_mysql + 'source_sql_dynamic_row_count_fn': db.sql_dynamic_row_count_mysql, }, # tap-postgres specific attributes and functions 'run_query_tap_postgres': { 'source_schemas': ['public', 'public2'], 'target_schemas': ['ppw_e2e_tap_postgres', 'ppw_e2e_tap_postgres_public2'], 'source_sql_get_cols_fn': db.sql_get_columns_postgres, - 'source_sql_dynamic_row_count_fn': db.sql_dynamic_row_count_postgres + 'source_sql_dynamic_row_count_fn': db.sql_dynamic_row_count_postgres, }, # target-postgres specific attributes and functions 'run_query_target_postgres': { 'target_sql_get_cols_fn': db.sql_get_columns_postgres, - 'target_sql_dynamic_row_count_fn': db.sql_dynamic_row_count_postgres + 'target_sql_dynamic_row_count_fn': db.sql_dynamic_row_count_postgres, }, # target-snowflake specific attributes and functions 'run_query_target_snowflake': { @@ -162,16 +181,21 @@ def _map_tap_to_target_functions(tap_query_runner_fn: callable, target_query_run 'run_query_target_redshift': { 'target_sql_get_cols_fn': db.sql_get_columns_redshift, 'target_sql_dynamic_row_count_fn': db.sql_dynamic_row_count_redshift, - } + }, } # Merge the keys into one dict by tap and target query runner names if tap_query_runner_fn: - return {**f_map[tap_query_runner_fn.__name__], **f_map[target_query_runner_fn.__name__]} + return { + **f_map[tap_query_runner_fn.__name__], + **f_map[target_query_runner_fn.__name__], + } return {**f_map[target_query_runner_fn.__name__]} -def assert_row_counts_equal(tap_query_runner_fn: callable, target_query_runner_fn: callable) -> None: +def assert_row_counts_equal( + tap_query_runner_fn: callable, target_query_runner_fn: callable +) -> None: """Takes two query runner methods, counts the row numbers in every table in both the source and 
target databases and tests if the row counts are matching. @@ -185,12 +209,20 @@ def assert_row_counts_equal(tap_query_runner_fn: callable, target_query_runner_f target_schemas = funcs['target_schemas'] # Generate a dynamic SQLs to count rows in source and target databases - source_dynamic_sql_row_count = funcs['source_sql_dynamic_row_count_fn'](source_schemas) - target_dynamic_sql_row_count = funcs['target_sql_dynamic_row_count_fn'](target_schemas) + source_dynamic_sql_row_count = funcs['source_sql_dynamic_row_count_fn']( + source_schemas + ) + target_dynamic_sql_row_count = funcs['target_sql_dynamic_row_count_fn']( + target_schemas + ) # Count rows - source_sql_row_count = _run_sql(tap_query_runner_fn, source_dynamic_sql_row_count)[0][0] - target_sql_row_count = _run_sql(target_query_runner_fn, target_dynamic_sql_row_count)[0][0] + source_sql_row_count = _run_sql(tap_query_runner_fn, source_dynamic_sql_row_count)[ + 0 + ][0] + target_sql_row_count = _run_sql( + target_query_runner_fn, target_dynamic_sql_row_count + )[0][0] # Run the generated SQLs row_counts_in_source = _run_sql(tap_query_runner_fn, source_sql_row_count) @@ -200,11 +232,8 @@ def assert_row_counts_equal(tap_query_runner_fn: callable, target_query_runner_f # we fix that by renaming the source tables to names that the target would accept if 'target_sql_safe_name_fn' in funcs: row_counts_in_source = [ - ( - funcs['target_sql_safe_name_fn'](table), - row_count - ) - for (table,row_count) in row_counts_in_source + (funcs['target_sql_safe_name_fn'](table), row_count) + for (table, row_count) in row_counts_in_source ] # Compare the two dataset @@ -212,10 +241,12 @@ def assert_row_counts_equal(tap_query_runner_fn: callable, target_query_runner_f # pylint: disable=too-many-locals -def assert_all_columns_exist(tap_query_runner_fn: callable, - target_query_runner_fn: callable, - column_type_mapper_fn: callable = None, - ignore_cols: Union[Set, List] = None) -> None: +def assert_all_columns_exist( + tap_query_runner_fn: callable, + target_query_runner_fn: callable, + column_type_mapper_fn: callable = None, + ignore_cols: Union[Set, List] = None, +) -> None: """Takes two query runner methods, gets the columns list for every table in both the source and target database and tests if every column in source exists in the target database. 
Some taps have unsupported column types and these are not part of the schemas published to the target thus @@ -251,10 +282,7 @@ def _cols_list_to_dict(cols: List) -> dict: cols_dict = {} for col in cols: col_props = col.split(':') - cols_dict[col_props[0]] = { - 'type': col_props[1], - 'type_extra': col_props[2] - } + cols_dict[col_props[0]] = {'type': col_props[1], 'type_extra': col_props[2]} return cols_dict @@ -270,7 +298,11 @@ def _cols_list_to_dict(cols: List) -> dict: source_cols = table_cols[1].lower().split(';') try: - target_cols = next(t[1] for t in target_table_cols if t[0].lower() == table_to_check).lower().split(';') + target_cols = ( + next(t[1] for t in target_table_cols if t[0].lower() == table_to_check) + .lower() + .split(';') + ) except StopIteration as ex: ex.args += ('Error', f'{table_to_check} table not found in target') raise @@ -287,25 +319,38 @@ def _cols_list_to_dict(cols: List) -> dict: try: assert col_name in target_cols_dict except AssertionError as ex: - ex.args += ('Error', f'{col_name} column not found in target table {table_to_check}') + ex.args += ( + 'Error', + f'{col_name} column not found in target table {table_to_check}', + ) raise # Check if column type is expected in the target table, if mapper function provided if column_type_mapper_fn: try: target_col = target_cols_dict[col_name] - exp_col_type = column_type_mapper_fn(col_props['type'], col_props['type_extra']) \ - .replace(' NULL', '').lower() + exp_col_type = ( + column_type_mapper_fn( + col_props['type'], col_props['type_extra'] + ) + .replace(' NULL', '') + .lower() + ) act_col_type = target_col['type'].lower() assert act_col_type == exp_col_type except AssertionError as ex: - ex.args += ('Error', f'{col_name} column type is not as expected. ' - f'Expected: {exp_col_type} ' - f'Actual: {act_col_type}') + ex.args += ( + 'Error', + f'{col_name} column type is not as expected. 
' + f'Expected: {exp_col_type} ' + f'Actual: {act_col_type}', + ) raise -def assert_date_column_naive_in_target(target_query_runner_fn, column_name, full_table_name): +def assert_date_column_naive_in_target( + target_query_runner_fn, column_name, full_table_name +): """ Checks if all dates in the given column are naive,i.e no timezone Args: @@ -313,19 +358,20 @@ def assert_date_column_naive_in_target(target_query_runner_fn, column_name, full column_name: column of timestamp type full_table_name: fully qualified table name """ - dates = target_query_runner_fn( - f'SELECT {column_name} FROM {full_table_name};') + dates = target_query_runner_fn(f'SELECT {column_name} FROM {full_table_name};') for date in dates: if date[0] is not None: assert date[0].tzinfo is None -def assert_profiling_stats_files_created(stdout: str, - command: str, - sync_engines: List = None, - tap: Union[str, List[str]] = None, - target: str = None): +def assert_profiling_stats_files_created( + stdout: str, + command: str, + sync_engines: List = None, + tap: Union[str, List[str]] = None, + target: str = None, +): """ Asserts that profiling pstat files were created by checking their existence Args: @@ -339,7 +385,10 @@ def assert_profiling_stats_files_created(stdout: str, profiler_dir = tasks.find_profiling_folder(stdout) # crawl the folder looking for pstat files and strip the folder name from the file name - pstat_files = {file[len(f'{profiler_dir}/'):] for file in glob.iglob(f'{profiler_dir}/*.pstat')} + pstat_files = { + file[len(f'{profiler_dir}/'):] + for file in glob.iglob(f'{profiler_dir}/*.pstat') + } assert f'pipelinewise_{command}.pstat' in pstat_files diff --git a/tests/end_to_end/helpers/db.py b/tests/end_to_end/helpers/db.py index be363efe1..bd7ef8200 100644 --- a/tests/end_to_end/helpers/db.py +++ b/tests/end_to_end/helpers/db.py @@ -11,15 +11,14 @@ from pipelinewise.fastsync.commons.target_bigquery import safe_name + # pylint: disable=too-many-arguments def run_query_postgres(query, host, port, user, password, database): """Run and SQL query in a postgres database""" result_rows = [] - with psycopg2.connect(host=host, - port=port, - user=user, - password=password, - database=database) as conn: + with psycopg2.connect( + host=host, port=port, user=user, password=password, database=database + ) as conn: conn.set_session(autocommit=True) with conn.cursor() as cur: cur.execute(query) @@ -31,13 +30,15 @@ def run_query_postgres(query, host, port, user, password, database): def run_query_mysql(query, host, port, user, password, database): """Run and SQL query in a mysql database""" result_rows = [] - with pymysql.connect(host=host, - port=port, - user=user, - password=password, - database=database, - charset='utf8mb4', - cursorclass=pymysql.cursors.Cursor) as cur: + with pymysql.connect( + host=host, + port=port, + user=user, + password=password, + database=database, + charset='utf8mb4', + cursorclass=pymysql.cursors.Cursor, + ) as cur: cur.execute(query) if cur.rowcount > 0: result_rows = cur.fetchall() @@ -47,12 +48,14 @@ def run_query_mysql(query, host, port, user, password, database): def run_query_snowflake(query, account, database, warehouse, user, password): """Run and SQL query in a snowflake database""" result_rows = [] - with snowflake.connector.connect(account=account, - database=database, - warehouse=warehouse, - user=user, - password=password, - autocommit=True) as conn: + with snowflake.connector.connect( + account=account, + database=database, + warehouse=warehouse, + user=user, + password=password, 
+ autocommit=True, + ) as conn: with conn.cursor() as cur: cur.execute(query) if cur.rowcount > 0: @@ -70,6 +73,7 @@ def delete_dataset_bigquery(dataset, project): client = bigquery.Client(project=project) client.delete_dataset(dataset, delete_contents=True, not_found_ok=True) + def run_query_bigquery(query, project): """Run and SQL query in a BigQuery database""" client = bigquery.Client(project=project) @@ -77,6 +81,7 @@ def run_query_bigquery(query, project): query_job.result() return [r.values() for r in query_job] + def run_query_redshift(query, host, port, user, password, database): """Redshift is compatible with postgres""" return run_query_postgres(query, host, port, user, password, database) @@ -152,9 +157,12 @@ def sql_get_columns_snowflake(schemas: list) -> str: def sql_get_columns_bigquery(schemas: list) -> str: """Generates an SQL command that gives the list of columns of every table in a specific schema from a snowflake database""" - table_queries = ' UNION ALL '.join(f""" + table_queries = ' UNION ALL '.join( + f""" SELECT table_name, column_name, data_type - FROM `{schema}`.INFORMATION_SCHEMA.COLUMNS""" for schema in schemas) + FROM `{schema}`.INFORMATION_SCHEMA.COLUMNS""" + for schema in schemas + ) return f""" SELECT table_name, STRING_AGG(CONCAT(column_name, ':', data_type, ':'), ';' ORDER BY column_name) @@ -254,10 +262,13 @@ def sql_dynamic_row_count_snowflake(schemas: list) -> str: def sql_dynamic_row_count_bigquery(schemas: list) -> str: """Generates an SQL statement that counts the number of rows in every table in a specific schema(s) in a Snowflake database""" - table_queries = ' UNION DISTINCT '.join(f""" + table_queries = ' UNION DISTINCT '.join( + f""" SELECT table_schema, table_name FROM `{schema}`.INFORMATION_SCHEMA.TABLES - WHERE table_type = 'BASE TABLE'""" for schema in schemas) + WHERE table_type = 'BASE TABLE'""" + for schema in schemas + ) return f""" WITH table_list AS ({table_queries}) @@ -286,22 +297,26 @@ def sql_dynamic_row_count_redshift(schemas: list) -> str: ' UNION ') WITHIN GROUP ( ORDER BY tablename ) || 'ORDER BY tbl' FROM table_list - """ + """ # noqa: E501 -def get_mongodb_connection(host: str, - port: Union[str, int], - user: str, - password: str, - database: str, - auth_database: str)->Database: +def get_mongodb_connection( + host: str, + port: Union[str, int], + user: str, + password: str, + database: str, + auth_database: str, +) -> Database: """ Creates a mongoDB connection to the db to sync from Returns: Database instance with established connection """ - return pymongo.MongoClient(host=host, - port=int(port), - username=user, - password=password, - authSource=auth_database)[database] + return pymongo.MongoClient( + host=host, + port=int(port), + username=user, + password=password, + authSource=auth_database, + )[database] diff --git a/tests/end_to_end/helpers/env.py b/tests/end_to_end/helpers/env.py index a4a88e818..9327bff73 100644 --- a/tests/end_to_end/helpers/env.py +++ b/tests/end_to_end/helpers/env.py @@ -43,7 +43,9 @@ def _load_env(self): If optional connector properties are not defined in ../../../dev/project/.env then the related test cases will be skipped.""" - load_dotenv(dotenv_path=os.path.join(DIR, '..', '..', '..', 'dev-project', '.env')) + load_dotenv( + dotenv_path=os.path.join(DIR, '..', '..', '..', 'dev-project', '.env') + ) self.env = { # ------------------------------------------------------------------ # Tap Postgres is a REQUIRED test connector and test database with test data available @@ -52,12 +54,27 @@ def 
_load_env(self): 'TAP_POSTGRES': { 'template_patterns': ['tap_postgres'], 'vars': { - 'HOST' : {'value': os.environ.get('TAP_POSTGRES_HOST'), 'required': True}, - 'PORT' : {'value': os.environ.get('TAP_POSTGRES_PORT'), 'required': True}, - 'USER' : {'value': os.environ.get('TAP_POSTGRES_USER'), 'required': True}, - 'PASSWORD' : {'value': os.environ.get('TAP_POSTGRES_PASSWORD'), 'required': True}, - 'DB' : {'value': os.environ.get('TAP_POSTGRES_DB'), 'required': True} - } + 'HOST': { + 'value': os.environ.get('TAP_POSTGRES_HOST'), + 'required': True, + }, + 'PORT': { + 'value': os.environ.get('TAP_POSTGRES_PORT'), + 'required': True, + }, + 'USER': { + 'value': os.environ.get('TAP_POSTGRES_USER'), + 'required': True, + }, + 'PASSWORD': { + 'value': os.environ.get('TAP_POSTGRES_PASSWORD'), + 'required': True, + }, + 'DB': { + 'value': os.environ.get('TAP_POSTGRES_DB'), + 'required': True, + }, + }, }, # ------------------------------------------------------------------ # Tap MySQL is a REQUIRED test connector and test database with test data available @@ -66,12 +83,12 @@ def _load_env(self): 'TAP_MYSQL': { 'template_patterns': ['tap_mysql'], 'vars': { - 'HOST' : {'value': os.environ.get('TAP_MYSQL_HOST')}, - 'PORT' : {'value': os.environ.get('TAP_MYSQL_PORT')}, - 'USER' : {'value': os.environ.get('TAP_MYSQL_USER')}, - 'PASSWORD' : {'value': os.environ.get('TAP_MYSQL_PASSWORD')}, - 'DB' : {'value': os.environ.get('TAP_MYSQL_DB')}, - } + 'HOST': {'value': os.environ.get('TAP_MYSQL_HOST')}, + 'PORT': {'value': os.environ.get('TAP_MYSQL_PORT')}, + 'USER': {'value': os.environ.get('TAP_MYSQL_USER')}, + 'PASSWORD': {'value': os.environ.get('TAP_MYSQL_PASSWORD')}, + 'DB': {'value': os.environ.get('TAP_MYSQL_DB')}, + }, }, # ------------------------------------------------------------------ # Tap MongoDB is a REQUIRED test connector and test database with test data available @@ -80,13 +97,25 @@ def _load_env(self): 'TAP_MONGODB': { 'template_patterns': ['tap_postgres'], 'vars': { - 'HOST': {'value': os.environ.get('TAP_MONGODB_HOST'), 'required': True}, - 'PORT': {'value': os.environ.get('TAP_MONGODB_PORT'), 'required': True}, - 'USER': {'value': os.environ.get('TAP_MONGODB_USER'), 'required': True}, - 'PASSWORD': {'value': os.environ.get('TAP_MONGODB_PASSWORD'), 'required': True}, + 'HOST': { + 'value': os.environ.get('TAP_MONGODB_HOST'), + 'required': True, + }, + 'PORT': { + 'value': os.environ.get('TAP_MONGODB_PORT'), + 'required': True, + }, + 'USER': { + 'value': os.environ.get('TAP_MONGODB_USER'), + 'required': True, + }, + 'PASSWORD': { + 'value': os.environ.get('TAP_MONGODB_PASSWORD'), + 'required': True, + }, 'DB': {'value': os.environ.get('TAP_MONGODB_DB'), 'required': True}, - 'AUTH_DB': {'value': 'admin', 'required': True} - } + 'AUTH_DB': {'value': 'admin', 'required': True}, + }, }, # ------------------------------------------------------------------ # Tap S3 CSV is an OPTIONAL test connector and it requires credentials to a real S3 bucket. 
@@ -96,10 +125,12 @@ def _load_env(self): 'optional': True, 'template_patterns': ['tap_s3_csv'], 'vars': { - 'AWS_KEY' : {'value': os.environ.get('TAP_S3_CSV_AWS_KEY')}, - 'AWS_SECRET_ACCESS_KEY' : {'value': os.environ.get('TAP_S3_CSV_AWS_SECRET_ACCESS_KEY')}, - 'BUCKET' : {'value': os.environ.get('TAP_S3_CSV_BUCKET')}, - } + 'AWS_KEY': {'value': os.environ.get('TAP_S3_CSV_AWS_KEY')}, + 'AWS_SECRET_ACCESS_KEY': { + 'value': os.environ.get('TAP_S3_CSV_AWS_SECRET_ACCESS_KEY') + }, + 'BUCKET': {'value': os.environ.get('TAP_S3_CSV_BUCKET')}, + }, }, # ------------------------------------------------------------------ # Target Postgres is a REQUIRED test connector and test database available in the docker environment @@ -107,12 +138,12 @@ def _load_env(self): 'TARGET_POSTGRES': { 'template_patterns': ['target_postgres', 'to_pg'], 'vars': { - 'HOST' : {'value': os.environ.get('TARGET_POSTGRES_HOST')}, - 'PORT' : {'value': os.environ.get('TARGET_POSTGRES_PORT')}, - 'USER' : {'value': os.environ.get('TARGET_POSTGRES_USER')}, - 'PASSWORD' : {'value': os.environ.get('TARGET_POSTGRES_PASSWORD')}, - 'DB' : {'value': os.environ.get('TARGET_POSTGRES_DB')}, - } + 'HOST': {'value': os.environ.get('TARGET_POSTGRES_HOST')}, + 'PORT': {'value': os.environ.get('TARGET_POSTGRES_PORT')}, + 'USER': {'value': os.environ.get('TARGET_POSTGRES_USER')}, + 'PASSWORD': {'value': os.environ.get('TARGET_POSTGRES_PASSWORD')}, + 'DB': {'value': os.environ.get('TARGET_POSTGRES_DB')}, + }, }, # ------------------------------------------------------------------ # Target Snowflake is an OPTIONAL test connector because it's not open sourced and not part of @@ -123,26 +154,48 @@ def _load_env(self): 'optional': True, 'template_patterns': ['target_snowflake', 'to_sf'], 'vars': { - 'ACCOUNT' : {'value': os.environ.get('TARGET_SNOWFLAKE_ACCOUNT')}, - 'DBNAME' : {'value': os.environ.get('TARGET_SNOWFLAKE_DBNAME')}, - 'USER' : {'value': os.environ.get('TARGET_SNOWFLAKE_USER')}, - 'PASSWORD' : {'value': os.environ.get('TARGET_SNOWFLAKE_PASSWORD')}, - 'WAREHOUSE' : {'value': os.environ.get('TARGET_SNOWFLAKE_WAREHOUSE')}, - 'AWS_ACCESS_KEY' : {'value': os.environ.get('TARGET_SNOWFLAKE_AWS_ACCESS_KEY'), - 'optional': True}, - 'AWS_SECRET_ACCESS_KEY' : {'value': os.environ.get('TARGET_SNOWFLAKE_AWS_SECRET_ACCESS_KEY'), - 'optional': True}, - 'SESSION_TOKEN' : {'value': os.environ.get('TARGET_SNOWFLAKE_SESSION_TOKEN'), - 'optional': True}, - 'S3_BUCKET' : {'value': os.environ.get('TARGET_SNOWFLAKE_S3_BUCKET')}, - 'S3_KEY_PREFIX' : {'value': os.environ.get('TARGET_SNOWFLAKE_S3_KEY_PREFIX')}, - 'S3_ACL' : {'value': os.environ.get('TARGET_SNOWFLAKE_S3_ACL'), 'optional': True}, - 'STAGE' : {'value': os.environ.get('TARGET_SNOWFLAKE_STAGE')}, - 'FILE_FORMAT' : {'value': os.environ.get('TARGET_SNOWFLAKE_FILE_FORMAT')}, - 'CLIENT_SIDE_ENCRYPTION_MASTER_KEY': - {'value': os.environ.get('TARGET_SNOWFLAKE_CLIENT_SIDE_ENCRYPTION_MASTER_KEY'), - 'optional': True}, - } + 'ACCOUNT': {'value': os.environ.get('TARGET_SNOWFLAKE_ACCOUNT')}, + 'DBNAME': {'value': os.environ.get('TARGET_SNOWFLAKE_DBNAME')}, + 'USER': {'value': os.environ.get('TARGET_SNOWFLAKE_USER')}, + 'PASSWORD': {'value': os.environ.get('TARGET_SNOWFLAKE_PASSWORD')}, + 'WAREHOUSE': { + 'value': os.environ.get('TARGET_SNOWFLAKE_WAREHOUSE') + }, + 'AWS_ACCESS_KEY': { + 'value': os.environ.get('TARGET_SNOWFLAKE_AWS_ACCESS_KEY'), + 'optional': True, + }, + 'AWS_SECRET_ACCESS_KEY': { + 'value': os.environ.get( + 'TARGET_SNOWFLAKE_AWS_SECRET_ACCESS_KEY' + ), + 'optional': True, + }, + 
'SESSION_TOKEN': { + 'value': os.environ.get('TARGET_SNOWFLAKE_SESSION_TOKEN'), + 'optional': True, + }, + 'S3_BUCKET': { + 'value': os.environ.get('TARGET_SNOWFLAKE_S3_BUCKET') + }, + 'S3_KEY_PREFIX': { + 'value': os.environ.get('TARGET_SNOWFLAKE_S3_KEY_PREFIX') + }, + 'S3_ACL': { + 'value': os.environ.get('TARGET_SNOWFLAKE_S3_ACL'), + 'optional': True, + }, + 'STAGE': {'value': os.environ.get('TARGET_SNOWFLAKE_STAGE')}, + 'FILE_FORMAT': { + 'value': os.environ.get('TARGET_SNOWFLAKE_FILE_FORMAT') + }, + 'CLIENT_SIDE_ENCRYPTION_MASTER_KEY': { + 'value': os.environ.get( + 'TARGET_SNOWFLAKE_CLIENT_SIDE_ENCRYPTION_MASTER_KEY' + ), + 'optional': True, + }, + }, }, # ------------------------------------------------------------------ # Target BigQuery is an OPTIONAL test connector because it's not open sourced and not part of @@ -153,8 +206,8 @@ def _load_env(self): 'optional': True, 'template_patterns': ['target_bigquery', 'to_bq'], 'vars': { - 'PROJECT' : {'value': os.environ.get('TARGET_BIGQUERY_PROJECT')}, - } + 'PROJECT': {'value': os.environ.get('TARGET_BIGQUERY_PROJECT')}, + }, }, # ------------------------------------------------------------------ # Target Redshift is an OPTIONAL test connector because it's not open sourced and not part of @@ -165,36 +218,65 @@ def _load_env(self): 'optional': True, 'template_patterns': ['target_redshift', 'to_rs'], 'vars': { - 'HOST' : {'value': os.environ.get('TARGET_REDSHIFT_HOST')}, - 'PORT' : {'value': os.environ.get('TARGET_REDSHIFT_PORT')}, - 'USER' : {'value': os.environ.get('TARGET_REDSHIFT_USER')}, - 'PASSWORD' : {'value': os.environ.get('TARGET_REDSHIFT_PASSWORD')}, - 'DBNAME' : {'value': os.environ.get('TARGET_REDSHIFT_DBNAME')}, - 'AWS_ACCESS_KEY' : {'value': os.environ.get('TARGET_REDSHIFT_AWS_ACCESS_KEY'), - 'optional': True}, - 'AWS_SECRET_ACCESS_KEY' : {'value': os.environ.get('TARGET_REDSHIFT_AWS_SECRET_ACCESS_KEY'), - 'optional': True}, - 'SESSION_TOKEN' : {'value': os.environ.get('TARGET_REDSHIFT_SESSION_TOKEN'), - 'optional': True}, - 'COPY_ROLE_ARN' : {'value': os.environ.get('TARGET_REDSHIFT_COPY_ROLE_ARN'), - 'optional': True}, - 'S3_BUCKET' : {'value': os.environ.get('TARGET_REDSHIFT_S3_BUCKET')}, - 'S3_KEY_PREFIX' : {'value': os.environ.get('TARGET_REDSHIFT_S3_KEY_PREFIX')}, - 'S3_ACL' : {'value': os.environ.get('TARGET_REDSHIFT_S3_ACL'), 'optional': True} - } - } + 'HOST': {'value': os.environ.get('TARGET_REDSHIFT_HOST')}, + 'PORT': {'value': os.environ.get('TARGET_REDSHIFT_PORT')}, + 'USER': {'value': os.environ.get('TARGET_REDSHIFT_USER')}, + 'PASSWORD': {'value': os.environ.get('TARGET_REDSHIFT_PASSWORD')}, + 'DBNAME': {'value': os.environ.get('TARGET_REDSHIFT_DBNAME')}, + 'AWS_ACCESS_KEY': { + 'value': os.environ.get('TARGET_REDSHIFT_AWS_ACCESS_KEY'), + 'optional': True, + }, + 'AWS_SECRET_ACCESS_KEY': { + 'value': os.environ.get( + 'TARGET_REDSHIFT_AWS_SECRET_ACCESS_KEY' + ), + 'optional': True, + }, + 'SESSION_TOKEN': { + 'value': os.environ.get('TARGET_REDSHIFT_SESSION_TOKEN'), + 'optional': True, + }, + 'COPY_ROLE_ARN': { + 'value': os.environ.get('TARGET_REDSHIFT_COPY_ROLE_ARN'), + 'optional': True, + }, + 'S3_BUCKET': {'value': os.environ.get('TARGET_REDSHIFT_S3_BUCKET')}, + 'S3_KEY_PREFIX': { + 'value': os.environ.get('TARGET_REDSHIFT_S3_KEY_PREFIX') + }, + 'S3_ACL': { + 'value': os.environ.get('TARGET_REDSHIFT_S3_ACL'), + 'optional': True, + }, + }, + }, } # Add is_configured keys for every connector # Useful to skip certain test cases dynamically when specific tap # or target database is not configured - 
self.env['TAP_POSTGRES']['is_configured'] = self._is_env_connector_configured('TAP_POSTGRES') - self.env['TAP_MYSQL']['is_configured'] = self._is_env_connector_configured('TAP_MYSQL') - self.env['TAP_S3_CSV']['is_configured'] = self._is_env_connector_configured('TAP_S3_CSV') - self.env['TARGET_POSTGRES']['is_configured'] = self._is_env_connector_configured('TARGET_POSTGRES') - self.env['TARGET_REDSHIFT']['is_configured'] = self._is_env_connector_configured('TARGET_REDSHIFT') - self.env['TARGET_SNOWFLAKE']['is_configured'] = self._is_env_connector_configured('TARGET_SNOWFLAKE') - self.env['TARGET_BIGQUERY']['is_configured'] = self._is_env_connector_configured('TARGET_BIGQUERY') + self.env['TAP_POSTGRES']['is_configured'] = self._is_env_connector_configured( + 'TAP_POSTGRES' + ) + self.env['TAP_MYSQL']['is_configured'] = self._is_env_connector_configured( + 'TAP_MYSQL' + ) + self.env['TAP_S3_CSV']['is_configured'] = self._is_env_connector_configured( + 'TAP_S3_CSV' + ) + self.env['TARGET_POSTGRES'][ + 'is_configured' + ] = self._is_env_connector_configured('TARGET_POSTGRES') + self.env['TARGET_REDSHIFT'][ + 'is_configured' + ] = self._is_env_connector_configured('TARGET_REDSHIFT') + self.env['TARGET_SNOWFLAKE'][ + 'is_configured' + ] = self._is_env_connector_configured('TARGET_SNOWFLAKE') + self.env['TARGET_BIGQUERY'][ + 'is_configured' + ] = self._is_env_connector_configured('TARGET_BIGQUERY') def _get_conn_env_var(self, connector, key): """Get the value of a specific variable in the self.env dict""" @@ -209,14 +291,14 @@ def get_aws_session(self): aws_secret_access_key = os.environ.get('TARGET_SNOWFLAKE_AWS_SECRET_ACCESS_KEY') if aws_access_key_id is None or aws_secret_access_key is None: raise Exception( - 'Env vars TARGET_SNOWFLAKE_AWS_ACCESS_KEY and TARGET_SNOWFLAKE_AWS_SECRET_ACCESS_KEY are required') + 'Env vars TARGET_SNOWFLAKE_AWS_ACCESS_KEY and TARGET_SNOWFLAKE_AWS_SECRET_ACCESS_KEY are required' + ) return boto3.session.Session( aws_access_key_id=aws_access_key_id, - aws_secret_access_key=aws_secret_access_key + aws_secret_access_key=aws_secret_access_key, ) - def _is_env_connector_configured(self, env_connector): """Detect if certain component(s) of env vars group is configured properly""" env_conns = [] @@ -235,7 +317,9 @@ def _is_env_connector_configured(self, env_connector): if self.env[env_conn].get('optional'): return False # Value not defined but it's a required property - raise Exception(f'{env_conn}_{key} env var is required but not defined.') + raise Exception( + f'{env_conn}_{key} env var is required but not defined.' 
+                )
         return True

     def _find_env_conn_by_template_name(self, template_name):
@@ -290,7 +374,9 @@ def _init_test_project_dir(self, project_dir):
             if is_configured:
                 template_vars = set(re.findall(r'\$\{(.+?)\}', yaml))
                 for var in template_vars:
-                    yaml = yaml.replace(f'${{{var}}}', self._all_env_vars_to_dict().get(var))
+                    yaml = yaml.replace(
+                        f'${{{var}}}', self._all_env_vars_to_dict().get(var)
+                    )

                 # Write the template replaced YAML file
                 with open(yaml_path, 'w+', encoding='utf-8') as f_render:
@@ -314,40 +400,47 @@ def _run_command(args):

     def run_query_tap_postgres(self, query):
         """Run and SQL query in tap postgres database"""
-        return db.run_query_postgres(query,
-                                     host=self._get_conn_env_var('TAP_POSTGRES', 'HOST'),
-                                     port=self._get_conn_env_var('TAP_POSTGRES', 'PORT'),
-                                     user=self._get_conn_env_var('TAP_POSTGRES', 'USER'),
-                                     password=self._get_conn_env_var('TAP_POSTGRES', 'PASSWORD'),
-                                     database=self._get_conn_env_var('TAP_POSTGRES', 'DB'))
+        return db.run_query_postgres(
+            query,
+            host=self._get_conn_env_var('TAP_POSTGRES', 'HOST'),
+            port=self._get_conn_env_var('TAP_POSTGRES', 'PORT'),
+            user=self._get_conn_env_var('TAP_POSTGRES', 'USER'),
+            password=self._get_conn_env_var('TAP_POSTGRES', 'PASSWORD'),
+            database=self._get_conn_env_var('TAP_POSTGRES', 'DB'),
+        )

     def get_tap_mongodb_connection(self):
         """Create and returns tap mongodb database instance to run queries on"""
-        return db.get_mongodb_connection(host=self._get_conn_env_var('TAP_MONGODB', 'HOST'),
-                                         port=self._get_conn_env_var('TAP_MONGODB', 'PORT'),
-                                         user=self._get_conn_env_var('TAP_MONGODB', 'USER'),
-                                         password=self._get_conn_env_var('TAP_MONGODB', 'PASSWORD'),
-                                         database=self._get_conn_env_var('TAP_MONGODB', 'DB'),
-                                         auth_database=self._get_conn_env_var('TAP_MONGODB', 'AUTH_DB'),
-                                         )
+        return db.get_mongodb_connection(
+            host=self._get_conn_env_var('TAP_MONGODB', 'HOST'),
+            port=self._get_conn_env_var('TAP_MONGODB', 'PORT'),
+            user=self._get_conn_env_var('TAP_MONGODB', 'USER'),
+            password=self._get_conn_env_var('TAP_MONGODB', 'PASSWORD'),
+            database=self._get_conn_env_var('TAP_MONGODB', 'DB'),
+            auth_database=self._get_conn_env_var('TAP_MONGODB', 'AUTH_DB'),
+        )

     def run_query_target_postgres(self, query: object) -> object:
         """Run and SQL query in target postgres database"""
-        return db.run_query_postgres(query,
-                                     host=self._get_conn_env_var('TARGET_POSTGRES', 'HOST'),
-                                     port=self._get_conn_env_var('TARGET_POSTGRES', 'PORT'),
-                                     user=self._get_conn_env_var('TARGET_POSTGRES', 'USER'),
-                                     password=self._get_conn_env_var('TARGET_POSTGRES', 'PASSWORD'),
-                                     database=self._get_conn_env_var('TARGET_POSTGRES', 'DB'))
+        return db.run_query_postgres(
+            query,
+            host=self._get_conn_env_var('TARGET_POSTGRES', 'HOST'),
+            port=self._get_conn_env_var('TARGET_POSTGRES', 'PORT'),
+            user=self._get_conn_env_var('TARGET_POSTGRES', 'USER'),
+            password=self._get_conn_env_var('TARGET_POSTGRES', 'PASSWORD'),
+            database=self._get_conn_env_var('TARGET_POSTGRES', 'DB'),
+        )

     def run_query_target_redshift(self, query):
         """Run an SQL query in target redshift database"""
-        return db.run_query_redshift(query,
-                                     host=self._get_conn_env_var('TARGET_REDSHIFT', 'HOST'),
-                                     port=self._get_conn_env_var('TARGET_REDSHIFT', 'PORT'),
-                                     user=self._get_conn_env_var('TARGET_REDSHIFT', 'USER'),
-                                     password=self._get_conn_env_var('TARGET_REDSHIFT', 'PASSWORD'),
-                                     database=self._get_conn_env_var('TARGET_REDSHIFT', 'DBNAME'))
+        return db.run_query_redshift(
+            query,
+            host=self._get_conn_env_var('TARGET_REDSHIFT', 'HOST'),
+            port=self._get_conn_env_var('TARGET_REDSHIFT', 'PORT'),
+            user=self._get_conn_env_var('TARGET_REDSHIFT', 'USER'),
+            password=self._get_conn_env_var('TARGET_REDSHIFT', 'PASSWORD'),
+            database=self._get_conn_env_var('TARGET_REDSHIFT', 'DBNAME'),
+        )

     # pylint: disable=unnecessary-pass
     def run_query_tap_s3_csv(self, file):
@@ -357,31 +450,37 @@ def run_query_tap_s3_csv(self, file):

     def run_query_tap_mysql(self, query):
         """Run and SQL query in tap mysql database"""
-        return db.run_query_mysql(query,
-                                  host=self._get_conn_env_var('TAP_MYSQL', 'HOST'),
-                                  port=int(self._get_conn_env_var('TAP_MYSQL', 'PORT')),
-                                  user=self._get_conn_env_var('TAP_MYSQL', 'USER'),
-                                  password=self._get_conn_env_var('TAP_MYSQL', 'PASSWORD'),
-                                  database=self._get_conn_env_var('TAP_MYSQL', 'DB'))
+        return db.run_query_mysql(
+            query,
+            host=self._get_conn_env_var('TAP_MYSQL', 'HOST'),
+            port=int(self._get_conn_env_var('TAP_MYSQL', 'PORT')),
+            user=self._get_conn_env_var('TAP_MYSQL', 'USER'),
+            password=self._get_conn_env_var('TAP_MYSQL', 'PASSWORD'),
+            database=self._get_conn_env_var('TAP_MYSQL', 'DB'),
+        )

     def run_query_target_snowflake(self, query):
         """Run and SQL query in target snowflake database"""
-        return db.run_query_snowflake(query,
-                                      account=self._get_conn_env_var('TARGET_SNOWFLAKE', 'ACCOUNT'),
-                                      database=self._get_conn_env_var('TARGET_SNOWFLAKE', 'DBNAME'),
-                                      warehouse=self._get_conn_env_var('TARGET_SNOWFLAKE', 'WAREHOUSE'),
-                                      user=self._get_conn_env_var('TARGET_SNOWFLAKE', 'USER'),
-                                      password=self._get_conn_env_var('TARGET_SNOWFLAKE', 'PASSWORD'))
+        return db.run_query_snowflake(
+            query,
+            account=self._get_conn_env_var('TARGET_SNOWFLAKE', 'ACCOUNT'),
+            database=self._get_conn_env_var('TARGET_SNOWFLAKE', 'DBNAME'),
+            warehouse=self._get_conn_env_var('TARGET_SNOWFLAKE', 'WAREHOUSE'),
+            user=self._get_conn_env_var('TARGET_SNOWFLAKE', 'USER'),
+            password=self._get_conn_env_var('TARGET_SNOWFLAKE', 'PASSWORD'),
+        )

     def delete_dataset_target_bigquery(self, dataset):
         """Run and SQL query in target bigquery database"""
-        return db.delete_dataset_bigquery(dataset,
-                                          project=self._get_conn_env_var('TARGET_BIGQUERY', 'PROJECT'))
+        return db.delete_dataset_bigquery(
+            dataset, project=self._get_conn_env_var('TARGET_BIGQUERY', 'PROJECT')
+        )

     def run_query_target_bigquery(self, query):
         """Run and SQL query in target bigquery database"""
-        return db.run_query_bigquery(query,
-                                     project=self._get_conn_env_var('TARGET_BIGQUERY', 'PROJECT'))
+        return db.run_query_bigquery(
+            query, project=self._get_conn_env_var('TARGET_BIGQUERY', 'PROJECT')
+        )

     # -------------------------------------------------------------------------
     # Setup methods to initialise source and target databases and to make them
@@ -407,16 +506,23 @@ def setup_tap_mongodb(self):
         db_script = os.path.join(DIR, '..', '..', 'db', 'tap_mongodb.sh')
         self._run_command(db_script)

-
     def setup_tap_s3_csv(self):
         """Upload test input files to S3 to be prapared for test run"""
-        mock_data_1 = os.path.join(DIR, '..', 'test-project', 's3_mock_data', 'mock_data_1.csv')
-        mock_data_2 = os.path.join(DIR, '..', 'test-project', 's3_mock_data', 'mock_data_2.csv')
+        mock_data_1 = os.path.join(
+            DIR, '..', 'test-project', 's3_mock_data', 'mock_data_1.csv'
+        )
+        mock_data_2 = os.path.join(
+            DIR, '..', 'test-project', 's3_mock_data', 'mock_data_2.csv'
+        )

         bucket = self._get_conn_env_var('TAP_S3_CSV', 'BUCKET')
-        s3 = boto3.client('s3',
-                          aws_access_key_id=self._get_conn_env_var('TAP_S3_CSV', 'AWS_KEY'),
-                          aws_secret_access_key=self._get_conn_env_var('TAP_S3_CSV', 'AWS_SECRET_ACCESS_KEY'))
+        s3 = boto3.client(
+            's3',
+            aws_access_key_id=self._get_conn_env_var('TAP_S3_CSV', 'AWS_KEY'),
+            aws_secret_access_key=self._get_conn_env_var(
+                'TAP_S3_CSV', 'AWS_SECRET_ACCESS_KEY'
+            ),
+        )

         s3.upload_file(mock_data_1, bucket, 'ppw_e2e_tap_s3_csv/mock_data_1.csv')
         s3.upload_file(mock_data_2, bucket, 'ppw_e2e_tap_s3_csv/mock_data_2.csv')
@@ -424,28 +530,56 @@ def setup_tap_s3_csv(self):
     def setup_target_postgres(self):
         """Clean postgres target database and prepare for test run"""
         self.run_query_target_postgres('CREATE EXTENSION IF NOT EXISTS pgcrypto')
-        self.run_query_target_postgres('DROP SCHEMA IF EXISTS ppw_e2e_tap_postgres CASCADE')
-        self.run_query_target_postgres('DROP SCHEMA IF EXISTS ppw_e2e_tap_postgres_public2 CASCADE')
-        self.run_query_target_postgres('DROP SCHEMA IF EXISTS ppw_e2e_tap_postgres_logical1 CASCADE')
-        self.run_query_target_postgres('DROP SCHEMA IF EXISTS ppw_e2e_tap_postgres_logical2 CASCADE')
-        self.run_query_target_postgres('DROP SCHEMA IF EXISTS ppw_e2e_tap_mysql CASCADE')
-        self.run_query_target_postgres('DROP SCHEMA IF EXISTS ppw_e2e_tap_s3_csv CASCADE')
-        self.run_query_target_postgres('DROP SCHEMA IF EXISTS ppw_e2e_tap_mongodb CASCADE')
+        self.run_query_target_postgres(
+            'DROP SCHEMA IF EXISTS ppw_e2e_tap_postgres CASCADE'
+        )
+        self.run_query_target_postgres(
+            'DROP SCHEMA IF EXISTS ppw_e2e_tap_postgres_public2 CASCADE'
+        )
+        self.run_query_target_postgres(
+            'DROP SCHEMA IF EXISTS ppw_e2e_tap_postgres_logical1 CASCADE'
+        )
+        self.run_query_target_postgres(
+            'DROP SCHEMA IF EXISTS ppw_e2e_tap_postgres_logical2 CASCADE'
+        )
+        self.run_query_target_postgres(
+            'DROP SCHEMA IF EXISTS ppw_e2e_tap_mysql CASCADE'
+        )
+        self.run_query_target_postgres(
+            'DROP SCHEMA IF EXISTS ppw_e2e_tap_s3_csv CASCADE'
+        )
+        self.run_query_target_postgres(
+            'DROP SCHEMA IF EXISTS ppw_e2e_tap_mongodb CASCADE'
+        )

         # Clean config directory
         shutil.rmtree(os.path.join(CONFIG_DIR, 'postgres_dwh'), ignore_errors=True)

     def setup_target_redshift(self):
         """Clean redshift target database and prepare for test run"""
-        self.run_query_target_redshift('DROP SCHEMA IF EXISTS ppw_e2e_tap_postgres CASCADE')
-        self.run_query_target_redshift('DROP SCHEMA IF EXISTS ppw_e2e_tap_postgres_public2 CASCADE')
-        self.run_query_target_redshift('DROP SCHEMA IF EXISTS ppw_e2e_tap_postgres_logical1 CASCADE')
-        self.run_query_target_redshift('DROP SCHEMA IF EXISTS ppw_e2e_tap_postgres_logical2 CASCADE')
-        self.run_query_target_redshift('DROP SCHEMA IF EXISTS ppw_e2e_tap_mysql CASCADE')
-        self.run_query_target_redshift('DROP SCHEMA IF EXISTS ppw_e2e_tap_s3_csv CASCADE')
+        self.run_query_target_redshift(
+            'DROP SCHEMA IF EXISTS ppw_e2e_tap_postgres CASCADE'
+        )
+        self.run_query_target_redshift(
+            'DROP SCHEMA IF EXISTS ppw_e2e_tap_postgres_public2 CASCADE'
+        )
+        self.run_query_target_redshift(
+            'DROP SCHEMA IF EXISTS ppw_e2e_tap_postgres_logical1 CASCADE'
+        )
+        self.run_query_target_redshift(
+            'DROP SCHEMA IF EXISTS ppw_e2e_tap_postgres_logical2 CASCADE'
+        )
+        self.run_query_target_redshift(
+            'DROP SCHEMA IF EXISTS ppw_e2e_tap_mysql CASCADE'
+        )
+        self.run_query_target_redshift(
+            'DROP SCHEMA IF EXISTS ppw_e2e_tap_s3_csv CASCADE'
+        )
         self.run_query_target_redshift('DROP SCHEMA IF EXISTS ppw_e2e_helper CASCADE')
         self.run_query_target_redshift('CREATE SCHEMA ppw_e2e_helper')
-        self.run_query_target_redshift('CREATE TABLE ppw_e2e_helper.dual (dummy VARCHAR)')
+        self.run_query_target_redshift(
+            'CREATE TABLE ppw_e2e_helper.dual (dummy VARCHAR)'
+        )
         self.run_query_target_redshift('INSERT INTO ppw_e2e_helper.dual VALUES (\'X\')')

         # Clean config directory
@@ -453,13 +587,27 @@ def setup_target_redshift(self):

     def setup_target_snowflake(self):
         """Clean snowflake target database and prepare for test run"""
-        self.run_query_target_snowflake('DROP SCHEMA IF EXISTS ppw_e2e_tap_postgres CASCADE')
-        self.run_query_target_snowflake('DROP SCHEMA IF EXISTS ppw_e2e_tap_postgres_public2 CASCADE')
-        self.run_query_target_snowflake('DROP SCHEMA IF EXISTS ppw_e2e_tap_postgres_logical1 CASCADE')
-        self.run_query_target_snowflake('DROP SCHEMA IF EXISTS ppw_e2e_tap_postgres_logical2 CASCADE')
-        self.run_query_target_snowflake('DROP SCHEMA IF EXISTS ppw_e2e_tap_mysql CASCADE')
-        self.run_query_target_snowflake('DROP SCHEMA IF EXISTS ppw_e2e_tap_s3_csv CASCADE')
-        self.run_query_target_snowflake('DROP SCHEMA IF EXISTS ppw_e2e_tap_mongodb CASCADE')
+        self.run_query_target_snowflake(
+            'DROP SCHEMA IF EXISTS ppw_e2e_tap_postgres CASCADE'
+        )
+        self.run_query_target_snowflake(
+            'DROP SCHEMA IF EXISTS ppw_e2e_tap_postgres_public2 CASCADE'
+        )
+        self.run_query_target_snowflake(
+            'DROP SCHEMA IF EXISTS ppw_e2e_tap_postgres_logical1 CASCADE'
+        )
+        self.run_query_target_snowflake(
+            'DROP SCHEMA IF EXISTS ppw_e2e_tap_postgres_logical2 CASCADE'
+        )
+        self.run_query_target_snowflake(
+            'DROP SCHEMA IF EXISTS ppw_e2e_tap_mysql CASCADE'
+        )
+        self.run_query_target_snowflake(
+            'DROP SCHEMA IF EXISTS ppw_e2e_tap_s3_csv CASCADE'
+        )
+        self.run_query_target_snowflake(
+            'DROP SCHEMA IF EXISTS ppw_e2e_tap_mongodb CASCADE'
+        )

         # Clean config directory
         shutil.rmtree(os.path.join(CONFIG_DIR, 'snowflake'), ignore_errors=True)
diff --git a/tests/end_to_end/helpers/tasks.py b/tests/end_to_end/helpers/tasks.py
index 8717a92dd..9d8c968b3 100644
--- a/tests/end_to_end/helpers/tasks.py
+++ b/tests/end_to_end/helpers/tasks.py
@@ -5,7 +5,9 @@

 def run_command(command):
     """Run shell command and return returncode, stdout and stderr"""
-    with subprocess.Popen(shlex.split(command), stdout=subprocess.PIPE, stderr=subprocess.PIPE) as proc:
+    with subprocess.Popen(
+        shlex.split(command), stdout=subprocess.PIPE, stderr=subprocess.PIPE
+    ) as proc:
         proc_result = proc.communicate()
         return_code = proc.returncode
         stdout = proc_result[0].decode('utf-8')
@@ -16,12 +18,12 @@ def run_command(command):

 def find_run_tap_log_file(stdout, sync_engine=None):
     """Pipelinewise creates log file per running tap instances in a dynamically created directory:
-    ~/.pipelinewise///log
+    ~/.pipelinewise///log

-    Every log file matches the pattern:
-    --_