diff --git a/.circleci/config.yml b/.circleci/config.yml deleted file mode 100644 index b536a9737..000000000 --- a/.circleci/config.yml +++ /dev/null @@ -1,197 +0,0 @@ -version: 2 - -references: - - docker_k8s_deployer: &docker_k8s_deployer - working_directory: ~/pipelinewise - docker: - - image: docker.tw.ee/k8s-deployer:2 - - attach_workspace: &attach_workspace - attach_workspace: - at: ~/pipelinewise - - container_config: &container_config - working_directory: ~/pipelinewise - docker: - # Main python container - # Use circleci next-gen image - - image: cimg/python:3.7 - environment: - TAP_MYSQL_HOST: db_mysql_source - TAP_MYSQL_PORT: 3306 - TAP_MYSQL_ROOT_PASSWORD: test - TAP_MYSQL_USER: test - TAP_MYSQL_PASSWORD: test - TAP_MYSQL_DB: mysql_source_db - - TAP_POSTGRES_HOST: db_postgres_source - TAP_POSTGRES_PORT: 5432 - TAP_POSTGRES_USER: test - TAP_POSTGRES_PASSWORD: test - TAP_POSTGRES_DB: postgres_source_db - - TARGET_POSTGRES_HOST: db_postgres_dwh - TARGET_POSTGRES_PORT: 5432 - TARGET_POSTGRES_USER: test - TARGET_POSTGRES_PASSWORD: test - TARGET_POSTGRES_DB: postgres_dwh - - TAP_MONGODB_HOST: mongodb_source - TAP_MONGODB_PORT: 27017 - TAP_MONGODB_USER: mongoUser - TAP_MONGODB_PASSWORD: Password1 - TAP_MONGODB_DB: test - TAP_MONGODB_ROOT_USER: mongoUser - TAP_MONGODB_ROOT_PASSWORD: Password1 - - # PostgreSQL service container image used as test source database (for tap-postgres) - - image: debezium/postgres:12-alpine - name: db_postgres_source - # enable logical decoding - command: -c "wal_level=logical" -c "max_replication_slots=5" -c "max_wal_senders=5" - environment: - POSTGRES_USER: test - POSTGRES_PASSWORD: test - POSTGRES_DB: postgres_source_db - - # MariaDB service container image used as test source database (for tap-mysql) - - image: mariadb:10.2.26 - name: db_mysql_source - command: --default-authentication-plugin=mysql_native_password --log-bin=mysql-bin --binlog-format=ROW - environment: - MYSQL_ROOT_PASSWORD: test - MYSQL_USER: test - MYSQL_PASSWORD: test - MYSQL_DATABASE: mysql_source_db - - # PostgreSQL service container image used as test target (DWH) database (for target-postgres) - - image: postgres:11.4 - name: db_postgres_dwh - environment: - POSTGRES_USER: test - POSTGRES_PASSWORD: test - POSTGRES_DB: postgres_dwh - - - image: "mongo:4.2-bionic" - name: "mongodb_source" - environment: - MONGO_INITDB_ROOT_USERNAME: mongoUser - MONGO_INITDB_ROOT_PASSWORD: Password1 - MONGO_INITDB_DATABASE: test - command: [mongod, --replSet, rs0] - -jobs: - e2e_tests: - <<: *container_config - steps: - - checkout - - setup_remote_docker: - docker_layer_caching: true - - run: - when: - - name: 'Run E2E tests' - command: | - ./scripts/ci_check_no_file_changes.sh python && rc=$? || rc=$? - - if [[ $rc -eq 1 ]] - then - sudo apt-get update - sudo apt install mariadb-client postgresql-client mongo-tools mbuffer gettext-base - wget https://repo.mongodb.org/apt/ubuntu/dists/bionic/mongodb-org/4.2/multiverse/binary-amd64/mongodb-org-shell_4.2.7_amd64.deb - sudo dpkg -i ./mongodb-org-shell_4.2.7_amd64.deb && rm mongodb-org-shell_4.2.7_amd64.deb - ./dev-project/mongo/init_rs.sh - - ./tests/db/tap_mysql_db.sh - ./tests/db/tap_postgres_db.sh - ./tests/db/tap_mongodb.sh - ./tests/db/target_postgres.sh - - ./install.sh --acceptlicenses --connectors=all - - . .virtualenvs/pipelinewise/bin/activate - export PIPELINEWISE_HOME=$PWD - pytest tests/end_to_end -v - fi - no_output_timeout: 30m - - # For documentation deployment. 
You'll need the following environment vars - # in your Circle CI settings, otherwise this will not work. - # - # GH_NAME (your git username) - # GH_EMAIL (your git email) - # GH_TOKEN (the personal Git token with pushes enabled) - deploy-doc: - docker: - - image: cimg/python:3.7 - working_directory: ~/gh_doc_automation - steps: - - checkout - - run: - name: "Publish doc if it has changed" - command: | - ./scripts/ci_check_no_file_changes.sh doc && rc=$? || rc=$? - - if [[ $rc -eq 1 ]] - then - .circleci/publish_docs.sh - fi - - upload_docker_image: - <<: *docker_k8s_deployer - steps: - - checkout - - setup_remote_docker - - *attach_workspace - - run: - name: Push and release the new docker image to artifactory - command: k8s-deployment docker-build jfrog-cli-docker-push - environment: - DEPLOY_SLACK_CHANNEL: "#analytics-platform-builds" - DEPLOY_IMAGE_NAME: "pipelinewise" - DEPLOY_DOCKERFILE: "./Dockerfile" - DEPLOY_IMAGE_ADDITIONAL_TAGS: "latest" - - promote_docker_image: - <<: *docker_k8s_deployer - steps: - - checkout - - setup_remote_docker - - *attach_workspace - - run: - name: Promote built artifact for PipelineWise - command: k8s-deployment trigger-image-promotion - environment: - DEPLOY_IMAGE_NAME: "pipelinewise" - DEPLOY_SLACK_CHANNEL: "#analytics-platform-builds" - - -workflows: - version: 2 - - build: - jobs: - - e2e_tests - - - upload_docker_image: - context: kubernetes-staging - requires: - - e2e_tests - filters: - branches: - only: - - master - - - promote_docker_image: - context: promote-build - requires: - - upload_docker_image - filters: - branches: - only: - - master - - deploy_doc: - jobs: - - deploy-doc diff --git a/.dockerignore b/.dockerignore index 06acfc141..3276c3701 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,10 +1,29 @@ **/.git **/.virtualenvs +**/.github **/.circleci **/.pytest_cache **/__pycache__ *.egg-info *.egg/ -docs -dev-project *.rpm +**/venv +**/.venv +**/.coverage + +bin +dev-project +docs +scripts +tests +test-reports + +.coveragerc +.pre-commit-config.yaml +.gitignore +.style.yapf +.yapfignore +CONTRIBUTING.md +CHANGELOG.md +pylintrc +pytest.ini diff --git a/.github/workflows/connectors.yml b/.github/workflows/connectors.yml new file mode 100644 index 000000000..8ab11b045 --- /dev/null +++ b/.github/workflows/connectors.yml @@ -0,0 +1,34 @@ +# Workflow to check if all singer connectors are installable +name: Singer connectors + +on: + push: + branches: [master] + pull_request: + branches: [master] + + workflow_dispatch: + +concurrency: + group: singer-connectors-${{ github.head_ref }} + cancel-in-progress: true + +jobs: + check: + runs-on: ubuntu-20.04 + strategy: + matrix: + python-version: [3.7] + + steps: + - name: Checking out repo + uses: actions/checkout@v2 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + + - name: Check all connectors are installable + run: | + make all_connectors -e pw_acceptlicenses=y diff --git a/.github/workflows/dockerhub.yml b/.github/workflows/dockerhub.yml index c0dcfe989..1a43717d9 100644 --- a/.github/workflows/dockerhub.yml +++ b/.github/workflows/dockerhub.yml @@ -1,12 +1,13 @@ -name: Docker Image to DockerHub +name: Docker Images to DockerHub on: release: types: - published -jobs: + workflow_dispatch: +jobs: build: runs-on: ubuntu-latest @@ -15,7 +16,7 @@ jobs: uses: actions/checkout@v2 - name: Generate tag - uses: frabert/replace-string-action@master + uses: frabert/replace-string-action@v2.0 id: genTag with: 
pattern: '.*(\d+\.\d+\.\d+).*' @@ -28,14 +29,43 @@ jobs: username: ${{ secrets.DOCKERHUB_USERNAME }} password: ${{ secrets.DOCKERHUB_TOKEN }} - - name: Build and push - id: docker_build + - name: Build and push main image + id: docker_build_main uses: docker/build-push-action@v2 with: file: ./Dockerfile + context: . push: true - tags: transferwiseworkspace/pipelinewise:${{ steps.genTag.outputs.replaced }} + tags: | + transferwiseworkspace/pipelinewise:${{ steps.genTag.outputs.replaced }} + transferwiseworkspace/pipelinewise:latest - - name: Image digest - run: echo ${{ steps.docker_build.outputs.digest }} + - name: Build and push barebone image + id: docker_build_barebone + uses: docker/build-push-action@v2 + with: + file: ./Dockerfile.barebone + context: . + push: true + tags: | + transferwiseworkspace/pipelinewise-barebone:${{ steps.genTag.outputs.replaced }} + transferwiseworkspace/pipelinewise-barebone:latest + + - name: Build and push main image with only default connectors + id: docker_build_default + uses: docker/build-push-action@v2 + with: + file: ./Dockerfile + context: . + build_args: | + "connectors=default" + push: true + tags: | + transferwiseworkspace/pipelinewise:${{ steps.genTag.outputs.replaced }}-default + transferwiseworkspace/pipelinewise:latest-default + - name: Image digests + run: | + echo ${{ steps.docker_build_main.outputs.digest }} + echo ${{ steps.docker_build_barebone.outputs.digest }} + echo ${{ steps.docker_build_default.outputs.digest }} diff --git a/.github/workflows/linter.yml b/.github/workflows/linter.yml index c179e29c8..e28b9be18 100644 --- a/.github/workflows/linter.yml +++ b/.github/workflows/linter.yml @@ -8,13 +8,16 @@ on: workflow_dispatch: -jobs: - build: +concurrency: + group: linter-${{ github.head_ref }} + cancel-in-progress: true +jobs: + check: runs-on: ubuntu-20.04 strategy: matrix: - python-version: [3.6, 3.7, 3.8] + python-version: ["3.7", "3.8", "3.9"] steps: - name: Checking out repo @@ -28,22 +31,63 @@ jobs: continue-on-error: true run: ./scripts/ci_check_no_file_changes.sh python - - name: Set up Python ${{ matrix.container[1] }} - if: steps.check.outcome == 'failure' - uses: actions/setup-python@v2 + - name: Login to DockerHub + uses: docker/login-action@v1 with: - python-version: ${{ matrix.python-version }} + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} - - name: Install dependencies - if: steps.check.outcome == 'failure' + - name: Check whether or not docker image for CI exists remotely + id: docker-image-exists-remotely + continue-on-error: true + run: docker manifest inspect transferwiseworkspace/pipelinewise:ci-py${{ matrix.python-version }} > /dev/null + + - name: Check if requirements or Dockerfile.CI has changed + id: dockerfile-setup-changed + continue-on-error: true + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: python3 scripts/check_any_file_changed.py setup.py Dockerfile.CI + + - name: Rebuild docker image if required + if: steps.docker-image-exists-remotely.outcome == 'failure' || steps.dockerfile-setup-changed.outcome == 'success' run: | - pip install -U pip setuptools wheel - pip install -Ue .[test] + docker build \ + --build-arg PYTHON_VERSION=${{ matrix.python-version }} \ + -t transferwiseworkspace/pipelinewise:ci-py${{ matrix.python-version }} \ + -f Dockerfile.CI . 
+ + - name: Push docker image if rebuilt + if: steps.docker-image-exists-remotely.outcome == 'failure' || steps.dockerfile-setup-changed.outcome == 'success' + run: | + docker push transferwiseworkspace/pipelinewise:ci-py${{ matrix.python-version }} - name: Check code formatting if: steps.check.outcome == 'failure' - run: find pipelinewise tests -type f -name '*.py' | xargs unify --check-only + run: | + docker run --rm \ + -v $PWD:/app \ + transferwiseworkspace/pipelinewise:ci-py${{ matrix.python-version }} \ + bash -c "find pipelinewise tests -type f -name '*.py' | xargs unify --check-only" - name: Pylinting if: steps.check.outcome == 'failure' - run: pylint pipelinewise tests + run: | + docker run --rm -v $PWD:/app \ + transferwiseworkspace/pipelinewise:ci-py${{ matrix.python-version }} \ + bash -c "pylint pipelinewise tests" + + - name: Pep8 + if: steps.check.outcome == 'failure' + run: | + docker run --rm -v $PWD:/app \ + transferwiseworkspace/pipelinewise:ci-py${{ matrix.python-version }} \ + bash -c "flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics" + + - name: Pep8 complexity + if: steps.check.outcome == 'failure' + run: | + docker run --rm \ + -v $PWD:/app \ + transferwiseworkspace/pipelinewise:ci-py${{ matrix.python-version }} \ + bash -c "flake8 . --count --max-complexity=15 --max-line-length=120 --statistics" diff --git a/.github/workflows/pipelinewise_unit_tests.yml b/.github/workflows/pipelinewise_unit_tests.yml deleted file mode 100644 index 6eb33100b..000000000 --- a/.github/workflows/pipelinewise_unit_tests.yml +++ /dev/null @@ -1,47 +0,0 @@ -name: Unit Tests - -on: - push: - branches: [master] - pull_request: - branches: [master] - - workflow_dispatch: - -jobs: - build: - - runs-on: ubuntu-20.04 - strategy: - matrix: - python-version: [3.6, 3.7, 3.8] - - steps: - - name: Checking out repo - uses: actions/checkout@v2 - - - name: Check if python changes are present - id: check - env: - GITHUB_REPO: ${{ github.repository }} - PR_NUMBER: ${{ github.event.pull_request.number }} - continue-on-error: true - run: ./scripts/ci_check_no_file_changes.sh python - - - name: Set up Python ${{ matrix.container[1] }} - if: steps.check.outcome == 'failure' - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - - name: Install dependencies - if: steps.check.outcome == 'failure' - run: | - pip install -U pip setuptools wheel - pip install -Ue .[test] - - - name: Run Unit tests - if: steps.check.outcome == 'failure' - run: | - export PIPELINEWISE_HOME=$PWD - pytest --cov=pipelinewise --cov-fail-under=69 -v tests/units diff --git a/.github/workflows/publish_doc.yml b/.github/workflows/publish_doc.yml new file mode 100644 index 000000000..18507716b --- /dev/null +++ b/.github/workflows/publish_doc.yml @@ -0,0 +1,43 @@ +name: Publish documentation +on: + push: + branches: [master] + pull_request: + branches: [master] + +concurrency: + group: doc-${{ github.head_ref }} + cancel-in-progress: true + +jobs: + publish: + runs-on: ubuntu-latest + environment: github-pages + + steps: + - name: Checkout repository + uses: actions/checkout@v2 + with: + fetch-depth: 0 + path: gh_doc_automation + + - name: check structure + run: ls -l gh_doc_automation + + - name: Check if doc has changed + working-directory: gh_doc_automation + id: check + env: + GITHUB_REPO: ${{ github.repository }} + PR_NUMBER: ${{ github.event.pull_request.number }} + continue-on-error: true + run: ./scripts/ci_check_no_file_changes.sh doc + + - name: Publish doc + if: 
steps.check.outcome == 'failure' + working-directory: gh_doc_automation + env: + GH_NAME: ap-github # can be anything + GH_EMAIL: ap-github@wise.com # can be anything + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} # Comes from GH itself, available at runtime, not stored in secrets + run: ./scripts/publish_docs.sh diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml new file mode 100644 index 000000000..29b4cfdf8 --- /dev/null +++ b/.github/workflows/tests.yml @@ -0,0 +1,115 @@ +name: Tests + +on: + push: + branches: [master] + pull_request: + branches: [master] + + workflow_dispatch: + +concurrency: + group: tests-${{ github.head_ref }} + cancel-in-progress: true + +jobs: + unit_test: + runs-on: ubuntu-20.04 + strategy: + matrix: + python-version: ["3.7", "3.8", "3.9"] + + steps: + - name: Checking out repo + uses: actions/checkout@v2 + + - name: Check if python changes are present + id: check + env: + GITHUB_REPO: ${{ github.repository }} + PR_NUMBER: ${{ github.event.pull_request.number }} + continue-on-error: true + run: ./scripts/ci_check_no_file_changes.sh python + + - name: Set up Python ${{ matrix.python-version }} + if: steps.check.outcome == 'failure' + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependencies + if: steps.check.outcome == 'failure' + run: | + pip install -U pip setuptools wheel + pip install -Ue .[test] + + - name: Run Unit tests + if: steps.check.outcome == 'failure' + run: | + pytest --cov=pipelinewise --cov-fail-under=72 -v tests/units + + e2e_tests: + runs-on: ubuntu-20.04 + environment: ci_tests + + steps: + - name: Checking out repo + uses: actions/checkout@v2 + + - name: Check if python changes are present + id: check + env: + GITHUB_REPO: ${{ github.repository }} + PR_NUMBER: ${{ github.event.pull_request.number }} + continue-on-error: true + run: ./scripts/ci_check_no_file_changes.sh python + + - name: Setup test containers + if: steps.check.outcome == 'failure' + run: docker-compose -f dev-project/docker-compose.yml up -d + + - name: Wait for test containers to be ready + if: steps.check.outcome == 'failure' + run: | + until docker logs pipelinewise_dev | grep "PipelineWise Dev environment is ready" + do + echo 'Sleeping for 1min'; + sleep 60; + done + + - name: Run target end-to-end tests + if: steps.check.outcome == 'failure' + env: + TARGET_SNOWFLAKE_ACCOUNT: ${{ secrets.TARGET_SNOWFLAKE_ACCOUNT }} + TARGET_SNOWFLAKE_AWS_ACCESS_KEY: ${{ secrets.TARGET_SNOWFLAKE_AWS_ACCESS_KEY }} + TARGET_SNOWFLAKE_AWS_SECRET_ACCESS_KEY: ${{ secrets.TARGET_SNOWFLAKE_AWS_SECRET_ACCESS_KEY }} + TARGET_SNOWFLAKE_DBNAME: ${{ secrets.TARGET_SNOWFLAKE_DBNAME }} + TARGET_SNOWFLAKE_FILE_FORMAT: ${{ secrets.TARGET_SNOWFLAKE_FILE_FORMAT }} + TARGET_SNOWFLAKE_PASSWORD: ${{ secrets.TARGET_SNOWFLAKE_PASSWORD }} + TARGET_SNOWFLAKE_S3_BUCKET: ${{ secrets.TARGET_SNOWFLAKE_S3_BUCKET }} + TARGET_SNOWFLAKE_S3_KEY_PREFIX: ${{ secrets.TARGET_SNOWFLAKE_S3_KEY_PREFIX }} + TARGET_SNOWFLAKE_SCHEMA: ${{ secrets.TARGET_SNOWFLAKE_SCHEMA }} + TARGET_SNOWFLAKE_STAGE: ${{ secrets.TARGET_SNOWFLAKE_STAGE }} + TARGET_SNOWFLAKE_USER: ${{ secrets.TARGET_SNOWFLAKE_USER }} + TARGET_SNOWFLAKE_WAREHOUSE: ${{ secrets.TARGET_SNOWFLAKE_WAREHOUSE }} + TAP_S3_CSV_AWS_KEY: ${{ secrets.TAP_S3_CSV_AWS_KEY }} + TAP_S3_CSV_AWS_SECRET_ACCESS_KEY: ${{ secrets.TAP_S3_CSV_AWS_SECRET_ACCESS_KEY }} + TAP_S3_CSV_BUCKET: ${{ secrets.TAP_S3_CSV_BUCKET }} + run: | + docker exec -t \ + -e TAP_S3_CSV_AWS_KEY=$TAP_S3_CSV_AWS_KEY \ + -e 
TAP_S3_CSV_AWS_SECRET_ACCESS_KEY=$TAP_S3_CSV_AWS_SECRET_ACCESS_KEY \ + -e TAP_S3_CSV_BUCKET=$TAP_S3_CSV_BUCKET \ + -e TARGET_SNOWFLAKE_ACCOUNT=$TARGET_SNOWFLAKE_ACCOUNT \ + -e TARGET_SNOWFLAKE_AWS_ACCESS_KEY=$TARGET_SNOWFLAKE_AWS_ACCESS_KEY \ + -e TARGET_SNOWFLAKE_AWS_SECRET_ACCESS_KEY=$TARGET_SNOWFLAKE_AWS_SECRET_ACCESS_KEY \ + -e TARGET_SNOWFLAKE_DBNAME=$TARGET_SNOWFLAKE_DBNAME \ + -e TARGET_SNOWFLAKE_FILE_FORMAT=$TARGET_SNOWFLAKE_FILE_FORMAT \ + -e TARGET_SNOWFLAKE_PASSWORD=$TARGET_SNOWFLAKE_PASSWORD \ + -e TARGET_SNOWFLAKE_S3_BUCKET=$TARGET_SNOWFLAKE_S3_BUCKET \ + -e TARGET_SNOWFLAKE_S3_KEY_PREFIX=$TARGET_SNOWFLAKE_S3_KEY_PREFIX \ + -e TARGET_SNOWFLAKE_SCHEMA=$TARGET_SNOWFLAKE_SCHEMA \ + -e TARGET_SNOWFLAKE_STAGE=$TARGET_SNOWFLAKE_STAGE \ + -e TARGET_SNOWFLAKE_USER=$TARGET_SNOWFLAKE_USER \ + -e TARGET_SNOWFLAKE_WAREHOUSE=$TARGET_SNOWFLAKE_WAREHOUSE \ + pipelinewise_dev pytest tests/end_to_end -vx diff --git a/CHANGELOG.md b/CHANGELOG.md index 55efc2e5b..eb0a3c2e0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,122 @@ +0.41.0 (2022-02-10) +------------------- + +- Dropped support for python 3.6 +- Bump `ujson` from `4.3.0` to `5.1.0` +- Bump `pipelinewise-tap-s3-csv` to `2.0.0` +- Fix for config json files +- Fix: e2e tests fail when SF credentials are not present + +0.40.0 (2022-01-27) +------------------- +- Bump `pipelinewise-tap-kafka` from `5.0.1` to `5.1.0` + +0.39.1 (2022-01-26) +------------------- +- Bump `pipelinewise-tap-kafka` from `5.0.0` to `5.0.1` + +0.39.0 (2022-01-25) +------------------- +- Bump `pipelinewise-tap-kafka` from `4.0.1` to `5.0.0` +- Bump `pipelinewise-target-bigquery` from `1.1.1` to `1.2.0` +- Bump `pipelinewise-transform-field` from `2.2.0` to `2.3.0` +- Prevent usage of extended transformation feature when FastSync exists +- Fixed fastsync from postgres to bigquery +- Fixed an issue when `SplitGzipFile` doesn't work with binary mode + +0.38.0 (2022-01-14) +------------------- +- MySQL tap now connects to replica instance during fastsync if credentials are provided +- Added fastsync support for MongoDB Atlas +- Docker base image to Python 3.8 +- Bump `pyyaml` from `5.4.1` to `6.0` +- Bump `pipelinewise-target-snowflake` from `1.14.1` to `1.15.0` +- Bump `pipelinewise-tap-s3-csv` from `1.2.2` to `1.2.3` +- Bump `pipelinewise-tap-postgres` from `1.8.1` to `1.8.2` + +0.37.2 (2021-12-10) +------------------- +- Bump `pipelinewise-tap-github` from `1.0.2` to `1.0.3` + + +0.37.1 (2021-12-10) +------------------- +- Make a postfix for Snowflake schemas in end-to-end tests. +- Bump `google-cloud-bigquery` from `1.24.0` to `2.31.0` ([Changelog](https://github.com/googleapis/python-bigquery/blob/main/CHANGELOG.md#2310-2021-11-24)) + + +0.37.0 (2021-11-19) +------------------- + +*New* +- Added cleanup method for state file. +- Bump `pytest-cov` from `2.12.1` to `3.0.0` ([Changelog](https://github.com/pytest-dev/pytest-cov/blob/master/CHANGELOG.rst#300-2021-10-04)) +- Bump `joblib` from `1.0.0` to `1.1.0` +- Bump `flake8` from `3.9.2` to `4.0.1` +- Bump `jinja2` from `3.0.1` to `3.0.2` +- Bump `python-dotenv` from `0.19.0` to `0.19.1` +- Bump `target-snowflake` from `1.14.0` to `1.14.1` +- Bump `ansible` from `4.4.0` to `4.7.0` +- Bump `pytest` from `6.2.4` to `6.2.5` + +*Changes* +- Fully migrate CI to Github Actions. +- Update `ujson` requirement from `==4.1.*` to `>=4.1,<4.3` +- Update `tzlocal` requirement from `<2.2,>=2.0` to `>=2.0,<4.1` + +*Fixes* +- Make process in docker-compose file. +- proc.info parsing in a case cmdline is None! 
+ + +0.36.0 (2021-09-30) +------------------- + +*New* +- Add new transformation type: **MASK-STRING-SKIP-ENDS** +- Bump `pipelinewise-target-snowflake` from `1.13.1` to `1.14.0` ([Changelog](https://github.com/transferwise/pipelinewise-target-snowflake/blob/master/CHANGELOG.md#1140-2021-09-30)) + - Support `date` property format + - Don't log record on failure to avoind exposing data + +*Changes* +- Use Makefile for installation +- Enforce PEP8 + +*Fixes* +- Dates out of range (with year > 9999) in FastSync from PG. +- Bump `pipelinewise-tap-postgres` from `1.8.0` to `1.8.1` ([Changelog](https://github.com/transferwise/pipelinewise-tap-postgres/blob/master/CHANGELOG.md#181-2021-09-23)) + - LOG_BASED: Handle dates with year > 9999. + - INCREMENTAL & FULL_TABLE: Avoid processing timestamps arrays as timestamp + +- `Decimal` not JSON serializable in FastSync MongoDB +- Don't use non-existent FastSync for MongoDB-Redshift pipelines. + + +0.35.2 (2021-08-17) +------------------- +- Bump `pipelinewise-tap-github` from `1.0.1` to `1.0.2` +- Update a few vulnerable or outdated dependencies to latest + +0.35.1 (2021-08-13) +------------------- +- Bump `pipelinewise-tap-github` from `1.0.0` to `1.0.1` +- Bump `pipelinewise-tap-kafka` from `4.0.0` to `4.0.1` +- Bump `tap-jira` from `2.0.0` to `2.0.1` +- Bump `pipelinewise-target-s3-csv` from `1.4.0` to `1.5.0` + +0.35.0 (2021-08-04) +------------------- +- Support `"none"` as a value for `--connectors` in `install.sh` script to install a stripped down Pipelinewise without any connectors. +- Optimize Dockerfile +- Do not log invalid json objects if they fail validation against json schema. +- Replace `github-tap` with fork `pipelinewise-tap-github` version `1.0.0` +- Add schema validation for github tap +- Increase batch_size_rows from 1M to 5M +- Increase split_file_chunk_size_mb from 2500 to 5000 +- Add latest tag to docker image +- Bump `pipelinewise-tap-s3-csv` from `1.2.1` to `1.2.2` +- Update pymongo requirement from `<3.12,>=3.10` to `>=3.10,<3.13` + 0.34.1 (2021-07-15) ------------------- - Bump `pipelinewise-target-snowflake` from `1.13.0` to `1.13.1` diff --git a/Dockerfile b/Dockerfile index 49c126cd5..6b595b1a8 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,22 +1,43 @@ -FROM python:3.7-slim-buster +FROM python:3.8-slim-buster -RUN apt-get -qq update && apt-get -qqy install \ +ARG connectors=all + +RUN apt-get -qq update \ + && apt-get -qqy --no-install-recommends install \ apt-utils \ alien \ + gnupg \ libaio1 \ - mongo-tools \ mbuffer \ wget \ - && pip install --upgrade pip + && rm -rf /var/lib/apt/lists/* \ + && pip install -U --no-cache-dir pip -ARG connectors=all -COPY . 
/app +# Add Mongodb ppa +RUN wget -qO - https://www.mongodb.org/static/pgp/server-4.4.asc | apt-key add - \ + && echo "deb [ arch=amd64 ] https://repo.mongodb.org/apt/ubuntu bionic/mongodb-org/4.4 multiverse" | tee /etc/apt/sources.list.d/mongodb.list \ + && apt-get -qq update \ + && apt-get -qqy --no-install-recommends install \ + mongodb-database-tools \ + && rm -rf /var/lib/apt/lists/* -# Install Oracle Instant Client for tap-oracle if its in the connectors list -RUN bash -c "if grep -q \"tap-oracle\" <<< \"$connectors\"; then wget https://download.oracle.com/otn_software/linux/instantclient/193000/oracle-instantclient19.3-basiclite-19.3.0.0.0-1.x86_64.rpm -O /app/oracle-instantclient.rpm && alien -i /app/oracle-instantclient.rpm --scripts && rm -rf /app/oracle-instantclient.rpm ; fi" +COPY singer-connectors/ /app/singer-connectors/ +COPY Makefile /app + +RUN echo "setup connectors" \ + # Install Oracle Instant Client for tap-oracle if its in the connectors list + && bash -c "if grep -q \"tap-oracle\" <<< \"$connectors\"; then wget https://download.oracle.com/otn_software/linux/instantclient/193000/oracle-instantclient19.3-basiclite-19.3.0.0.0-1.x86_64.rpm -O /app/oracle-instantclient.rpm && alien -i /app/oracle-instantclient.rpm --scripts && rm -rf /app/oracle-instantclient.rpm ; fi" \ + && cd /app \ + && if [ "$connectors" = "all" ]; then make all_connectors -e pw_acceptlicenses=y; fi\ + && if [ "$connectors" = "default" ]; then make default_connectors -e pw_acceptlicenses=y; fi\ + && if [ "$connectors" = "extra" ]; then make extra_connectors -e pw_acceptlicenses=y; fi\ + && if [ "$connectors" != "all" ] && [ "$connectors" != "extra" ] && [ "$connectors" != "default" ] && [ "$connectors" != "none" ] && [ ! -z $connectors ]; then make connectors -e pw_connector=$connectors -e pw_acceptlicenses=y; fi + +COPY . /app -RUN cd /app \ - && ./install.sh --connectors=$connectors --acceptlicenses --nousage --notestextras \ +RUN echo "setup pipelinewise" \ + && cd /app \ + && make pipelinewise_no_test_extras -e pw_acceptlicenses=y\ && ln -s /root/.pipelinewise /app/.pipelinewise ENTRYPOINT ["/app/entrypoint.sh"] diff --git a/Dockerfile.CI b/Dockerfile.CI new file mode 100644 index 000000000..4c05ea3c4 --- /dev/null +++ b/Dockerfile.CI @@ -0,0 +1,12 @@ +ARG PYTHON_VERSION + +FROM python:${PYTHON_VERSION}-bullseye + +WORKDIR /app + +COPY ./setup.py ./README.md ./ + +RUN pip install -U pip setuptools wheel && \ + pip install -Ue .[test] + +VOLUME /app diff --git a/Dockerfile.barebone b/Dockerfile.barebone new file mode 100644 index 000000000..af48710a4 --- /dev/null +++ b/Dockerfile.barebone @@ -0,0 +1,20 @@ +FROM python:3.8-slim-buster + +RUN apt-get -qq update \ + && apt-get -qqy --no-install-recommends install \ + apt-utils \ + alien \ + gnupg \ + libaio1 \ + wget \ + && rm -rf /var/lib/apt/lists/* \ + && pip install -U --no-cache-dir pip + +COPY . 
/app + +RUN echo "setup pipelinewise" \ + && cd /app \ + && make pipelinewise_no_test_extras -e pw_acceptlicenses=y \ + && ln -s /root/.pipelinewise /app/.pipelinewise + +ENTRYPOINT ["/app/entrypoint.sh"] diff --git a/Makefile b/Makefile new file mode 100644 index 000000000..fc11f4571 --- /dev/null +++ b/Makefile @@ -0,0 +1,300 @@ +SHELL = bash +OK_MSG = \x1b[32m ✔\x1b[0m +FAIL_MSG = \x1b[31m ✖\x1b[0m +YELLOW = \x1b[33m +BLUE = \x1b[36m +RED = \x1b[31m +RESET_COLOR = \x1b[0m +PIPELINEWISE_HOME = $(shell pwd) +VENV_DIR = ${PIPELINEWISE_HOME}/.virtualenvs + +python ?= "python3" + +start_time:=$(shell date +%s) + +PIP_ARGS="[test]" + +pw_connector= + +define DEFAULT_CONNECTORS +tap-jira\ +tap-kafka\ +tap-mysql\ +tap-postgres\ +tap-s3-csv\ +tap-salesforce\ +tap-snowflake\ +tap-zendesk\ +tap-mongodb\ +tap-github\ +tap-slack\ +tap-mixpanel\ +tap-twilio\ +target-s3-csv\ +target-snowflake\ +target-redshift\ +target-postgres\ +target-bigquery\ +transform-field +endef + +define EXTRA_CONNECTORS +tap-adwords\ +tap-oracle\ +tap-zuora\ +tap-google-analytics\ +tap-shopify +endef + +define print_installed_connectors + @echo + @echo "--------------------------------------------------------------------------" + @echo "Installed components:" + @echo "--------------------------------------------------------------------------" + @echo + @echo "Component Version" + @echo "-------------------- -------" + @for i in $(shell ls $(VENV_DIR)); do\ + VERSION=`$(VENV_DIR)/$$i/bin/python3 -m pip list | grep "$$i[[:space:]]" | awk '{print $$2}'`;\ + printf "%-20s %s\n" $$i "$$VERSION";\ + done; + @echo "-------------------- -------" +endef + +define print_execute_time + $(eval end_time:=`date +%s`) + @echo + @echo "--------------------------------------------------------------------------" + @echo "$(1) installed successfully in $$(( $(end_time) - $(start_time) )) seconds" + @echo "--------------------------------------------------------------------------" +endef + +define clean_connectors + echo -n "Cleaning previous installations in $(VENV_DIR)/$(1)..." + rm -rf $(VENV_DIR)/$(1) + @echo -e "$(OK_MSG)" +endef + +define check_license + @echo "Checking license..." + @echo -e "$(YELLOW)" + @$(VENV_DIR)/$(1)/bin/python3 -m pip install pip-licenses==3.5.3 + @echo -e "$(RESET_COLOR)" + $(eval PKG_NAME:=`$(VENV_DIR)/$(1)/bin/pip-licenses|grep "$(1)[[:space:]]"| awk '{print $$$$1}'`) + $(eval PKG_VERSION:=`$(VENV_DIR)/$(1)/bin/pip-licenses | grep "$(1)[[:space:]]" | awk '{print $$$$2}'`) + $(eval PKG_LICENSE:=`$(VENV_DIR)/$(1)/bin/pip-licenses --from mixed | grep "$(1)[[:space:]]" | awk '{for (i=1; i<=NF-2; i++) $$$$i = $$$$(i+2); NF-=2; print}'`) + + $(eval MAIN_LICENSE:="Apache Software License") + + @if [[ "$(PKG_LICENSE)" != $(MAIN_LICENSE) && "$(PKG_LICENSE)" != "UNKNOWN" ]]; then\ + echo -e "$(RED)";\ + echo;\ + echo " | $(PKG_NAME) ($(PKG_VERSION)) is licensed under $(PKG_LICENSE)";\ + echo " |";\ + echo " | WARNING. The license of this connector is different than the default PipelineWise license ($(MAIN_LICENSE)).";\ + if [[ "$(ACCEPT_LICENSES)" != "YES" ]]; then\ + echo " | You need to accept the connector's license agreement to proceed.";\ + echo " |";\ + read -r -p " | Do you accept the [$(PKG_LICENSE)] license agreement of $(PKG_NAME) connector? [y/N] " response;\ + if [[ $$response != "y" && $$response != "Y" ]]; then\ + echo;\ + echo -e "$(RESET_COLOR)";\ + echo "EXIT. 
License agreement not accepted!";\ + exit 1;\ + fi;\ + else\ + echo " | You automatically accepted this license agreement by running this script with acceptlicenses=YES option.";\ + fi;\ + echo;\ + fi + @echo -e "$(RESET_COLOR)" + @echo -n "License accepted..." + @echo -e "$(OK_MSG)" +endef + +define make_virtualenv + @echo -n "Making Virtual Environment for $(1) in $(VENV_DIR)..." + @echo -e -n "$(YELLOW)" + @test -d $(VENV_DIR)/$(1) || $(python) -m venv $(VENV_DIR)/$(1) + @source $(VENV_DIR)/$(1)/bin/activate + @echo -e "$(OK_MSG)" + @echo -e -n "$(YELLOW)" + @$(VENV_DIR)/$(1)/bin/python3 -m pip install --upgrade pip setuptools wheel + @echo -e "$(RESET_COLOR)" + @echo -n "Python setup tools updated..." + @echo -e "$(OK_MSG)" + @echo -e -n "$(YELLOW)" + @test ! -s $(2)pre_requirements.txt ||\ + ($(VENV_DIR)/$(1)/bin/pip install --upgrade -r $(2)pre_requirements.txt\ + && echo -e "$(RESET_COLOR)"\ + && echo -n "Pre requirements installed..."\ + && echo -e "$(OK_MSG)") + @echo -e -n "$(YELLOW)" + @test ! -s $(2)requirements.txt ||\ + ($(VENV_DIR)/$(1)/bin/pip install --upgrade -r $(2)requirements.txt\ + && echo -e "$(RESET_COLOR)"\ + && echo -n "Requirements installed..."\ + && echo -e "$(OK_MSG)") + @echo -e -n "$(RESET_COLOR)" + @test ! -s $(2)setup.py ||\ + (echo "Installing the package..."\ + && echo -e "$(YELLOW)"\ + && $(VENV_DIR)/$(1)/bin/pip install --upgrade -e .$(PIP_ARGS)\ + && echo -e "$(RESET_COLOR)"\ + && echo -n "Package installation completed..."\ + && echo -e "$(OK_MSG)") + @echo -e "$(RESET_COLOR)" + $(call check_license,$(1)) +endef + + +define install_connectors + echo + echo "--------------------------------------------------------------------------" + echo "Installing $1 connector..." + echo "--------------------------------------------------------------------------" + if [[ ! -d singer-connectors/$1 ]]; then\ + echo "ERROR: Directory not exists and does not look like a valid singer connector: singer-connectors: singer-connectors/$1";\ + exit 1;\ + fi + $(call make_virtualenv,$1,singer-connectors/$1/) +endef + +define print_list_of_connectors + echo " $1" +endef + +help: .check_gettext .pw_logo + @echo + @echo " Targets" + @echo " =======" + @echo " pipelinewise Install the main PipelineWise component" + @echo " pipelinewise_no_test_extras Install the main Pipelinewise component without test extras" + @echo + @echo " all_connectors Install all connectors" + @echo " default_connectors Install default connectors" + @echo " extra_connectors Install only extra connectors" + @echo " connectors -e pw_connector=connector1,connector2,... Install specific connector(s)" + @echo + @echo " list_installed_components Show a list of installed components" + @echo " list_default_connectors Show a list of available default connectors" + @echo " list_extra_connectors Show a list of available extra connectors" + @echo + @echo " clean_all Clean all installed components" + @echo " clean -e pw_connector=connector1,connector2,... Clean a specific connector(s)" + @echo + @echo " Options" + @echo " =======" + @echo " -e pw_connector=connector1,connector2,... 
Define a list of connectors for installing or cleaning" + @echo " -e pw_acceptlicenses=y/Y/Yes/YES Forcing to accept the licenses automatically" + @echo + @echo " To start CLI" + @echo " ============" + @echo " $$ source $(VENV_DIR)/pipelinewise/bin/activate" + @echo " $$ export PIPELINEWISE_HOME=$(PIPELINEWISE_HOME)" + @echo " $$ pipelinewise status" + @echo + @echo "--------------------------------------------------------------------------" + + +pipelinewise: .check_gettext .pw_logo + $(call make_virtualenv,pipelinewise) + $(call print_execute_time,PipelineWise) + +pipelinewise_no_test_extras: .set_pip_args pipelinewise + +clean_all: + @echo -n "Cleaning previous installations in $(VENV_DIR)..." + @rm -rf $(VENV_DIR) + @echo -e "$(OK_MSG)" + +clean: +ifeq ($(pw_connector),) + @echo "use -e pw_connector=connector1,connector2,...." + @exit 1 +endif + $(eval space:= ) + $(eval space+= ) + $(eval comma:=,) + $(eval connectors_list:=$(subst $(comma),$(space),$(pw_connector))) + + @$(foreach var,$(connectors_list), $(call clean_connectors,$(var));) + +connectors: .check_license_env_var +ifeq ($(pw_connector),) + @echo "use -e pw_connector=connector1,connector2,...." + @exit 1 +endif + $(eval space:= ) + $(eval space+= ) + $(eval comma:=,) + $(eval connectors_list:=$(subst $(comma),$(space),$(pw_connector))) + + @$(foreach var,$(connectors_list), $(call install_connectors,$(var));) + $(call print_execute_time,Connectors) + + +all_connectors: default_connectors extra_connectors + @echo "Install all connectors..." + $(call print_execute_time,All connectors) + +default_connectors: .check_license_env_var + @echo "Installing default connectors..." + @$(foreach var,$(DEFAULT_CONNECTORS), $(call install_connectors,$(var));) + $(call print_execute_time,Default connectors) + +extra_connectors: .check_license_env_var + @echo "Installing extra connectors..." + @$(foreach var,$(EXTRA_CONNECTORS), $(call install_connectors,$(var));) + $(call print_execute_time,Extra connectors) + + +list_installed_components: + $(call print_installed_connectors) + +list_default_connectors: + @echo + @echo " ============================" + @echo " Available Default Connectors" + @echo " ============================" + @$(foreach var,$(DEFAULT_CONNECTORS), $(call print_list_of_connectors,$(var));) + @echo " ----------------------------" + +list_extra_connectors: + @echo + @echo " ============================" + @echo " Available Extra Connectors" + @echo " ============================" + @$(foreach var,$(EXTRA_CONNECTORS), $(call print_list_of_connectors,$(var));) + @echo " ----------------------------" + +.pw_logo: + @echo -e "$(BLUE)" + @(CURRENT_YEAR=$(shell date +"%Y") envsubst < motd) + @echo -e "$(RESET_COLOR)" + +.check_license_env_var: + $(eval ACCEPT_LICENSES:=NO) +ifeq ($(pw_acceptlicenses),y) + $(eval ACCEPT_LICENSES:=YES) +endif +ifeq ($(pw_acceptlicenses),Y) + $(eval ACCEPT_LICENSES:=YES) +endif +ifeq ($(pw_acceptlicenses),Yes) + $(eval ACCEPT_LICENSES:=YES) +endif +ifeq ($(pw_acceptlicenses),YES) + $(eval ACCEPT_LICENSES:=YES) +endif + +.check_gettext: + @echo -n "Checking gettext..." + @if ! ENVSUBST_LOC="$$(type -p "envsubst")" || [[ -z ENVSUBST_LOC ]]; then\ + echo -e "$(FAIL_MSG)" &&\ + echo "envsubst not found but it is required to run this script. 
Try to install gettext or gettext-base package" && exit 1;\ + fi + @echo -e "$(OK_MSG)" + +.set_pip_args: + $(eval PIP_ARGS:="") diff --git a/README.md b/README.md index 6394a421e..da3761343 100644 --- a/README.md +++ b/README.md @@ -55,7 +55,7 @@ consumes data from taps and do something with it, like load it into a file, API | Tap | **[Google Analytics](https://github.com/transferwise/pipelinewise-tap-google-analytics)** | Extra | [![PyPI version](https://badge.fury.io/py/pipelinewise-tap-google-analytics.svg)](https://badge.fury.io/py/tap-adwords) | Extracts data from Google Analytics | | Tap | **[Oracle](https://github.com/transferwise/pipelinewise-tap-oracle)** | Extra | [![PyPI version](https://badge.fury.io/py/pipelinewise-tap-oracle.svg)](https://badge.fury.io/py/pipelinewise-tap-oracle) | Extracts data from Oracle databases. Supporting Log-Based, Key-Based Incremental and Full Table replications | | Tap | **[Zuora](https://github.com/transferwise/pipelinewise-tap-zuora)** | Extra | [![PyPI version](https://badge.fury.io/py/pipelinewise-tap-zuora.svg)](https://badge.fury.io/py/pipelinewise-tap-zuora) | Extracts data from Zuora database using AQAA and REST extraction API with Key-Based incremental replications | -| Tap | **[GitHub](https://github.com/singer-io/tap-github)** | | [![PyPI version](https://badge.fury.io/py/tap-github.svg)](https://badge.fury.io/py/tap-github) | Extracts data from GitHub API using Personal Access Token and Key-Based incremental replications | +| Tap | **[GitHub](https://github.com/transferwise/pipelinewise-tap-github)** | | [![PyPI version](https://badge.fury.io/py/pipelinewise-tap-github.svg)](https://badge.fury.io/py/pipelinewise-tap-github) | Extracts data from GitHub API using Personal Access Token and Key-Based incremental replications | | Tap | **[Shopify](https://github.com/singer-io/tap-shopify)** | Extra | [![PyPI version](https://badge.fury.io/py/tap-shopify.svg)](https://badge.fury.io/py/tap-shopify) | Extracts data from Shopify API using Personal App API Password and date based incremental replications | | Tap | **[Slack](https://github.com/transferwise/pipelinewise-tap-slack)** | | [![PyPI version](https://badge.fury.io/py/pipelinewise-tap-slack.svg)](https://badge.fury.io/py/pipelinewise-tap-slack) | Extracts data from a Slack API using Bot User Token and Key-Based incremental replications | | Tap | **[Mixpanel](https://github.com/transferwise/pipelinewise-tap-mixpanel)** | | [![PyPI version](https://badge.fury.io/py/pipelinewise-tap-mixpanel.svg)](https://badge.fury.io/py/pipelinewise-tap-mixpanel) | Extracts data from the Mixpanel API. | @@ -113,13 +113,21 @@ You can run any pipelinewise command at this point. Tutorials to create and run * mongo-tools * mbuffer -2. Run the install script that installs the PipelineWise CLI and all supported singer connectors into separate virtual environments: +2. Run the Makefile that installs the PipelineWise CLI and all supported singer connectors into separate virtual environments: - ```sh - $ ./install.sh --connectors=all + ```shell + $ make pipelinewise all_connectors + ``` + Press `Y` to accept the license agreement of the required singer components. To automate the installation and accept every license agreement run: + ```shell + $ make pipelinewise all_connectors -e pw_acceptlicenses=y ``` - Press `Y` to accept the license agreement of the required singer components. 
To automate the installation and accept every license agreement run `./install --acceptlicenses` - Use the optional `--connectors=...,...` argument to install only a specific list of singer connectors. + And to install only a specific list of singer connectors: + ```shell + $ make connectors -e pw_connector=, + ``` + + Run `make` or `make -h` to see the help for Makefile and all options. 3. To start the CLI you need to activate the CLI virtual environment and set `PIPELINEWISE_HOME` environment variable: @@ -127,7 +135,7 @@ You can run any pipelinewise command at this point. Tutorials to create and run $ source {ACTUAL_ABSOLUTE_PATH}/.virtualenvs/pipelinewise/bin/activate $ export PIPELINEWISE_HOME={ACTUAL_ABSOLUTE_PATH} ``` - (The `ACTUAL_ABSOLUTE_PATH` differs on every system, the install script prints the correct commands once the installation completes) + (The `ACTUAL_ABSOLUTE_PATH` differs on every system, running `make -h` prints the correct commands for CLI) 4. Check if the installation was successful by running the `pipelinewise status` command: diff --git a/dev-project/.env b/dev-project/.env index c2856e7dc..670d647bf 100644 --- a/dev-project/.env +++ b/dev-project/.env @@ -21,6 +21,17 @@ TAP_MYSQL_USER=pipelinewise TAP_MYSQL_PASSWORD=secret TAP_MYSQL_DB=mysql_source_db +# ------------------------------------------------------------------------------ +# Test MySQL database credentials used as test source replica database +# ------------------------------------------------------------------------------ + +TAP_MYSQL_REPLICA_PORT=3306 +TAP_MYSQL_REPLICA_PORT_ON_HOST=14406 +TAP_MYSQL_REPLICA_ROOT_PASSWORD=secret +TAP_MYSQL_REPLICA_USER=pipelinewise +TAP_MYSQL_REPLICA_PASSWORD=secret +TAP_MYSQL_REPLICA_DB=mysql_source_db_2 + # ------------------------------------------------------------------------------ # Test Mongodb database credentials used as test source database # ------------------------------------------------------------------------------ diff --git a/dev-project/docker-compose.yml b/dev-project/docker-compose.yml index 4f1347bc6..50c3d05f8 100644 --- a/dev-project/docker-compose.yml +++ b/dev-project/docker-compose.yml @@ -2,12 +2,13 @@ version: '3' services: ### Primary container with PipelineWise CLI pipelinewise: - image: python:3.7.4-buster + image: python:3.8-buster container_name: pipelinewise_dev working_dir: /opt/pipelinewise entrypoint: /opt/pipelinewise/dev-project/entrypoint.sh environment: PIPELINEWISE_HOME: /opt/pipelinewise/dev-project + TAP_MYSQL_HOST: db_mysql_source TAP_MYSQL_PORT: ${TAP_MYSQL_PORT} TAP_MYSQL_PORT_ON_HOST: ${TAP_MYSQL_PORT_ON_HOST} @@ -16,6 +17,14 @@ services: TAP_MYSQL_PASSWORD: ${TAP_MYSQL_PASSWORD} TAP_MYSQL_DB: ${TAP_MYSQL_DB} + TAP_MYSQL_REPLICA_HOST: db_mysql_source_replica + TAP_MYSQL_REPLICA_PORT: ${TAP_MYSQL_REPLICA_PORT} + TAP_MYSQL_REPLICA_PORT_ON_HOST: ${TAP_MYSQL_REPLICA_PORT_ON_HOST} + TAP_MYSQL_REPLICA_ROOT_PASSWORD: ${TAP_MYSQL_REPLICA_ROOT_PASSWORD} + TAP_MYSQL_REPLICA_USER: ${TAP_MYSQL_REPLICA_USER} + TAP_MYSQL_REPLICA_PASSWORD: ${TAP_MYSQL_REPLICA_PASSWORD} + TAP_MYSQL_REPLICA_DB: ${TAP_MYSQL_REPLICA_DB} + TAP_POSTGRES_HOST: db_postgres_source TAP_POSTGRES_PORT: ${TAP_POSTGRES_PORT} TAP_POSTGRES_PORT_ON_HOST: ${TAP_POSTGRES_PORT_ON_HOST} @@ -38,6 +47,9 @@ services: TAP_MONGODB_DB: ${TAP_MONGODB_DB} TAP_MONGODB_ROOT_USER: ${TAP_MONGODB_ROOT_USER} TAP_MONGODB_ROOT_PASSWORD: ${TAP_MONGODB_ROOT_PASSWORD} + + PATH: /opt/pipelinewise/dev-project/.virtualenvs/pipelinewise/bin:${PATH} + volumes: - ../:/opt/pipelinewise # don't 
create container specific .virtualenvs files on host @@ -77,7 +89,7 @@ services: container_name: pipelinewise_dev_mysql_source ports: - ${TAP_MYSQL_PORT_ON_HOST}:${TAP_MYSQL_PORT} - command: --default-authentication-plugin=mysql_native_password --log-bin=mysql-bin --binlog-format=ROW + command: --default-authentication-plugin=mysql_native_password --server-id=1 --log-bin=mysql-bin --binlog-format=ROW --binlog-do-db=${TAP_MYSQL_DB} --binlog-do-db=${TAP_MYSQL_REPLICA_DB} environment: MYSQL_ROOT_PASSWORD: ${TAP_MYSQL_ROOT_PASSWORD} MYSQL_USER: ${TAP_MYSQL_USER} @@ -86,6 +98,20 @@ services: networks: - pipelinewise_network + db_mysql_source_replica: + image: mariadb:10.2.26 + container_name: pipelinewise_dev_mysql_source_replica + ports: + - ${TAP_MYSQL_REPLICA_PORT_ON_HOST}:${TAP_MYSQL_REPLICA_PORT} + command: --default-authentication-plugin=mysql_native_password --server-id=2 --log-bin=mysql-bin --relay-log=mysql-relay-log --replicate-do-db=${TAP_MYSQL_REPLICA_DB} + environment: + MYSQL_ROOT_PASSWORD: ${TAP_MYSQL_REPLICA_ROOT_PASSWORD} + MYSQL_USER: ${TAP_MYSQL_REPLICA_USER} + MYSQL_PASSWORD: ${TAP_MYSQL_REPLICA_PASSWORD} + MYSQL_DATABASE: ${TAP_MYSQL_REPLICA_DB} + networks: + - pipelinewise_network + db_mongo_source: container_name: "pipelinewise_dev_mongodb_source" image: "mongo:4.2-bionic" diff --git a/dev-project/entrypoint.sh b/dev-project/entrypoint.sh index a90b2a450..25ca6e432 100755 --- a/dev-project/entrypoint.sh +++ b/dev-project/entrypoint.sh @@ -1,14 +1,24 @@ #!/usr/bin/env bash +set -e + +# Add Mongodb ppa +wget -qO - https://www.mongodb.org/static/pgp/server-4.4.asc | apt-key add - +echo "deb [ arch=amd64 ] https://repo.mongodb.org/apt/ubuntu bionic/mongodb-org/4.4 multiverse" | tee /etc/apt/sources.list.d/mongodb.list + # Install OS dependencies apt-get update -apt-get install -y mariadb-client postgresql-client alien libaio1 mongo-tools mbuffer gettext-base - -wget https://repo.mongodb.org/apt/ubuntu/dists/bionic/mongodb-org/4.2/multiverse/binary-amd64/mongodb-org-shell_4.2.7_amd64.deb -dpkg -i ./mongodb-org-shell_4.2.7_amd64.deb && rm mongodb-org-shell_4.2.7_amd64.deb +apt-get install -y --no-install-recommends \ + alien \ + gettext-base \ + libaio1 \ + mariadb-client \ + mbuffer \ + mongodb-database-tools \ + mongodb-org-shell \ + postgresql-client -# Change to dev-project folder -cd dev-project +rm -rf /var/lib/apt/lists/* \ # Install Oracle Instant Client required for tap-oracle # ORA_INSTACLIENT_URL=https://download.oracle.com/otn_software/linux/instantclient/193000/oracle-instantclient19.3-basiclite-19.3.0.0.0-1.x86_64.rpm @@ -17,17 +27,17 @@ cd dev-project # alien -i oracle-instantclient.rpm --scripts # rm -f oracle-instantclient.rpm -# Build test databasese -../tests/db/tap_mysql_db.sh -../tests/db/tap_postgres_db.sh +# Build test databases -./mongo/init_rs.sh -../tests/db/tap_mongodb.sh +tests/db/tap_mysql_db.sh +tests/db/tap_postgres_db.sh -../tests/db/target_postgres.sh +dev-project/mongo/init_rs.sh +tests/db/tap_mongodb.sh +tests/db/target_postgres.sh -# Install PipelineWise in the container -../install.sh --acceptlicenses --nousage --connectors=target-snowflake,target-postgres,target-bigquery,tap-mysql,tap-postgres,tap-mongodb,transform-field,tap-s3-csv +# Install PipelineWise and connectors in the container +make pipelinewise connectors -e pw_acceptlicenses=y -e pw_connector=target-snowflake,target-postgres,target-bigquery,tap-mysql,tap-postgres,tap-mongodb,transform-field,tap-s3-csv if [[ $? != 0 ]]; then echo echo "ERROR: Docker container not started. 
Failed to install one or more PipelineWise components." diff --git a/dev-project/pipelinewise-config/tap_mongodb_to_pg.yaml b/dev-project/pipelinewise-config/tap_mongodb_to_pg.yaml index 8aedf7761..8825b2d4a 100644 --- a/dev-project/pipelinewise-config/tap_mongodb_to_pg.yaml +++ b/dev-project/pipelinewise-config/tap_mongodb_to_pg.yaml @@ -42,3 +42,6 @@ schemas: - table_name: "my_collection" replication_method: "LOG_BASED" + + - table_name: "all_datatypes" + replication_method: "LOG_BASED" diff --git a/docs/concept/fastsync.rst b/docs/concept/fastsync.rst index 179e2bf24..d5acd5be5 100644 --- a/docs/concept/fastsync.rst +++ b/docs/concept/fastsync.rst @@ -45,4 +45,11 @@ Fast Sync exists only between the following tap and target components: +----------------------------+----------------------------------+ | :ref:`tap-mongodb` | **->** :ref:`target-postgres` | +----------------------------+----------------------------------+ - +| :ref:`tap-mysql` | **->** :ref:`target-bigquery` | ++----------------------------+----------------------------------+ +| :ref:`tap-postgres` | **->** :ref:`target-bigquery` | ++----------------------------+----------------------------------+ +| :ref:`tap-s3-csv` | **->** :ref:`target-bigquery` | ++----------------------------+----------------------------------+ +| :ref:`tap-mongodb` | **->** :ref:`target-bigquery` | ++----------------------------+----------------------------------+ diff --git a/docs/conf.py b/docs/conf.py index 3cb714e3e..5aba8a4e1 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -14,6 +14,7 @@ # import sys # sys.path.insert(0, os.path.abspath('.')) import datetime +import importlib.metadata def setup(app): @@ -25,7 +26,7 @@ def setup(app): project = 'PipelineWise' copyright = f'{datetime.datetime.now().year}, Wise Ltd.' author = 'Wise' -version = '0.34.0' +version = importlib.metadata.version('pipelinewise') # -- General configuration --------------------------------------------------- diff --git a/docs/connectors/taps/github.rst b/docs/connectors/taps/github.rst index c4fa7b05e..d2b9162c7 100644 --- a/docs/connectors/taps/github.rst +++ b/docs/connectors/taps/github.rst @@ -33,6 +33,7 @@ Example YAML for ``tap-github``: name: "Github" # Name of the tap type: "tap-github" # !! THIS SHOULD NOT CHANGE !! owner: "somebody@foo.com" # Data owner to contact + sync_period: "*/90 * * * *" # Period in which the tap will run #send_alert: False # Optional: Disable all configured alerts on this tap @@ -41,8 +42,29 @@ Example YAML for ``tap-github``: # ------------------------------------------------------------------------------ db_conn: access_token: "" # Github access token with at least the repo scope - repository: "transferwise/pipelinewise" # Path to one or multiple repositories that you want to extract data from + organization: "gnome" # The organization you want to extract the data from + # Required when repos_include/repository isn't present + # OR + # Required when repos_exclude contains wildcard matchers + # OR + # Required when repos_include/repository contains wildcard matchers + repos_include: "gnome* polari" # Allow list strategy to extract selected repos data from organization. # Each repo path should be space delimited. + # Supports wildcard matching + # Values also valid: singer-io/tap-github another-org/tap-octopus + # Org prefix not allowed when organization is present + repos_exclude: "*tests* api-docs" # Deny list to extract all repos from organization except the ones listed. + # Each repo path should be space delimited. 
+ # Supports wildcard matching + # Requires organization + # Org prefix not allowed in repos_exclude + repository: "gnome/gnome-software" # (DEPRECATED) Path to one or multiple repositories that you want to extract data from organization (has priority over repos_exclude)) + # Each repo path should be space delimited. + # Org prefix not allowed when organization is present + include_archived: false # Optional: true/false to include archived repos. Default false + include_disabled: false # Optional: true/false to include disabled repos. Default false + max_rate_limit_wait_seconds: 600 # Optional: Max time to wait if you hit the github api limit. Default to 600s + # ------------------------------------------------------------------------------ # Destination (Target) - Target properties diff --git a/docs/connectors/taps/kafka.rst b/docs/connectors/taps/kafka.rst index f468f63e1..b1c31e2a2 100644 --- a/docs/connectors/taps/kafka.rst +++ b/docs/connectors/taps/kafka.rst @@ -12,12 +12,7 @@ Messages from kafka topics are extracted into the following fields: * ``MESSAGE``: The original and full kafka message * `Dynamic primary key columns`: (Optional) Fields extracted from the Kafka JSON messages by JSONPath selector(s). -Consuming Kafka messages -'''''''''''''''''''''''' - -Tap Kafka saves consumed messages into a local disk storage and sends commit messages to Kafka after every -consumed message. A batching mechanism keeps maintaining of deleting and flushing messages from the local storage -and sends singer compatible messages in small batches to standard output. +Supported message formats: JSON and Protobuf (experimental). Configuring what to replicate @@ -60,19 +55,34 @@ Example YAML for ``tap-kafka``: primary_keys: transfer_id: "/transferMetadata/transferId" + #initial_start_time: # (Default: latest) Start time reference of the message consumption if + # no bookmarked position in state.json. One of: latest, earliest or an + # ISO-8601 formatted timestamp string. + # -------------------------------------------------------------------------- - # Kafka Consumer optional parameters + # Kafka Consumer optional parameters. Commented values are default values. # -------------------------------------------------------------------------- #max_runtime_ms: 300000 # The maximum time for the tap to collect new messages from Kafka topic. #consumer_timeout_ms: 10000 # KafkaConsumer setting. Number of milliseconds to block during message iteration before raising StopIteration #session_timeout_ms: 30000 # KafkaConsumer setting. The timeout used to detect failures when using Kafka’s group management facilities. #heartbeat_interval_ms: 10000 # KafkaConsumer setting. The expected time in milliseconds between heartbeats to the consumer coordinator when using Kafka’s group management facilities. #max_poll_interval_ms: 300000 # KafkaConsumer setting. The maximum delay between invocations of poll() when using consumer group management. - #max_poll_records: 500 # KafkaConsumer setting. The maximum number of records returned in a single call to poll(). #commit_interval_ms: 5000 # Number of milliseconds between two commits. This is different than the kafka auto commit feature. Tap-kafka sends commit messages automatically but only when the data consumed successfully and persisted to local store. - #local_store_dir: ./tap-kafka-local-store # Path to the local store with consumed kafka messages - #local_store_batch_size_rows: 1000 # Number of messages to write to disk in one go. 
This can avoid high I/O issues when messages written to local store disk too frequently. + + # -------------------------------------------------------------------------- + # Protobuf support - Experimental + # -------------------------------------------------------------------------- + #message_format: protobuf # (Default: json) Supported message formats are json and protobuf. + #proto_schema: | # Protobuf message format in .proto syntax. Required if the message_format is protobuf. + # syntax = "proto3"; + # + # message ProtoMessage { + # string query = 1; + # int32 page_number = 2; + # int32 result_per_page = 3; + # } + #proto_classess_dir: # (Default: current working dir) Directory where to store runtime compiled proto classes # ------------------------------------------------------------------------------ diff --git a/docs/connectors/taps/mongodb.rst b/docs/connectors/taps/mongodb.rst index 65859304f..4861b9505 100644 --- a/docs/connectors/taps/mongodb.rst +++ b/docs/connectors/taps/mongodb.rst @@ -97,6 +97,7 @@ Example YAML for ``tap-mongodb``: db_conn: host: "mongodb_host1,mongodb_host2,mongodb_host3" # Mongodb host(s) port: 27017 # Mongodb port + srv: "false" # For MongoDB Atlas `srv` should be "true" and `port` will be ignored user: "PipelineWiseUser" # Mongodb user password: "mY_VerY_StRonG_PaSSwoRd" # Mongodb plain string or vault encrypted auth_database: "admin" # Mongodb database to authenticate on @@ -143,3 +144,17 @@ Example YAML for ``tap-mongodb``: # default replication method is LOG_BASED - table_name: "my_other_collection" + + +Example connection to MongoDB Atlas +""""""""""""""""""""""""""""""""""" + +.. code-block:: bash + + db_conn: + srv: "true" + host: "xxxxxxxxx.xxxxx.mongodb.net" + auth_database: "admin" # the Mongodb database name to authenticate on + dbname: "db-name" # Mongodb database name to sync from + user: "user-name" # User with read roles + password: "password" # Plain string or vault encrypted diff --git a/docs/installation_guide/installation.rst b/docs/installation_guide/installation.rst index 5646b9f64..e5bdd43f1 100644 --- a/docs/installation_guide/installation.rst +++ b/docs/installation_guide/installation.rst @@ -98,14 +98,14 @@ PipelineWise CLI and every supported singer connectors into separated virtual en $ git clone https://github.com/transferwise/pipelinewise.git $ cd ./pipelinewise - $ ./install.sh --connectors=all + $ make pipelinewise all_connectors Press ``Y`` to accept the license agreement of the required singer components. To automate the installation and accept every license agreement run ``./install --acceptlicenses``. .. code-block:: bash - $ ./install.sh --connectors=all + $ make pipelinewise all_connectors (...installation usually takes 5-10 minutes...) @@ -131,7 +131,7 @@ running: .. code-block:: bash - $ ./install.sh --connectors=tap-mysql,tap-postgres,target-snowflake + $ make pipelinewise connectors -e pw_connector=tap-mysql,tap-postgres,target-snowflake .. warning:: diff --git a/docs/project/contribution.rst b/docs/project/contribution.rst index 2340f5d2d..f8f7f1fc8 100644 --- a/docs/project/contribution.rst +++ b/docs/project/contribution.rst @@ -13,7 +13,7 @@ To add new item to the supported :ref:`taps` or :ref:`targets` please follow the Adding new tap '''''''''''''' -1. Make sure your tap is following the Singer specification by following the following criterias: +1. 
Make sure your tap is following the Singer specification by meeting the following criteria:
* Configuration JSON is mandatory and defined with the ``--config`` CLI argument
@@ -54,38 +54,42 @@ Adding new tap
| **tap_config_extras** | Anything else that's required in the tap ``config.json`` to run. This can be static or dynamically generated values at runtime. |
+----------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------+
-3. Add your tap id to the allowed values in `tap.json `_
+3. Add your tap type to the allowed values in `tap.json `_
+4. Add your tap type to `ConnectorType Enum `_ (see the sketch below)
-4. Add your tap to the `singer-connectors `_ directory.
+5. Add your tap to the `singer-connectors `_ directory.
The new directory should have only one ``requirements.txt`` file with a reference and version of the tap in PyPI.
Some taps are forks of the community versions and customised for PipelineWise. If a new fork or project is required in PyPI, please mention this in the PR. The Wise team will create the PyPI package.
-5. Document your tap in the :ref:`taps_list` section with some YAML examples.
+6. Document your tap in the :ref:`taps_list` section with some YAML examples.
The editable documentation is at `GitHub Taps in RST format `_.
-6. Send a Pull Request to the `PipelineWise Github Repository `_.
+7. Send a Pull Request to the `PipelineWise Github Repository `_.
Adding new target
'''''''''''''''''
-1. Make sure your target is following the Singer specification and meets the following criterias:
+1. Make sure your target is following the Singer specification and meets the following criteria:
* Configuration JSON is mandatory and defined with the ``--config`` CLI argument
* State messages printed to standard output. PipelineWise compatible target connectors should send `Singer State Messages `_ in the original format to ``STDOUT``. Other components of PipelineWise will process it at a later stage making it
- compatible across every target connetor.
+ compatible across every target connector.
2. Add your target to the `singer-connectors `_ directory.
The new directory should have only one ``requirements.txt`` file with a reference and version of the target in PyPI.
Some targets are forks of the community versions and customised for PipelineWise. If a new fork or project is required in PyPI, please mention this in the PR. The Wise team will create the PyPI package.
-3. Document your target in the :ref:`targets_list` section with some YAML examples.
- The editable documentation is at `GitHub Targets in RST format `_.
+3. Add your target type to the allowed values in `target.json `_
-4. Send a Pull Request to the `PipelineWise Github Repository `_.
+4. Add your target type to `ConnectorType Enum `_
+5. Document your target in the :ref:`targets_list` section with some YAML examples.
+ The editable documentation is at `GitHub Targets in RST format `_.
+
+6. Send a Pull Request to the `PipelineWise Github Repository `_.
diff --git a/docs/user_guide/cli.rst b/docs/user_guide/cli.rst
index f53596f13..a75658155 100644
--- a/docs/user_guide/cli.rst
+++ b/docs/user_guide/cli.rst
@@ -130,3 +130,17 @@ validate
Validates a project directory with YAML tap and target files.
:--dir: relative path to the project directory with YAML taps and targets.
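The "Add your tap/target type to the ``ConnectorType`` Enum" steps in the contribution guide above boil down to a one-line addition in ``pipelinewise/cli/constants.py`` (the enum itself is introduced later in this diff). A minimal sketch, using a hypothetical ``tap-mynewsource`` connector; only the new member is yours to add, the existing ones stay untouched:

.. code-block:: python

    import enum


    class ConnectorType(enum.Enum):
        # existing members elided, see pipelinewise/cli/constants.py in this diff
        TAP_MYSQL = 'tap-mysql'
        TAP_MYNEWSOURCE = 'tap-mynewsource'  # hypothetical new tap type; the value matches the
                                             # connector directory name under singer-connectors/

The value is expected to be the same type string you add to the allowed values in ``tap.json`` (or ``target.json`` for a new target).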
+ + +Environment variables +--------------------- + +`PIPELINEWISE_HOME` +""""""""""""""""""" + +Configures the directory at which PipelineWise expects to find installed tap and targets. + +`PIPELINEWISE_CONFIG_DIRECTORY` +""""""""""""""""""""""""""""""" + +Overrides the default directory at which PipelineWise expects to find configuration files generated by `pipelinewise import`. diff --git a/docs/user_guide/transformations.rst b/docs/user_guide/transformations.rst index 65aed97cc..1e6e72b0b 100644 --- a/docs/user_guide/transformations.rst +++ b/docs/user_guide/transformations.rst @@ -40,6 +40,8 @@ The following transformations can be added optionally into the :ref:`yaml_config * **MASK-HIDDEN**: Transforms any string column value to 'hidden'. +* **MASK-STRING-SKIP-ENDS-n**: Transforms string columns to masked version skipping first and last n characters, e.g. MASK-STRING-SKIP-ENDS-3 + .. _transformation_validation: @@ -90,7 +92,7 @@ in the :ref:`yaml_configuration`: - column: "property_name" equals: 'passwordHash' - # Tip: Use 'regex_match' instead of 'equal' if you need + # Tip: Use 'regex_match' instead of 'equals' if you need # more complex matching criteria. For example: # regex_match: 'password|salt|passwordHash' @@ -101,6 +103,22 @@ in the :ref:`yaml_configuration`: equals: 'com.transferwise.fx.user.User' - column: "property_name" equals: 'passwordHash' + + - column: "column_3" + type: "HASH" + when: + - column: "json_column" + field_path: 'metadata/property_name' + equals: 'passwordHash' + + - table_name: "users" + replication_method: "LOG_BASED" + transformations: + - column: "json_column" + field_paths: + - "user/info/phone" + - "user/info/addresses/0" + type: "SET-NULL" ... ... diff --git a/install.sh b/install.sh deleted file mode 100755 index 1bb5ca109..000000000 --- a/install.sh +++ /dev/null @@ -1,240 +0,0 @@ -#!/bin/bash - -# Exit script on first error -set -e - -# Capture start_time -start_time=`date +%s` - -# Source directory defined as location of install.sh -SRC_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" - -# Install pipelinewise venvs in the present working directory -PIPELINEWISE_HOME=$(pwd) -VENV_DIR=${PIPELINEWISE_HOME}/.virtualenvs - -check_license() { - python3 -m pip install pip-licenses - - echo - echo "Checking license..." - PKG_NAME=`pip-licenses | grep "$1[[:space:]]" | awk '{print $1}'` - PKG_VERSION=`pip-licenses | grep "$1[[:space:]]" | awk '{print $2}'` - PKG_LICENSE=`pip-licenses --from mixed | grep "$1[[:space:]]" | awk '{for (i=1; i<=NF-2; i++) $i = $(i+2); NF-=2; print}'` - - # Any License Agreement that is not Apache Software License (2.0) has to be accepted - MAIN_LICENSE="Apache Software License" - if [[ $PKG_LICENSE != $MAIN_LICENSE && $PKG_LICENSE != 'UNKNOWN' ]]; then - echo - echo " | $PKG_NAME ($PKG_VERSION) is licensed under $PKG_LICENSE" - echo " |" - echo " | WARNING. The license of this connector is different than the default PipelineWise license ($MAIN_LICENSE)." - - if [[ $ACCEPT_LICENSES != "YES" ]]; then - echo " | You need to accept the connector's license agreement to proceed." - echo " |" - read -r -p " | Do you accept the [$PKG_LICENSE] license agreement of $PKG_NAME connector? [y/N] " response - case "$response" in - [yY][eE][sS]|[yY]) - ;; - *) - echo - echo "EXIT. License agreement not accepted" - exit 1 - ;; - esac - else - echo " | You automatically accepted this license agreement by running this script with --acceptlicenses option." 
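The two environment variables documented in the ``cli.rst`` hunk above are resolved by the CLI at start-up. A minimal sketch of that lookup, mirroring the ``CONFIG_DIR`` / ``PIPELINEWISE_HOME`` logic added to ``pipelinewise/cli/__init__.py`` further down in this diff (the default paths shown are the stock ones):

.. code-block:: python

    import os

    USER_HOME = os.path.expanduser('~')

    # PIPELINEWISE_CONFIG_DIRECTORY overrides where the output of `pipelinewise import` is read from
    CONFIG_DIR = os.environ.get(
        'PIPELINEWISE_CONFIG_DIRECTORY', os.path.join(USER_HOME, '.pipelinewise')
    )

    # PIPELINEWISE_HOME points at the directory holding the installed connector virtualenvs
    PIPELINEWISE_HOME = os.path.abspath(
        os.environ.setdefault('PIPELINEWISE_HOME', os.path.join(USER_HOME, 'pipelinewise'))
    )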
- fi - - fi -} - -clean_virtualenvs() { - echo "Cleaning previous installations in $VENV_DIR" - rm -rf $VENV_DIR -} - -make_virtualenv() { - echo "Making Virtual Environment for [$1] in $VENV_DIR" - python3 -m venv $VENV_DIR/$1 - source $VENV_DIR/$1/bin/activate - python3 -m pip install --upgrade pip setuptools wheel - - if [ -f "requirements.txt" ]; then - python3 -m pip install --upgrade -r requirements.txt - fi - if [ -f "setup.py" ]; then - PIP_ARGS= - if [[ ! $NO_TEST_EXTRAS == "YES" ]]; then - PIP_ARGS=$PIP_ARGS"[test]" - fi - - python3 -m pip install --upgrade -e .$PIP_ARGS - fi - - echo "" - - check_license $1 - deactivate -} - -install_connector() { - echo - echo "--------------------------------------------------------------------------" - echo "Installing $1 connector..." - echo "--------------------------------------------------------------------------" - - CONNECTOR_DIR=$SRC_DIR/singer-connectors/$1 - if [[ ! -d $CONNECTOR_DIR ]]; then - echo "ERROR: Directory not exists and does not look like a valid singer connector: $CONNECTOR_DIR" - exit 1 - fi - - cd $CONNECTOR_DIR - make_virtualenv $1 -} - -print_installed_connectors() { - cd $SRC_DIR - - echo - echo "--------------------------------------------------------------------------" - echo "Installed components:" - echo "--------------------------------------------------------------------------" - echo - echo "Component Version" - echo "-------------------- -------" - - for i in `ls $VENV_DIR`; do - source $VENV_DIR/$i/bin/activate - VERSION=`python3 -m pip list | grep "$i[[:space:]]" | awk '{print $2}'` - printf "%-20s %s\n" $i "$VERSION" - done - - if [[ $CONNECTORS != "all" ]]; then - echo - echo "WARNING: Not every singer connector installed. If you are missing something use the --connectors=...,... argument" - echo " with an explicit list of required connectors or use the --connectors=all to install every available" - echo " connector" - fi -} - -# Parse command line arguments -for arg in "$@"; do - case $arg in - # Auto accept license agreemnets. Useful if PipelineWise installed by an automated script - --acceptlicenses) - ACCEPT_LICENSES="YES" - ;; - # Do not print usage information at the end of the install - --nousage) - NO_USAGE="YES" - ;; - # Install with test requirements that allows running tests - --notestextras) - NO_TEST_EXTRAS="YES" - ;; - # Install extra connectors - --connectors=*) - CONNECTORS="${arg#*=}" - shift - ;; - # Clean previous installation - --clean) - clean_virtualenvs - exit 0 - ;; - *) - echo "Invalid argument: $arg" - exit 1 - ;; - esac -done - -# Welcome message -if ! ENVSUBST_LOC="$(type -p "envsubst")" || [[ -z ENVSUBST_LOC ]]; then - echo "envsubst not found but it's required to run this script. 
Try to install gettext or gettext-base package" - exit 1 -fi - -CURRENT_YEAR=$(date +"%Y") envsubst < $SRC_DIR/motd - -# Install PipelineWise core components -cd $SRC_DIR -make_virtualenv pipelinewise - -# Set default and extra singer connectors -DEFAULT_CONNECTORS=( - tap-jira - tap-kafka - tap-mysql - tap-postgres - tap-s3-csv - tap-salesforce - tap-snowflake - tap-zendesk - tap-mongodb - tap-github - tap-slack - tap-mixpanel - tap-twilio - target-s3-csv - target-snowflake - target-redshift - target-postgres - target-bigquery - transform-field -) -EXTRA_CONNECTORS=( - tap-adwords - tap-oracle - tap-zuora - tap-google-analytics - tap-shopify -) - -# Install only the default connectors if --connectors argument not passed -if [[ -z $CONNECTORS ]]; then - for i in ${DEFAULT_CONNECTORS[@]}; do - install_connector $i - done - - -# Install every available connectors if --connectors=all passed -elif [[ $CONNECTORS == "all" ]]; then - for i in ${DEFAULT_CONNECTORS[@]}; do - install_connector $i - done - for i in ${EXTRA_CONNECTORS[@]}; do - install_connector $i - done - -# Install the selected connectors if --connectors argument passed -elif [[ ! -z $CONNECTORS ]]; then - OLDIFS=$IFS - IFS=, - for connector in $CONNECTORS; do - install_connector $connector - done - IFS=$OLDIFS -fi - -# Capture end_time -end_time=`date +%s` -echo -echo "--------------------------------------------------------------------------" -echo "PipelineWise installed successfully in $((end_time-start_time)) seconds" -echo "--------------------------------------------------------------------------" - -print_installed_connectors -if [[ $NO_USAGE != "YES" ]]; then - echo - echo "To start CLI:" - echo " $ source $VENV_DIR/pipelinewise/bin/activate" - echo " $ export PIPELINEWISE_HOME=$PIPELINEWISE_HOME" - - echo " $ pipelinewise status" - echo - echo "--------------------------------------------------------------------------" -fi diff --git a/pipelinewise/cli/__init__.py b/pipelinewise/cli/__init__.py index fd2af924d..9f42b2d42 100644 --- a/pipelinewise/cli/__init__.py +++ b/pipelinewise/cli/__init__.py @@ -19,10 +19,13 @@ __version__ = get_distribution('pipelinewise').version USER_HOME = os.path.expanduser('~') -CONFIG_DIR = os.path.join(USER_HOME, '.pipelinewise') +DEFAULT_CONFIG_DIR = os.path.join(USER_HOME, '.pipelinewise') +CONFIG_DIR = os.environ.get('PIPELINEWISE_CONFIG_DIRECTORY', DEFAULT_CONFIG_DIR) PROFILING_DIR = os.path.join(CONFIG_DIR, 'profiling') PIPELINEWISE_DEFAULT_HOME = os.path.join(USER_HOME, 'pipelinewise') -PIPELINEWISE_HOME = os.path.abspath(os.environ.setdefault('PIPELINEWISE_HOME', PIPELINEWISE_DEFAULT_HOME)) +PIPELINEWISE_HOME = os.path.abspath( + os.environ.setdefault('PIPELINEWISE_HOME', PIPELINEWISE_DEFAULT_HOME) +) VENV_DIR = os.path.join(PIPELINEWISE_HOME, '.virtualenvs') COMMANDS = [ 'init', @@ -61,7 +64,9 @@ def __init_logger(log_file=None, debug=False): return logger -def __init_profiler(profiler_arg: bool, logger: logging.Logger) -> Tuple[Optional[Profile], Optional[str]]: +def __init_profiler( + profiler_arg: bool, logger: logging.Logger +) -> Tuple[Optional[Profile], Optional[str]]: """ Initialise profiling environment by creating a cprofile.Profiler instance, a folder where pstats can be dumped Args: @@ -82,9 +87,10 @@ def __init_profiler(profiler_arg: bool, logger: logging.Logger) -> Tuple[Optiona logger.debug('Profiler created.') - profiling_dir = os.path.join(PROFILING_DIR, - f'{datetime.utcnow().strftime("%Y%m%d_%H%M%S_%f")}_{generate_random_string(10)}' - ) + profiling_dir = 
os.path.join( + PROFILING_DIR, + f'{datetime.utcnow().strftime("%Y%m%d_%H%M%S_%f")}_{generate_random_string(10)}', + ) try: os.makedirs(profiling_dir) @@ -103,10 +109,12 @@ def __init_profiler(profiler_arg: bool, logger: logging.Logger) -> Tuple[Optiona return None, None -def __disable_profiler(profiler: Optional[Profile], - profiling_dir: Optional[str], - pstat_filename: Optional[str], - logger: logging.Logger): +def __disable_profiler( + profiler: Optional[Profile], + profiling_dir: Optional[str], + pstat_filename: Optional[str], + logger: logging.Logger, +): """ Disable given profiler and dump pipelinewise stats into a pStat file Args: @@ -145,32 +153,42 @@ def main(): parser.add_argument('--target', type=str, default='*', help='"Name of the target') parser.add_argument('--tap', type=str, default='*', help='Name of the tap') parser.add_argument('--tables', type=str, help='List of tables to sync') - parser.add_argument('--dir', type=str, default='*', help='Path to directory with config') + parser.add_argument( + '--dir', type=str, default='*', help='Path to directory with config' + ) parser.add_argument('--name', type=str, default='*', help='Name of the project') parser.add_argument('--secret', type=str, help='Path to vault password file') parser.add_argument('--string', type=str) - parser.add_argument('--version', - action='version', - help='Displays the installed versions', - version='PipelineWise {} - Command Line Interface'.format(__version__)) + parser.add_argument( + '--version', + action='version', + help='Displays the installed versions', + version='PipelineWise {} - Command Line Interface'.format(__version__), + ) parser.add_argument('--log', type=str, default='*', help='File to log into') - parser.add_argument('--extra_log', - default=False, - required=False, - help='Copy singer and fastsync logging into PipelineWise logger', - action='store_true') - parser.add_argument('--debug', - default=False, - required=False, - help='Forces the debug mode with logging on stdout and log level debug', - action='store_true') - parser.add_argument('--profiler', '-p', - default=False, - required=False, - help='Enables code profiling mode using Python builtin profiler cProfile. ' - 'The stats will be dumped into a folder in .pipelinewise/profiling', - action='store_true' - ) + parser.add_argument( + '--extra_log', + default=False, + required=False, + help='Copy singer and fastsync logging into PipelineWise logger', + action='store_true', + ) + parser.add_argument( + '--debug', + default=False, + required=False, + help='Forces the debug mode with logging on stdout and log level debug', + action='store_true', + ) + parser.add_argument( + '--profiler', + '-p', + default=False, + required=False, + help='Enables code profiling mode using Python builtin profiler cProfile. 
' + 'The stats will be dumped into a folder in .pipelinewise/profiling', + action='store_true', + ) args = parser.parse_args() @@ -201,7 +219,9 @@ def main(): # import_config : this is for backward compatibility; use 'import' instead from CLI if args.command == 'import' or args.command == 'import_config': if args.dir == '*': - print('You must specify a directory path with config YAML files using the argument --dir') + print( + 'You must specify a directory path with config YAML files using the argument --dir' + ) sys.exit(1) # Every command argument is mapped to a python function with the same name, but 'import' is a @@ -209,12 +229,16 @@ def main(): args.command = 'import_project' if args.command == 'validate' and args.dir == '*': - print('You must specify a directory path with config YAML files using the argument --dir') + print( + 'You must specify a directory path with config YAML files using the argument --dir' + ) sys.exit(1) if args.command == 'encrypt_string': if not args.secret: - print('You must specify a path to a file with vault secret using the argument --secret') + print( + 'You must specify a path to a file with vault secret using the argument --secret' + ) sys.exit(1) if not args.string: print('You must specify a string to encrypt using the argument --string') @@ -229,7 +253,9 @@ def main(): try: getattr(ppw_instance, args.command)() finally: - __disable_profiler(profiler, profiling_dir, f'pipelinewise_{args.command}', logger) + __disable_profiler( + profiler, profiling_dir, f'pipelinewise_{args.command}', logger + ) if __name__ == '__main__': diff --git a/pipelinewise/cli/alert_handlers/base_alert_handler.py b/pipelinewise/cli/alert_handlers/base_alert_handler.py index 91bdf6033..2e54d7a83 100644 --- a/pipelinewise/cli/alert_handlers/base_alert_handler.py +++ b/pipelinewise/cli/alert_handlers/base_alert_handler.py @@ -9,6 +9,7 @@ class BaseAlertHandler(ABC): """ Abstract base class for alert handlers """ + LOG = 'log' INFO = 'info' WARNING = 'warning' diff --git a/pipelinewise/cli/alert_handlers/errors.py b/pipelinewise/cli/alert_handlers/errors.py index fdc1841d9..15c2d7a11 100644 --- a/pipelinewise/cli/alert_handlers/errors.py +++ b/pipelinewise/cli/alert_handlers/errors.py @@ -7,6 +7,7 @@ class NotImplementedAlertHandlerException(Exception): """ Exception to raise when attempted to use a not implemented alert handler class """ + def __init__(self, *args, **kwargs): super().__init__(self, *args, **kwargs) @@ -15,6 +16,7 @@ class NotConfiguredAlertHandlerException(Exception): """ Exception to raise when attempted to use a not configured alert handler """ + def __init__(self, *args, **kwargs): super().__init__(self, *args, **kwargs) @@ -23,5 +25,6 @@ class InvalidAlertHandlerException(Exception): """ Exception to raise when alert handler not configured correctly """ + def __init__(self, *args, **kwargs): super().__init__(self, *args, **kwargs) diff --git a/pipelinewise/cli/alert_handlers/slack_alert_handler.py b/pipelinewise/cli/alert_handlers/slack_alert_handler.py index f15be1eec..81c604a26 100644 --- a/pipelinewise/cli/alert_handlers/slack_alert_handler.py +++ b/pipelinewise/cli/alert_handlers/slack_alert_handler.py @@ -11,7 +11,7 @@ BaseAlertHandler.LOG: '36C5F0', BaseAlertHandler.INFO: 'good', BaseAlertHandler.WARNING: 'warning', - BaseAlertHandler.ERROR: 'danger' + BaseAlertHandler.ERROR: 'danger', } @@ -20,6 +20,7 @@ class SlackAlertHandler(BaseAlertHandler): """ Slack Alert Handler class """ + def __init__(self, config: dict) -> None: if config is not None: if 
'token' not in config: @@ -27,7 +28,9 @@ def __init__(self, config: dict) -> None: self.token = config['token'] if 'channel' not in config: - raise InvalidAlertHandlerException('Missing channel in Slack connection') + raise InvalidAlertHandlerException( + 'Missing channel in Slack connection' + ) self.channel = config['channel'] else: @@ -35,7 +38,9 @@ def __init__(self, config: dict) -> None: self.client = WebClient(self.token) - def send(self, message: str, level: str = BaseAlertHandler.ERROR, exc: Exception = None) -> None: + def send( + self, message: str, level: str = BaseAlertHandler.ERROR, exc: Exception = None + ) -> None: """ Send alert @@ -47,9 +52,15 @@ def send(self, message: str, level: str = BaseAlertHandler.ERROR, exc: Exception Returns: Initialised alert handler object """ - self.client.chat_postMessage(channel=self.channel, - text=f'```{exc}```' if exc else None, - attachments=[{ - 'color': ALERT_LEVEL_SLACK_COLORS.get(level, BaseAlertHandler.ERROR), - 'title': message - }]) + self.client.chat_postMessage( + channel=self.channel, + text=f'```{exc}```' if exc else None, + attachments=[ + { + 'color': ALERT_LEVEL_SLACK_COLORS.get( + level, BaseAlertHandler.ERROR + ), + 'title': message, + } + ], + ) diff --git a/pipelinewise/cli/alert_handlers/victorops_alert_handler.py b/pipelinewise/cli/alert_handlers/victorops_alert_handler.py index 0713bff60..92ba39fee 100644 --- a/pipelinewise/cli/alert_handlers/victorops_alert_handler.py +++ b/pipelinewise/cli/alert_handlers/victorops_alert_handler.py @@ -12,7 +12,7 @@ BaseAlertHandler.LOG: 'INFO', BaseAlertHandler.INFO: 'INFO', BaseAlertHandler.WARNING: 'WARNING', - BaseAlertHandler.ERROR: 'CRITICAL' + BaseAlertHandler.ERROR: 'CRITICAL', } @@ -21,20 +21,27 @@ class VictoropsAlertHandler(BaseAlertHandler): """ VictorOps Alert Handler class """ + def __init__(self, config: dict) -> None: if config is not None: if 'base_url' not in config: - raise InvalidAlertHandlerException('Missing REST Endpoint URL in VictorOps connection') + raise InvalidAlertHandlerException( + 'Missing REST Endpoint URL in VictorOps connection' + ) self.base_url = config['base_url'] if 'routing_key' not in config: - raise InvalidAlertHandlerException('Missing routing key in VictorOps connection') + raise InvalidAlertHandlerException( + 'Missing routing key in VictorOps connection' + ) self.routing_key = config['routing_key'] else: raise InvalidAlertHandlerException('No valid VictorOps config supplied.') - def send(self, message: str, level: str = BaseAlertHandler.ERROR, exc: Exception = None) -> None: + def send( + self, message: str, level: str = BaseAlertHandler.ERROR, exc: Exception = None + ) -> None: """ Send alert @@ -49,13 +56,22 @@ def send(self, message: str, level: str = BaseAlertHandler.ERROR, exc: Exception # Send alert to VictorOps REST Endpoint as a HTTP post request response = requests.post( f'{self.base_url}/{self.routing_key}', - data=json.dumps({ - 'message_type': ALERT_LEVEL_MESSAGE_TYPES.get(level, BaseAlertHandler.ERROR), - 'entity_display_name': message, - 'state_message': exc}), - headers={'Content-Type': 'application/json'}) + data=json.dumps( + { + 'message_type': ALERT_LEVEL_MESSAGE_TYPES.get( + level, BaseAlertHandler.ERROR + ), + 'entity_display_name': message, + 'state_message': exc, + } + ), + headers={'Content-Type': 'application/json'}, + ) # Success victorops message should return 200 if response.status_code != 200: - raise ValueError('Request to victorops returned an error {}. 
{}'.format(response.status_code, - response.text)) + raise ValueError( + 'Request to victorops returned an error {}. {}'.format( + response.status_code, response.text + ) + ) diff --git a/pipelinewise/cli/alert_sender.py b/pipelinewise/cli/alert_sender.py index b51c4208d..847f3f232 100644 --- a/pipelinewise/cli/alert_sender.py +++ b/pipelinewise/cli/alert_sender.py @@ -23,7 +23,7 @@ # Every alert handler class needs to implement the BaseAlertHandler base class ALERT_HANDLER_TYPES_TO_CLASS = { 'slack': SlackAlertHandler, - 'victorops': VictoropsAlertHandler + 'victorops': VictoropsAlertHandler, } @@ -39,13 +39,15 @@ class AlertSender: def __init__(self, alert_handlers: Dict = None) -> None: # Initialise alert_handlers as empty dictionary if None provided if not alert_handlers: - self.alert_handlers = dict() + self.alert_handlers = {} else: self.alert_handlers = alert_handlers # Raise an exception if alert_handlers is not a dictionary if not isinstance(self.alert_handlers, dict): - raise InvalidAlertHandlerException('alert_handlers needs to be a dictionary') + raise InvalidAlertHandlerException( + 'alert_handlers needs to be a dictionary' + ) @staticmethod def __init_handler_class(alert_handler: AlertHandler) -> BaseAlertHandler: @@ -63,8 +65,9 @@ def __init_handler_class(alert_handler: AlertHandler) -> BaseAlertHandler: alert_handler_class = ALERT_HANDLER_TYPES_TO_CLASS[alert_handler.type] handler = alert_handler_class(alert_handler.config) except KeyError as key_error: - raise NotImplementedAlertHandlerException(f'Alert handler type not implemented: {alert_handler.type}') \ - from key_error + raise NotImplementedAlertHandlerException( + f'Alert handler type not implemented: {alert_handler.type}' + ) from key_error return handler @@ -80,16 +83,22 @@ def __get_alert_handler(self, alert_handler_type: str) -> AlertHandler: """ if alert_handler_type in self.alert_handlers: alert_handler_config = self.alert_handlers[alert_handler_type] - alert_handler = AlertHandler(type=alert_handler_type, config=alert_handler_config) + alert_handler = AlertHandler( + type=alert_handler_type, config=alert_handler_config + ) return alert_handler - raise NotConfiguredAlertHandlerException(f'Alert handler type not configured: {alert_handler_type}') - - def send_to_handler(self, - alert_handler_type: str, - message: str, - level: str = BaseAlertHandler.ERROR, - exc: Exception = None) -> bool: + raise NotConfiguredAlertHandlerException( + f'Alert handler type not configured: {alert_handler_type}' + ) + + def send_to_handler( + self, + alert_handler_type: str, + message: str, + level: str = BaseAlertHandler.ERROR, + exc: Exception = None, + ) -> bool: """ Sends an alert message to a specific alert handler type @@ -112,10 +121,9 @@ def send_to_handler(self, # Alert sent successfully return True - def send_to_all_handlers(self, - message: str, - level: str = BaseAlertHandler.ERROR, - exc: Exception = None) -> dict: + def send_to_all_handlers( + self, message: str, level: str = BaseAlertHandler.ERROR, exc: Exception = None + ) -> dict: """ Get all the configured alert handlers and send alert message to all of them @@ -128,5 +136,8 @@ def send_to_all_handlers(self, Returns: Dictionary with number of successfully sent alerts """ - sents = [self.send_to_handler(handler_type, message, level, exc) for handler_type in self.alert_handlers] + sents = [ + self.send_to_handler(handler_type, message, level, exc) + for handler_type in self.alert_handlers + ] return {'sent': len(sents)} diff --git a/pipelinewise/cli/commands.py 
b/pipelinewise/cli/commands.py index 8fa0bf162..ed90bf6bd 100644 --- a/pipelinewise/cli/commands.py +++ b/pipelinewise/cli/commands.py @@ -4,8 +4,11 @@ import os import shlex import logging +import json +import time + +from dataclasses import dataclass from subprocess import PIPE, STDOUT, Popen -from collections import namedtuple from . import utils from .errors import StreamBufferTooLargeException @@ -16,21 +19,116 @@ MIN_STREAM_BUFFER_SIZE = 10 MAX_STREAM_BUFFER_SIZE = 2500 +PARAMS_VALIDATION_RETRY_PERIOD_SEC = 2 +PARAMS_VALIDATION_RETRY_TIMES = 3 + STATUS_RUNNING = 'running' STATUS_FAILED = 'failed' STATUS_SUCCESS = 'success' -TapParams = namedtuple('TapParams', ['id', 'type', 'bin', 'python_bin', 'config', 'properties', 'state']) -TargetParams = namedtuple('TargetParams', ['id', 'type', 'bin', 'python_bin', 'config']) -TransformParams = namedtuple('TransformParams', ['bin', 'python_bin', 'config', 'tap_id', 'target_id']) + +def _verify_json_file(json_file_path: str, file_must_exists: bool, allowed_empty: bool) -> bool: + """Checking if input file is a valid json or not, in some cases it is allowed to have an empty file, + or it is allowed file not exists! + """ + try: + with open(json_file_path, 'r', encoding='utf-8') as json_file: + json.load(json_file) + except FileNotFoundError: + return not file_must_exists + except json.decoder.JSONDecodeError: + if not allowed_empty or os.stat(json_file_path).st_size != 0: + return False + return True +def do_json_conf_validation(json_file: str, file_property: dict) -> bool: + """ + Validating a json format config property and retry if it is invalid + """ + for _ in range(PARAMS_VALIDATION_RETRY_TIMES): + if _verify_json_file(json_file_path=json_file, + file_must_exists=file_property['file_must_exists'], + allowed_empty=file_property['allowed_empty']): + return True + + time.sleep(PARAMS_VALIDATION_RETRY_PERIOD_SEC) + return False + + +@dataclass +class TapParams: + """ + TapParams validates json properties. + """ + tap_id: str + type: str + bin: str + python_bin: str + config: str + properties: str + state: str + + def __post_init__(self): + if not self.config: + raise RunCommandException( + f'Invalid json file for config: {self.config}') + + list_of_params_in_json_file = { + 'config': {'file_must_exists': True, 'allowed_empty': False}, + 'properties': {'file_must_exists': True, 'allowed_empty': False}, + 'state': {'file_must_exists': False, 'allowed_empty': True} + } + + for param, file_property in list_of_params_in_json_file.items(): + valid_json = do_json_conf_validation( + json_file=getattr(self, param, None), + file_property=file_property + ) if getattr(self, param, None) else True + + if not valid_json: + raise RunCommandException( + f'Invalid json file for {param}: {getattr(self, param, None)}') + + +@dataclass +class TargetParams: + """ + TargetParams validates json properties. 
+ """ + target_id: str + type: str + bin: str + python_bin: str + config: str + + def __post_init__(self): + json_file = self.config + + valid_json = do_json_conf_validation( + json_file=json_file, + file_property={'file_must_exists': True, 'allowed_empty': False}) if json_file else False + + if not valid_json: + raise RunCommandException(f'Invalid json file for config: {self.config}') + + +@dataclass +class TransformParams: + """TransformParams.""" + bin: str + python_bin: str + config: str + tap_id: str + target_id: str + + +# pylint: disable=unnecessary-pass class RunCommandException(Exception): """ Custom exception to raise when run command fails """ - def __init__(self, *args, **kwargs): - Exception.__init__(self, *args, **kwargs) + pass def exists_and_executable(bin_path: str) -> bool: @@ -46,19 +144,20 @@ def exists_and_executable(bin_path: str) -> bool: boolean: True if file exists and executable, otherwise False """ + if not os.access(bin_path, os.X_OK): + try: paths = f"{os.environ['PATH']}".split(':') (p for p in paths if os.access(f'{p}/{bin_path}', os.X_OK)).__next__() except StopIteration: return False - return True -def build_tap_command(tap: TapParams, - profiling_mode: bool = False, - profiling_dir: str = None) -> str: +def build_tap_command( + tap: TapParams, profiling_mode: bool = False, profiling_dir: str = None +) -> str: """ Builds a command that starts a singer tap connector with the required command line arguments @@ -73,7 +172,9 @@ def build_tap_command(tap: TapParams, # Following the singer spec the catalog JSON file needs to be passed by the --catalog argument # However some tap (i.e. tap-mysql and tap-postgres) requires it as --properties # This is probably for historical reasons and need to clarify on Singer slack channels - catalog_argument = utils.get_tap_property_by_tap_type(tap.type, 'tap_catalog_argument') + catalog_argument = utils.get_tap_property_by_tap_type( + tap.type, 'tap_catalog_argument' + ) state_arg = '' if tap.state and os.path.isfile(tap.state): @@ -82,15 +183,15 @@ def build_tap_command(tap: TapParams, tap_command = f'{tap.bin} --config {tap.config} {catalog_argument} {tap.properties} {state_arg}' if profiling_mode: - dump_file = os.path.join(profiling_dir, f'tap_{tap.id}.pstat') + dump_file = os.path.join(profiling_dir, f'tap_{tap.tap_id}.pstat') tap_command = f'{tap.python_bin} -m cProfile -o {dump_file} {tap_command}' return tap_command -def build_target_command(target: TargetParams, - profiling_mode: bool = False, - profiling_dir: str = None) -> str: +def build_target_command( + target: TargetParams, profiling_mode: bool = False, profiling_dir: str = None +) -> str: """ Builds a command that starts a singer target connector with the required command line arguments @@ -106,15 +207,17 @@ def build_target_command(target: TargetParams, target_command = f'{target.bin} --config {target.config}' if profiling_mode: - dump_file = os.path.join(profiling_dir, f'target_{target.id}.pstat') - target_command = f'{target.python_bin} -m cProfile -o {dump_file} {target_command}' + dump_file = os.path.join(profiling_dir, f'target_{target.target_id}.pstat') + target_command = ( + f'{target.python_bin} -m cProfile -o {dump_file} {target_command}' + ) return target_command -def build_transformation_command(transform: TransformParams, - profiling_mode: bool = False, - profiling_dir: str = None) -> str: +def build_transformation_command( + transform: TransformParams, profiling_mode: bool = False, profiling_dir: str = None +) -> str: """ Builds a command that 
starts a singer transformation connector with the required command line arguments @@ -138,16 +241,21 @@ def build_transformation_command(transform: TransformParams, if profiling_mode: dump_file = os.path.join( profiling_dir, - f'transformation_{transform.tap_id}_{transform.target_id}.pstat') + f'transformation_{transform.tap_id}_{transform.target_id}.pstat', + ) - trans_command = f'{transform.python_bin} -m cProfile -o {dump_file} {trans_command}' + trans_command = ( + f'{transform.python_bin} -m cProfile -o {dump_file} {trans_command}' + ) return trans_command -def build_stream_buffer_command(buffer_size: int = 0, - log_file: str = None, - stream_buffer_bin: str = DEFAULT_STREAM_BUFFER_BIN) -> str: +def build_stream_buffer_command( + buffer_size: int = 0, + log_file: str = None, + stream_buffer_bin: str = DEFAULT_STREAM_BUFFER_BIN, +) -> str: """ Builds a command that buffers data between tap and target connectors to stream data asynchronously. Buffering streams @@ -188,11 +296,15 @@ def build_stream_buffer_command(buffer_size: int = 0, return buffer_command -def build_singer_command(tap: TapParams, target: TargetParams, transform: TransformParams, - stream_buffer_size: int = 0, - stream_buffer_log_file: str = None, - profiling_mode: bool = False, - profiling_dir: str = None) -> str: +def build_singer_command( + tap: TapParams, + target: TargetParams, + transform: TransformParams, + stream_buffer_size: int = 0, + stream_buffer_log_file: str = None, + profiling_mode: bool = False, + profiling_dir: str = None, +) -> str: """ Builds a command that starts a full singer command with tap, target and optional transformation connectors. The connectors are @@ -211,46 +323,49 @@ def build_singer_command(tap: TapParams, target: TargetParams, transform: Transf Returns: string of command line executable """ - tap_command = build_tap_command(tap, - profiling_mode, - profiling_dir) + tap_command = build_tap_command(tap, profiling_mode, profiling_dir) LOGGER.debug('Tap command: %s', tap_command) - target_command = build_target_command(target, - profiling_mode, - profiling_dir) + target_command = build_target_command(target, profiling_mode, profiling_dir) LOGGER.debug('Target command: %s', target_command) - transformation_command = build_transformation_command(transform, - profiling_mode, - profiling_dir) + transformation_command = build_transformation_command( + transform, profiling_mode, profiling_dir + ) LOGGER.debug('Transformation command: %s', transformation_command) - stream_buffer_command = build_stream_buffer_command(stream_buffer_size, - stream_buffer_log_file) + stream_buffer_command = build_stream_buffer_command( + stream_buffer_size, stream_buffer_log_file + ) LOGGER.debug('Buffer command: %s', stream_buffer_command) # Generate the final piped command with all the required components - sub_commands = [tap_command, transformation_command, stream_buffer_command, target_command] + sub_commands = [ + tap_command, + transformation_command, + stream_buffer_command, + target_command, + ] command = ' | '.join(list(filter(None, sub_commands))) return command # pylint: disable=too-many-arguments -def build_fastsync_command(tap: TapParams, - target: TargetParams, - transform: TransformParams, - venv_dir: str, - temp_dir: str, - tables: str = None, - profiling_mode: bool = False, - profiling_dir: str = None, - drop_pg_slot: bool = False - ) -> str: +def build_fastsync_command( + tap: TapParams, + target: TargetParams, + transform: TransformParams, + venv_dir: str, + temp_dir: str, + tables: str = None, + 
profiling_mode: bool = False, + profiling_dir: str = None, + drop_pg_slot: bool = False, +) -> str: """ Builds a command that starts fastsync from a given tap to a given target with optional transformations. @@ -273,21 +388,30 @@ def build_fastsync_command(tap: TapParams, fastsync_bin = utils.get_fastsync_bin(venv_dir, tap.type, target.type) ppw_python_bin = utils.get_pipelinewise_python_bin(venv_dir) - command_args = ' '.join(list(filter(None, [ - f'--tap {tap.config}', - f'--properties {tap.properties}', - f'--state {tap.state}', - f'--target {target.config}', - f'--temp_dir {temp_dir}', - f'--transform {transform.config}' if transform.config and os.path.isfile(transform.config) else '', - f'--tables {tables}' if tables else '', - '--drop_pg_slot' if drop_pg_slot else '', - ]))) + command_args = ' '.join( + list( + filter( + None, + [ + f'--tap {tap.config}', + f'--properties {tap.properties}', + f'--state {tap.state}', + f'--target {target.config}', + f'--temp_dir {temp_dir}', + f'--transform {transform.config}' + if transform.config and os.path.isfile(transform.config) + else '', + f'--tables {tables}' if tables else '', + '--drop_pg_slot' if drop_pg_slot else '', + ], + ) + ) + ) command = f'{fastsync_bin} {command_args}' if profiling_mode: - dump_file = os.path.join(profiling_dir, f'fastsync_{tap.id}_{target.id}.pstat') + dump_file = os.path.join(profiling_dir, f'fastsync_{tap.tap_id}_{target.target_id}.pstat') command = f'{ppw_python_bin} -m cProfile -o {dump_file} {command}' LOGGER.debug('FastSync command: %s', command) @@ -338,7 +462,7 @@ def run_command(command: str, log_file: str = None, line_callback: callable = No # Start command with Popen(shlex.split(piped_command), stdout=PIPE, stderr=STDOUT) as proc: - with open(log_file_running, 'a+') as logfile: + with open(log_file_running, 'a+', encoding='utf-8') as logfile: stdout = '' while True: line = proc.stdout.readline() @@ -362,9 +486,11 @@ def run_command(command: str, log_file: str = None, line_callback: callable = No # Raise run command exception errors = ''.join(utils.find_errors_in_log_file(log_file_failed)) - raise RunCommandException(f'Command failed. Return code: {proc_rc}\n' - f'Error(s) found:\n{errors}\n' - f'Full log: {log_file_failed}') + raise RunCommandException( + f'Command failed. Return code: {proc_rc}\n' + f'Error(s) found:\n{errors}\n' + f'Full log: {log_file_failed}' + ) # Add success status to the log file name os.rename(log_file_running, log_file_success) diff --git a/pipelinewise/cli/config.py b/pipelinewise/cli/config.py index 7ba093fb2..5e31ce259 100644 --- a/pipelinewise/cli/config.py +++ b/pipelinewise/cli/config.py @@ -6,6 +6,8 @@ import sys import json +from typing import Dict, List + from pipelinewise.utils import safe_column_name from . 
import utils @@ -22,8 +24,8 @@ def __init__(self, config_dir): self.logger = logging.getLogger(__name__) self.config_dir = config_dir self.config_path = os.path.join(self.config_dir, 'config.json') - self.global_config = dict() - self.targets = dict() + self.global_config = {} + self.targets = {} @classmethod # pylint: disable=too-many-locals @@ -54,10 +56,13 @@ def from_yamls(cls, config_dir, yaml_dir='.', vault_secret=None): utils.validate(instance=global_config, schema=global_config_schema) config.global_config = global_config or {} + # pylint: disable=E1136,E1137 # False positive when loading vault encrypted YAML # Load every target yaml into targets dictionary for yaml_file in target_yamls: config.logger.info('LOADING TARGET: %s', yaml_file) - target_data = utils.load_yaml(os.path.join(yaml_dir, yaml_file), vault_secret) + target_data = utils.load_yaml( + os.path.join(yaml_dir, yaml_file), vault_secret + ) utils.validate(instance=target_data, schema=target_schema) # Add generated extra keys that not available in the YAML @@ -69,7 +74,9 @@ def from_yamls(cls, config_dir, yaml_dir='.', vault_secret=None): config.logger.error('Duplicate target found "%s"', target_id) sys.exit(1) - target_data['files'] = config.get_connector_files(config.get_target_dir(target_id)) + target_data['files'] = config.get_connector_files( + config.get_target_dir(target_id) + ) target_data['taps'] = [] # Add target to list @@ -91,19 +98,26 @@ def from_yamls(cls, config_dir, yaml_dir='.', vault_secret=None): target_id = tap_data['target'] if target_id not in targets: - config.logger.error("Can't find the target with the ID \"%s\" but it's referenced in %s", target_id, - yaml_file) + config.logger.error( + "Can't find the target with the ID \"%s\" but it's referenced in %s", + target_id, + yaml_file, + ) sys.exit(1) # Add generated extra keys that not available in the YAML - tap_data['files'] = config.get_connector_files(config.get_tap_dir(target_id, tap_id)) + tap_data['files'] = config.get_connector_files( + config.get_tap_dir(target_id, tap_id) + ) # Add tap to list taps[tap_id] = tap_data # Link taps to targets - for target_key in targets: - targets[target_key]['taps'] = [tap for tap in taps.values() if tap['target'] == target_key] + for target_key, target in targets.items(): + target['taps'] = [ + tap for tap in taps.values() if tap['target'] == target_key + ] # Final structure is ready config.targets = targets @@ -129,17 +143,20 @@ def get_tap_dir(self, target_id, tap_id): return os.path.join(self.config_dir, target_id, tap_id) @staticmethod - def get_connector_files(connector_dir): + def get_connector_files(connector_dir: str) -> Dict: """ Returns the absolute paths of a tap/target configuration files """ return { 'config': os.path.join(connector_dir, 'config.json'), - 'inheritable_config': os.path.join(connector_dir, 'inheritable_config.json'), + 'inheritable_config': os.path.join( + connector_dir, 'inheritable_config.json' + ), 'properties': os.path.join(connector_dir, 'properties.json'), 'state': os.path.join(connector_dir, 'state.json'), 'transformation': os.path.join(connector_dir, 'transformation.json'), 'selection': os.path.join(connector_dir, 'selection.json'), + 'pidfile': os.path.join(connector_dir, 'pipelinewise.pid'), } def save(self): @@ -159,7 +176,9 @@ def save(self): # Save every tap JSON files for tap in target['taps']: - extra_config_keys = utils.get_tap_extra_config_keys(tap, self.get_temp_dir()) + extra_config_keys = utils.get_tap_extra_config_keys( + tap, self.get_temp_dir() + ) 
self.save_tap_jsons(target, tap, extra_config_keys) def save_main_config_json(self): @@ -173,26 +192,31 @@ def save_main_config_json(self): targets = [] # Generate dictionary for config.json - for key in self.targets: + for target_tuple in self.targets.items(): + target = target_tuple[1] taps = [] - for tap in self.targets[key].get('taps'): - taps.append({ - 'id': tap.get('id'), - 'name': tap.get('name'), - 'type': tap.get('type'), - 'owner': tap.get('owner'), - 'stream_buffer_size': tap.get('stream_buffer_size'), - 'send_alert': tap.get('send_alert', True), - 'enabled': True - }) - - targets.append({ - 'id': self.targets[key].get('id'), - 'name': self.targets[key].get('name'), - 'status': 'ready', - 'type': self.targets[key].get('type'), - 'taps': taps - }) + for tap in target.get('taps'): + taps.append( + { + 'id': tap.get('id'), + 'name': tap.get('name'), + 'type': tap.get('type'), + 'owner': tap.get('owner'), + 'stream_buffer_size': tap.get('stream_buffer_size'), + 'send_alert': tap.get('send_alert', True), + 'enabled': True, + } + ) + + targets.append( + { + 'id': target.get('id'), + 'name': target.get('name'), + 'status': 'ready', + 'type': target.get('type'), + 'taps': taps, + } + ) main_config = {**self.global_config, **{'targets': targets}} # Create config dir if not exists @@ -223,12 +247,9 @@ def save_tap_jsons(self, target, tap, extra_config_keys=None): """ Generating JSON config files for a singer tap connector: 1. config.json :(Singer spec): Tap connection details - 2. properties.json :(Singer spec): Tap schema properties (generated) - 3. state.json :(Singer spec): Bookmark for incremental and log_based - replications 4. selection.json :(Pipelinewise): List of streams/tables to replicate - 5. inheritabel_config.json :(Pipelinewise): Extra config keys for the linked + 5. inheritable_config.json :(Pipelinewise): Extra config keys for the linked singer target connector that pipelinewise will pass at run time 6. 
transformation.json :(Pipelinewise): Column transformations between the @@ -236,6 +257,19 @@ def save_tap_jsons(self, target, tap, extra_config_keys=None): """ if extra_config_keys is None: extra_config_keys = {} + + # Generate tap config dict + tap_config = self.generate_tap_connection_config(tap, extra_config_keys) + + # Generate tap selection + tap_selection = {'selection': self.generate_selection(tap)} + + # Generate tap transformation + tap_transformation = {'transformations': self.generate_transformations(tap)} + + # Generate tap inheritable_config dict + tap_inheritable_config = self.generate_inheritable_config(tap) + tap_dir = self.get_tap_dir(target.get('id'), tap.get('id')) self.logger.info('SAVING TAP JSONS to %s', tap_dir) @@ -249,58 +283,109 @@ def save_tap_jsons(self, target, tap, extra_config_keys=None): if not os.path.exists(tap_dir): os.mkdir(tap_dir) - # Generate tap config dict: a merged dictionary of db_connection and optional extra_keys - tap_config = {**tap.get('db_conn'), **extra_config_keys} + # Save the generated JSON files + utils.save_json(tap_config, tap_config_path) + utils.save_json(tap_inheritable_config, tap_inheritable_config_path) + utils.save_json(tap_transformation, tap_transformation_path) + utils.save_json(tap_selection, tap_selection_path) - # Get additional properties will be needed later to generate tap_stream_id - tap_dbname = tap_config.get('dbname') + @classmethod + def generate_tap_connection_config(cls, tap: Dict, extra_config_keys: Dict) -> Dict: + """ + Generate tap connection config which is a merged dictionary of db_connection and optional extra_keys + Args: + tap: tap config + extra_config_keys: extra keys to add to the db conn config + Returns: Dictionary of tap connection config + """ + return {**tap.get('db_conn'), **extra_config_keys} - # Generate tap selection + @classmethod + def generate_selection(cls, tap: Dict) -> List[Dict]: + """ + Generate the selection data which is the list of selected streams and their replication method + Args: + tap: the tap config dictionary + + Returns: List of dictionaries of selected streams + """ selection = [] + for schema in tap.get('schemas', []): schema_name = schema.get('source_schema') for table in schema.get('tables', []): table_name = table.get('table_name') - replication_method = table.get('replication_method', utils.get_tap_default_replication_method(tap)) - selection.append(utils.delete_empty_keys({ - 'tap_stream_id': utils.get_tap_stream_id(tap, tap_dbname, schema_name, table_name), - 'replication_method': replication_method, + replication_method = table.get( + 'replication_method', utils.get_tap_default_replication_method(tap) + ) + selection.append( + utils.delete_empty_keys( + { + 'tap_stream_id': utils.get_tap_stream_id( + tap, tap['db_conn'].get('dbname'), schema_name, table_name + ), + 'replication_method': replication_method, + # Add replication_key only if replication_method is INCREMENTAL + 'replication_key': table.get('replication_key') + if replication_method == 'INCREMENTAL' else None, + } + ) + ) + + return selection - # Add replication_key only if replication_method is INCREMENTAL - 'replication_key': table.get('replication_key') if replication_method == 'INCREMENTAL' else None - })) - tap_selection = {'selection': selection} + @classmethod + def generate_transformations(cls, tap: Dict) -> List[Dict]: + """ + Generate the transformations data from the given tap config + Args: + tap: the tap config dictionary - # Generate tap transformation + Returns: List of transformations + 
""" transformations = [] + for schema in tap.get('schemas', []): schema_name = schema.get('source_schema') for table in schema.get('tables', []): table_name = table.get('table_name') for trans in table.get('transformations', []): - transformations.append({ - 'tap_stream_name': utils.get_tap_stream_name(tap, tap_dbname, schema_name, table_name), - 'field_id': trans['column'], - # Make column name safe by wrapping it in quotes, it's useful when a field_id is a reserved word - # to be used by target snowflake in fastsync - 'safe_field_id': safe_column_name(trans['column']), - 'type': trans['type'], - 'when': trans.get('when') - }) - tap_transformation = { - 'transformations': transformations - } + transformations.append( + { + 'tap_stream_name': utils.get_tap_stream_name( + tap, tap['db_conn'].get('dbname'), schema_name, table_name), + 'field_id': trans['column'], + # Make column name safe by wrapping it in quotes, it's useful when a field_id is a reserved + # word to be used by target snowflake in fastsync + 'safe_field_id': safe_column_name(trans['column']), + 'field_paths': trans.get('field_paths'), + 'type': trans['type'], + 'when': trans.get('when'), + } + ) + + return transformations + + def generate_inheritable_config(self, tap: Dict) -> Dict: + """ + Generate the inheritable config which is the custom config that should be fed to the target at runtime + Args: + tap: tap config - # Generate stream to schema mapping + Returns: Dictionary of config + """ schema_mapping = {} + for schema in tap.get('schemas', []): source_schema = schema.get('source_schema') target_schema = schema.get('target_schema') - target_schema_select_perms = schema.get('target_schema_select_permissions', []) + target_schema_select_perms = schema.get( + 'target_schema_select_permissions', [] + ) schema_mapping[source_schema] = { 'target_schema': target_schema, - 'target_schema_select_permissions': target_schema_select_perms + 'target_schema_select_permissions': target_schema_select_perms, } # Schema mapping can include list of indices to create. Some target components @@ -317,56 +402,63 @@ def save_tap_jsons(self, target, tap, extra_config_keys=None): schema_mapping[source_schema]['indices'] = indices # Generate tap inheritable_config dict - tap_inheritable_config = utils.delete_empty_keys({ - 'temp_dir': self.get_temp_dir(), - 'tap_id': tap.get('id'), - 'query_tag': json.dumps({ - 'ppw_component': tap.get('type'), + tap_inheritable_config = utils.delete_empty_keys( + { + 'temp_dir': self.get_temp_dir(), 'tap_id': tap.get('id'), - 'database': '{{database}}', - 'schema': '{{schema}}', - 'table': '{{table}}' - }), - 'batch_size_rows': tap.get('batch_size_rows', 20000), - 'batch_wait_limit_seconds': tap.get('batch_wait_limit_seconds', None), - 'parallelism': tap.get('parallelism', 0), - 'parallelism_max': tap.get('parallelism_max', 4), - 'hard_delete': tap.get('hard_delete', True), - 'flush_all_streams': tap.get('flush_all_streams', False), - 'primary_key_required': tap.get('primary_key_required', True), - 'default_target_schema': tap.get('default_target_schema'), - 'default_target_schema_select_permissions': tap.get('default_target_schema_select_permissions'), - 'schema_mapping': schema_mapping, - - # data_flattening_max_level - # ------------------------- - # - # 'data_flattening_max_level' is an optional parameter in some target connectors that specifies - # how to load nested object into destination. 
- # - # We can load the original object represented as JSON or string (data flattening off) or we can - # flatten the schema and data by creating columns automatically. When 'data_flattening_max_level' - # is set to 0 then flattening functionality is turned off. - # - #  The value can be set in mutliple place and evaluated in the following order: - # ------------ - # 1: First we try to find it in the tap YAML - # 2: Second we try to get the tap type specific default value - # 3: Otherwise we set flattening level to 0 (disabled) - 'data_flattening_max_level': tap.get('data_flattening_max_level', - utils.get_tap_property(tap, 'default_data_flattening_max_level') or 0), - 'validate_records': tap.get('validate_records', False), - 'add_metadata_columns': tap.get('add_metadata_columns', False), - 'split_large_files': tap.get('split_large_files', False), - 'split_file_chunk_size_mb': tap.get('split_file_chunk_size_mb', 1000), - 'split_file_max_chunks': tap.get('split_file_max_chunks', 20), - 'archive_load_files': tap.get('archive_load_files', False), - 'archive_load_files_s3_bucket': tap.get('archive_load_files_s3_bucket', None), - 'archive_load_files_s3_prefix': tap.get('archive_load_files_s3_prefix', None) - }) + 'query_tag': json.dumps( + { + 'ppw_component': tap.get('type'), + 'tap_id': tap.get('id'), + 'database': '{{database}}', + 'schema': '{{schema}}', + 'table': '{{table}}', + } + ), + 'batch_size_rows': tap.get('batch_size_rows', 20000), + 'batch_wait_limit_seconds': tap.get('batch_wait_limit_seconds', None), + 'parallelism': tap.get('parallelism', 0), + 'parallelism_max': tap.get('parallelism_max', 4), + 'hard_delete': tap.get('hard_delete', True), + 'flush_all_streams': tap.get('flush_all_streams', False), + 'primary_key_required': tap.get('primary_key_required', True), + 'default_target_schema': tap.get('default_target_schema'), + 'default_target_schema_select_permissions': tap.get( + 'default_target_schema_select_permissions' + ), + 'schema_mapping': schema_mapping, + # data_flattening_max_level + # ------------------------- + # + # 'data_flattening_max_level' is an optional parameter in some target connectors that specifies + # how to load nested object into destination. + # + # We can load the original object represented as JSON or string (data flattening off) or we can + # flatten the schema and data by creating columns automatically. When 'data_flattening_max_level' + # is set to 0 then flattening functionality is turned off. 
+ # + # The value can be set in multiple place and evaluated in the following order: + # ------------ + # 1: First we try to find it in the tap YAML + # 2: Second we try to get the tap type specific default value + # 3: Otherwise we set flattening level to 0 (disabled) + 'data_flattening_max_level': tap.get( + 'data_flattening_max_level', + utils.get_tap_property(tap, 'default_data_flattening_max_level') or 0, + ), + 'validate_records': tap.get('validate_records', False), + 'add_metadata_columns': tap.get('add_metadata_columns', False), + 'split_large_files': tap.get('split_large_files', False), + 'split_file_chunk_size_mb': tap.get('split_file_chunk_size_mb', 1000), + 'split_file_max_chunks': tap.get('split_file_max_chunks', 20), + 'archive_load_files': tap.get('archive_load_files', False), + 'archive_load_files_s3_bucket': tap.get( + 'archive_load_files_s3_bucket', None + ), + 'archive_load_files_s3_prefix': tap.get( + 'archive_load_files_s3_prefix', None + ), + } + ) - # Save the generated JSON files - utils.save_json(tap_config, tap_config_path) - utils.save_json(tap_inheritable_config, tap_inheritable_config_path) - utils.save_json(tap_transformation, tap_transformation_path) - utils.save_json(tap_selection, tap_selection_path) + return tap_inheritable_config diff --git a/pipelinewise/cli/constants.py b/pipelinewise/cli/constants.py new file mode 100644 index 000000000..d9bab0031 --- /dev/null +++ b/pipelinewise/cli/constants.py @@ -0,0 +1,34 @@ +import enum + + +class ConnectorType(enum.Enum): + """ + Enums for various Singer connector type names + """ + + TAP_ADWORDS = 'tap-adwords' + TAP_GITHUB = 'tap-github' + TAP_GOOGLE_ANALYTICS = 'tap-google-analytics' + TAP_JIRA = 'tap-jira' + TAP_KAFKA = 'tap-kafka' + TAP_MIXPANEL = 'tap-mixpanel' + TAP_MONGODB = 'tap-mongodb' + TAP_MYSQL = 'tap-mysql' + TAP_ORACLE = 'tap-oracle' + TAP_POSTGRES = 'tap-postgres' + TAP_S3_CSV = 'tap-s3-csv' + TAP_SALESFORCE = 'tap-salesforce' + TAP_SHOPIFY = 'tap-shopify' + TAP_SLACK = 'tap-slack' + TAP_SNOWFLAKE = 'tap-snowflake' + TAP_TWILIO = 'tap-twilio' + TAP_ZENDESK = 'tap-zendesk' + TAP_ZUORA = 'tap-zuora' + + TARGET_BIGQUERY = 'target-bigquery' + TARGET_POSTGRES = 'target-postgres' + TARGET_SNOWFLAKE = 'target-snowflake' + TARGET_REDSHIFT = 'target-redshift' + TARGET_S3_CSV = 'target-s3-csv' + + TRANSFORM_FIELD = 'transform-field' diff --git a/pipelinewise/cli/errors.py b/pipelinewise/cli/errors.py index 88a93d623..08df9bb55 100644 --- a/pipelinewise/cli/errors.py +++ b/pipelinewise/cli/errors.py @@ -10,6 +10,20 @@ class StreamBufferTooLargeException(Exception): """Raised if stream buffer size is greater than the max allowed size""" def __init__(self, buffer_size, max_buffer_size): - msg = f'{buffer_size}M buffer size is too large. The maximum allowed stream buffer size is ' \ - f'{max_buffer_size}M' + msg = ( + f'{buffer_size}M buffer size is too large. 
The maximum allowed stream buffer size is ' + f'{max_buffer_size}M' + ) super().__init__(msg) + + +class InvalidTransformationException(Exception): + """Raised if invalid transformation config is found""" + + +class DuplicateConfigException(Exception): + """Raised if a duplicate tap/target config is found""" + + +class InvalidConfigException(Exception): + """Raised if an invalid config is found""" diff --git a/pipelinewise/cli/pipelinewise.py b/pipelinewise/cli/pipelinewise.py index 5db67826f..c02e5beb0 100644 --- a/pipelinewise/cli/pipelinewise.py +++ b/pipelinewise/cli/pipelinewise.py @@ -13,16 +13,44 @@ from datetime import datetime from time import time -from typing import Dict, Optional, List +from typing import Dict, Optional, List, Any from joblib import Parallel, delayed, parallel_backend from tabulate import tabulate from . import utils +from .constants import ConnectorType from . import commands from .commands import TapParams, TargetParams, TransformParams from .config import Config from .alert_sender import AlertSender from .alert_handlers.base_alert_handler import BaseAlertHandler +from .errors import InvalidTransformationException, DuplicateConfigException, InvalidConfigException + +FASTSYNC_PAIRS = { + ConnectorType.TAP_MYSQL: { + ConnectorType.TARGET_SNOWFLAKE, + ConnectorType.TARGET_REDSHIFT, + ConnectorType.TARGET_POSTGRES, + ConnectorType.TARGET_BIGQUERY, + }, + ConnectorType.TAP_POSTGRES: { + ConnectorType.TARGET_SNOWFLAKE, + ConnectorType.TARGET_REDSHIFT, + ConnectorType.TARGET_POSTGRES, + ConnectorType.TARGET_BIGQUERY, + }, + ConnectorType.TAP_S3_CSV: { + ConnectorType.TARGET_SNOWFLAKE, + ConnectorType.TARGET_REDSHIFT, + ConnectorType.TARGET_POSTGRES, + ConnectorType.TARGET_BIGQUERY, + }, + ConnectorType.TAP_MONGODB: { + ConnectorType.TARGET_SNOWFLAKE, + ConnectorType.TARGET_POSTGRES, + ConnectorType.TARGET_BIGQUERY, + }, +} # pylint: disable=too-many-lines,too-many-instance-attributes,too-many-public-methods @@ -46,7 +74,9 @@ def __init__(self, args, config_dir, venv_dir, profiling_dir=None): self.config_dir = config_dir self.venv_dir = venv_dir self.extra_log = args.extra_log - self.pipelinewise_bin = os.path.join(self.venv_dir, 'cli', 'bin', 'pipelinewise') + self.pipelinewise_bin = os.path.join( + self.venv_dir, 'cli', 'bin', 'pipelinewise' + ) self.config_path = os.path.join(self.config_dir, 'config.json') self.load_config() self.alert_sender = AlertSender(self.config.get('alert_handlers')) @@ -61,18 +91,21 @@ def __init__(self, args, config_dir, venv_dir, profiling_dir=None): self.target_bin = self.get_connector_bin(self.target['type']) self.target_python_bin = self.get_connector_python_bin(self.target['type']) - self.transform_field_bin = self.get_connector_bin(self.TRANSFORM_FIELD_CONNECTOR_NAME) - self.transform_field_python_bin = self.get_connector_python_bin(self.TRANSFORM_FIELD_CONNECTOR_NAME) + self.transform_field_bin = self.get_connector_bin( + self.TRANSFORM_FIELD_CONNECTOR_NAME + ) + self.transform_field_python_bin = self.get_connector_python_bin( + self.TRANSFORM_FIELD_CONNECTOR_NAME + ) self.tap_run_log_file = None # Catch SIGINT and SIGTERM to exit gracefully for sig in [signal.SIGINT, signal.SIGTERM]: - signal.signal(sig, self._exit_gracefully) + signal.signal(sig, self.stop_tap) - def send_alert(self, - message: str, - level: str = BaseAlertHandler.ERROR, - exc: Exception = None) -> dict: + def send_alert( + self, message: str, level: str = BaseAlertHandler.ERROR, exc: Exception = None + ) -> dict: """ Send alert messages to every alert handler 
if sender is not disabled for the tap @@ -88,7 +121,9 @@ def send_alert(self, send_alert = self.tap.get('send_alert', True) if send_alert: - stats = self.alert_sender.send_to_all_handlers(message=message, level=level, exc=exc) + stats = self.alert_sender.send_to_all_handlers( + message=message, level=level, exc=exc + ) return stats @@ -105,18 +140,27 @@ def create_consumable_target_config(self, target_config, tap_inheritable_config) dict_a.update(dict_b) # Save the new dict as JSON into a temp file - tempfile_path = utils.create_temp_file(dir=self.get_temp_dir(), - prefix='target_config_', - suffix='.json')[1] + tempfile_path = utils.create_temp_file( + dir=self.get_temp_dir(), prefix='target_config_', suffix='.json' + )[1] utils.save_json(dict_a, tempfile_path) return tempfile_path except Exception as exc: - raise Exception(f'Cannot merge JSON files {dict_a} {dict_b} - {exc}') from exc + raise Exception( + f'Cannot merge JSON files {dict_a} {dict_b} - {exc}' + ) from exc # pylint: disable=too-many-statements,too-many-branches,too-many-nested-blocks,too-many-locals,too-many-arguments - def create_filtered_tap_properties(self, target_type, tap_type, tap_properties, tap_state, filters, - create_fallback=False): + def create_filtered_tap_properties( + self, + target_type: ConnectorType, + tap_type: ConnectorType, + tap_properties: str, + tap_state: str, + filters: Dict[str, Any], + create_fallback=False, + ): """ Create a filtered version of tap properties file based on specific filter conditions. @@ -132,11 +176,10 @@ def create_filtered_tap_properties(self, target_type, tap_type, tap_properties, """ # Get filter conditions with default values from input dictionary # Nothing selected by default - f_selected = filters.get('selected', None) - f_target_type = filters.get('target_type', None) - f_tap_type = filters.get('tap_type', None) + f_selected: bool = filters.get('selected', False) + f_tap_target_pairs: Dict = filters.get('tap_target_pairs', {}) f_replication_method = filters.get('replication_method', None) - f_initial_sync_required = filters.get('initial_sync_required', None) + f_initial_sync_required: bool = filters.get('initial_sync_required', False) # Lists of tables that meet and don't meet the filter criteria filtered_tap_stream_ids = [] @@ -152,7 +195,9 @@ def create_filtered_tap_properties(self, target_type, tap_type, tap_properties, fallback_properties = copy.deepcopy(properties) if create_fallback else {} # Foreach stream (table) in the original properties - for stream_idx, stream in enumerate(properties.get('streams', tap_properties)): + for stream_idx, stream in enumerate( + properties.get('streams', tap_properties) + ): initial_sync_required = False # Collect required properties from the properties file @@ -171,7 +216,11 @@ def create_filtered_tap_properties(self, target_type, tap_type, tap_properties, # Can we make sure that the stream has the right metadata? # To be safe, check if no right metadata has been found, then throw an exception. if not table_meta: - self.logger.error('Stream %s has no metadata with no breadcrumbs: %s.', tap_stream_id, metadata) + self.logger.error( + 'Stream %s has no metadata with no breadcrumbs: %s.', + tap_stream_id, + metadata, + ) raise Exception(f'Missing metadata in stream {tap_stream_id}') selected = table_meta.get('selected', False) @@ -180,7 +229,9 @@ def create_filtered_tap_properties(self, target_type, tap_type, tap_properties, # Detect if initial sync is required. 
Look into the state file, get the bookmark # for the current stream (table) and if valid bookmark doesn't exist then # initial sync is required - bookmarks = state.get('bookmarks', {}) if isinstance(state, dict) else {} + bookmarks = ( + state.get('bookmarks', {}) if isinstance(state, dict) else {} + ) new_stream = False @@ -191,7 +242,9 @@ def create_filtered_tap_properties(self, target_type, tap_type, tap_properties, else: stream_bookmark = bookmarks[tap_stream_id] - if self._is_initial_sync_required(replication_method, stream_bookmark): + if self._is_initial_sync_required( + replication_method, stream_bookmark + ): initial_sync_required = True # Compare actual values to the filter conditions. @@ -199,22 +252,39 @@ def create_filtered_tap_properties(self, target_type, tap_type, tap_properties, # Set the "selected" key to False if the actual values don't meet the filter criteria # pylint: disable=too-many-boolean-expressions if ( - (f_selected is None or selected == f_selected) and - (f_target_type is None or target_type in f_target_type) and - (f_tap_type is None or tap_type in f_tap_type) and - (f_replication_method is None or replication_method in f_replication_method) and - (f_initial_sync_required is None or initial_sync_required == f_initial_sync_required) + (f_selected is None or selected == f_selected) + and ( + f_tap_target_pairs is None + or target_type in f_tap_target_pairs.get(tap_type, set()) + ) + and ( + f_replication_method is None + or replication_method in f_replication_method + ) + and ( + f_initial_sync_required is None + or initial_sync_required == f_initial_sync_required + ) ): - self.logger.debug("""Filter condition(s) matched: + self.logger.debug( + """Filter condition(s) matched: Table : %s Tap Stream ID : %s Selected : %s Replication Method : %s Init Sync Required : %s - """, table_name, tap_stream_id, selected, replication_method, initial_sync_required) + """, + table_name, + tap_stream_id, + selected, + replication_method, + initial_sync_required, + ) # Filter condition matched: mark table as selected to sync - properties['streams'][stream_idx]['metadata'][meta_idx]['metadata']['selected'] = True + properties['streams'][stream_idx]['metadata'][meta_idx]['metadata'][ + 'selected' + ] = True filtered_tap_stream_ids.append(tap_stream_id) # Filter condition matched: @@ -222,47 +292,58 @@ def create_filtered_tap_properties(self, target_type, tap_type, tap_properties, # the fallback properties as well if the table is selected in the original properties. 
# Otherwise, mark it as not selected if create_fallback: - if new_stream and replication_method in [self.INCREMENTAL, self.LOG_BASED]: - fallback_properties['streams'][stream_idx]['metadata'][meta_idx]['metadata'][ - 'selected'] = True + if new_stream and replication_method in [ + self.INCREMENTAL, + self.LOG_BASED, + ]: + fallback_properties['streams'][stream_idx]['metadata'][ + meta_idx + ]['metadata']['selected'] = True if selected: fallback_filtered_stream_ids.append(tap_stream_id) else: - fallback_properties['streams'][stream_idx]['metadata'][meta_idx]['metadata'][ - 'selected'] = False + fallback_properties['streams'][stream_idx]['metadata'][ + meta_idx + ]['metadata']['selected'] = False else: # Filter condition didn't match: mark table as not selected to sync - properties['streams'][stream_idx]['metadata'][meta_idx]['metadata']['selected'] = False + properties['streams'][stream_idx]['metadata'][meta_idx]['metadata'][ + 'selected' + ] = False # Filter condition didn't match: mark table as selected to sync in the fallback properties # Fallback only if the table is selected in the original properties if create_fallback and selected is True: - fallback_properties['streams'][stream_idx]['metadata'][meta_idx]['metadata']['selected'] = True + fallback_properties['streams'][stream_idx]['metadata'][ + meta_idx + ]['metadata']['selected'] = True fallback_filtered_stream_ids.append(tap_stream_id) # Save the generated properties file(s) and return # Fallback required: Save filtered and fallback properties JSON if create_fallback: # Save to files: filtered and fallback properties - temp_properties_path = utils.create_temp_file(dir=self.get_temp_dir(), - prefix='properties_', - suffix='.json')[1] + temp_properties_path = utils.create_temp_file( + dir=self.get_temp_dir(), prefix='properties_', suffix='.json' + )[1] utils.save_json(properties, temp_properties_path) - temp_fallback_properties_path = utils.create_temp_file(dir=self.get_temp_dir(), - prefix='properties_', - suffix='.json')[1] + temp_fallback_properties_path = utils.create_temp_file( + dir=self.get_temp_dir(), prefix='properties_', suffix='.json' + )[1] utils.save_json(fallback_properties, temp_fallback_properties_path) - return temp_properties_path, \ - filtered_tap_stream_ids, \ - temp_fallback_properties_path, \ - fallback_filtered_stream_ids + return ( + temp_properties_path, + filtered_tap_stream_ids, + temp_fallback_properties_path, + fallback_filtered_stream_ids, + ) # Fallback not required: Save only the filtered properties JSON - temp_properties_path = utils.create_temp_file(dir=self.get_temp_dir(), - prefix='properties_', - suffix='.json')[1] + temp_properties_path = utils.create_temp_file( + dir=self.get_temp_dir(), prefix='properties_', suffix='.json' + )[1] utils.save_json(properties, temp_properties_path) return temp_properties_path, filtered_tap_stream_ids @@ -318,21 +399,6 @@ def get_connector_python_bin(self, connector_type): """ return os.path.join(self.venv_dir, connector_type, 'bin', 'python') - @classmethod - def get_connector_files(cls, connector_dir): - """ - Get connector file paths - """ - return { - 'config': os.path.join(connector_dir, 'config.json'), - 'inheritable_config': os.path.join(connector_dir, 'inheritable_config.json'), - 'properties': os.path.join(connector_dir, 'properties.json'), - 'state': os.path.join(connector_dir, 'state.json'), - 'transformation': os.path.join(connector_dir, 'transformation.json'), - 'selection': os.path.join(connector_dir, 'selection.json'), - 'pidfile': 
os.path.join(connector_dir, 'pipelinewise.pid') - } - def get_targets(self): """ Get every target @@ -353,14 +419,14 @@ def get_target(self, target_id: str) -> Dict: self.logger.debug('Getting %s target', target_id) targets = self.get_targets() - target = next((item for item in targets if item['id'] == target_id), False) + target = next((item for item in targets if item['id'] == target_id), None) if not target: raise Exception(f'Cannot find {target_id} target') target_dir = self.get_target_dir(target_id) if os.path.isdir(target_dir): - target['files'] = self.get_connector_files(target_dir) + target['files'] = Config.get_connector_files(target_dir) else: raise Exception(f'Cannot find target at {target_dir}') @@ -385,21 +451,21 @@ def get_taps(self, target_id): return taps - def get_tap(self, target_id, tap_id): + def get_tap(self, target_id: str, tap_id: str) -> Dict: """ Get tap by id from a specific target """ self.logger.debug('Getting %s tap from target %s', tap_id, target_id) taps = self.get_taps(target_id) - tap = next((item for item in taps if item['id'] == tap_id), False) + tap = next((item for item in taps if item['id'] == tap_id), None) if not tap: raise Exception(f'Cannot find {tap_id} tap in {target_id} target') tap_dir = self.get_tap_dir(target_id, tap_id) if os.path.isdir(tap_dir): - tap['files'] = self.get_connector_files(tap_dir) + tap['files'] = Config.get_connector_files(tap_dir) else: raise Exception(f'Cannot find tap at {tap_dir}') @@ -409,8 +475,9 @@ def get_tap(self, target_id, tap_id): return tap + # TODO: This method is too complex! make its complexity less than 15! # pylint: disable=too-many-branches,too-many-statements,too-many-nested-blocks,too-many-locals - def merge_schemas(self, old_schema, new_schema): + def merge_schemas(self, old_schema, new_schema): # noqa: C901 """ Merge two schemas """ @@ -424,7 +491,14 @@ def merge_schemas(self, old_schema, new_schema): for new_stream_idx, new_stream in enumerate(new_streams): new_tap_stream_id = new_stream['tap_stream_id'] - old_stream = next((item for item in old_streams if item['tap_stream_id'] == new_tap_stream_id), False) + old_stream = next( + ( + item + for item in old_streams + if item['tap_stream_id'] == new_tap_stream_id + ), + None, + ) # Is this a new stream? 
if not old_stream: @@ -436,54 +510,87 @@ def merge_schemas(self, old_schema, new_schema): new_stream_table_mdata_idx = 0 old_stream_table_mdata_idx = 0 try: - new_stream_table_mdata_idx = \ - [i for i, md in enumerate(new_stream['metadata']) if md['breadcrumb'] == []][0] - old_stream_table_mdata_idx = \ - [i for i, md in enumerate(old_stream['metadata']) if md['breadcrumb'] == []][0] + new_stream_table_mdata_idx = [ + i + for i, md in enumerate(new_stream['metadata']) + if md['breadcrumb'] == [] + ][0] + old_stream_table_mdata_idx = [ + i + for i, md in enumerate(old_stream['metadata']) + if md['breadcrumb'] == [] + ][0] except Exception: pass # Copy is-new flag from the old stream try: - new_schema['streams'][new_stream_idx]['is-new'] = old_stream['is-new'] + new_schema['streams'][new_stream_idx]['is-new'] = old_stream[ + 'is-new' + ] except Exception: pass # Copy selected from the old stream try: - new_schema['streams'][new_stream_idx]['metadata'][new_stream_table_mdata_idx]['metadata'][ - 'selected'] = old_stream['metadata'][old_stream_table_mdata_idx]['metadata']['selected'] + new_schema['streams'][new_stream_idx]['metadata'][ + new_stream_table_mdata_idx + ]['metadata']['selected'] = old_stream['metadata'][ + old_stream_table_mdata_idx + ][ + 'metadata' + ][ + 'selected' + ] except Exception: pass # Copy replication method from the old stream try: - new_schema['streams'][new_stream_idx]['metadata'] \ - [new_stream_table_mdata_idx]['metadata']['replication-method'] = \ - old_stream['metadata'][old_stream_table_mdata_idx]['metadata']['replication-method'] + new_schema['streams'][new_stream_idx]['metadata'][ + new_stream_table_mdata_idx + ]['metadata']['replication-method'] = old_stream['metadata'][ + old_stream_table_mdata_idx + ][ + 'metadata' + ][ + 'replication-method' + ] except Exception: pass # Copy replication key from the old stream try: - new_schema['streams'][new_stream_idx]['metadata'][new_stream_table_mdata_idx] \ - ['metadata']['replication-key'] = \ - old_stream['metadata'][old_stream_table_mdata_idx]['metadata'][ - 'replication-key'] + new_schema['streams'][new_stream_idx]['metadata'][ + new_stream_table_mdata_idx + ]['metadata']['replication-key'] = old_stream['metadata'][ + old_stream_table_mdata_idx + ][ + 'metadata' + ][ + 'replication-key' + ] except Exception: pass # Is this new or modified field? 
- new_fields = new_schema['streams'][new_stream_idx]['schema']['properties'] + new_fields = new_schema['streams'][new_stream_idx]['schema'][ + 'properties' + ] old_fields = old_stream['schema']['properties'] for new_field_key in new_fields: new_field = new_fields[new_field_key] new_field_mdata_idx = -1 # Find new field metadata index - for i, mdata in enumerate(new_schema['streams'][new_stream_idx]['metadata']): - if len(mdata['breadcrumb']) == 2 and mdata['breadcrumb'][0] == 'properties' and \ - mdata['breadcrumb'][1] == new_field_key: + for i, mdata in enumerate( + new_schema['streams'][new_stream_idx]['metadata'] + ): + if ( + len(mdata['breadcrumb']) == 2 + and mdata['breadcrumb'][0] == 'properties' + and mdata['breadcrumb'][1] == new_field_key + ): new_field_mdata_idx = i # Field exists @@ -493,13 +600,19 @@ def merge_schemas(self, old_schema, new_schema): # Find old field metadata index for i, mdata in enumerate(old_stream['metadata']): - if len(mdata['breadcrumb']) == 2 and mdata['breadcrumb'][0] == 'properties' and \ - mdata['breadcrumb'][1] == new_field_key: + if ( + len(mdata['breadcrumb']) == 2 + and mdata['breadcrumb'][0] == 'properties' + and mdata['breadcrumb'][1] == new_field_key + ): old_field_mdata_idx = i - new_mdata = new_schema['streams'][new_stream_idx]['metadata'][new_field_mdata_idx][ - 'metadata'] - old_mdata = old_stream['metadata'][old_field_mdata_idx]['metadata'] + new_mdata = new_schema['streams'][new_stream_idx][ + 'metadata' + ][new_field_mdata_idx]['metadata'] + old_mdata = old_stream['metadata'][old_field_mdata_idx][ + 'metadata' + ] # Copy is-new flag from the old properties try: @@ -521,28 +634,43 @@ def merge_schemas(self, old_schema, new_schema): # Field exists and type is the same - Do nothing more in the schema if new_field == old_field: - self.logger.debug('Field exists in %s stream with the same type: %s: %s', - new_tap_stream_id, new_field_key, new_field) + self.logger.debug( + 'Field exists in %s stream with the same type: %s: %s', + new_tap_stream_id, + new_field_key, + new_field, + ) # Field exists but types are different - Mark the field as modified in the metadata else: - self.logger.debug('Field exists in %s stream but types are different: %s: %s}', - new_tap_stream_id, new_field_key, new_field) + self.logger.debug( + 'Field exists in %s stream but types are different: %s: %s}', + new_tap_stream_id, + new_field_key, + new_field, + ) try: - new_schema['streams'][new_stream_idx]['metadata'][new_field_mdata_idx]['metadata'][ - 'is-modified'] = True - new_schema['streams'][new_stream_idx]['metadata'][new_field_mdata_idx]['metadata'][ - 'is-new'] = False + new_schema['streams'][new_stream_idx]['metadata'][ + new_field_mdata_idx + ]['metadata']['is-modified'] = True + new_schema['streams'][new_stream_idx]['metadata'][ + new_field_mdata_idx + ]['metadata']['is-new'] = False except Exception: pass # New field - Mark the field as new in the metadata else: - self.logger.debug('New field in stream %s: %s: %s', new_tap_stream_id, new_field_key, - new_field) + self.logger.debug( + 'New field in stream %s: %s: %s', + new_tap_stream_id, + new_field_key, + new_field, + ) try: - new_schema['streams'][new_stream_idx]['metadata'][new_field_mdata_idx]['metadata'][ - 'is-new'] = True + new_schema['streams'][new_stream_idx]['metadata'][ + new_field_mdata_idx + ]['metadata']['is-new'] = True except Exception: pass @@ -562,30 +690,54 @@ def make_default_selection(self, schema, selection_file): streams = schema['streams'] for stream_idx, stream in enumerate(streams): 
tap_stream_id = stream.get('tap_stream_id') - tap_stream_sel = False + tap_stream_sel = None for sel in selection: - if 'tap_stream_id' in sel and tap_stream_id.lower() == sel['tap_stream_id'].lower(): + if ( + 'tap_stream_id' in sel + and tap_stream_id.lower() == sel['tap_stream_id'].lower() + ): tap_stream_sel = sel # Find table specific metadata entries in the old and new streams try: - stream_table_mdata_idx = [i for i, md in enumerate(stream['metadata']) if md['breadcrumb'] == []][0] + stream_table_mdata_idx = [ + i + for i, md in enumerate(stream['metadata']) + if md['breadcrumb'] == [] + ][0] except Exception as exc: - raise Exception(f'Metadata of stream {tap_stream_id} doesn\'t have an empty breadcrumb') from exc + raise Exception( + f'Metadata of stream {tap_stream_id} doesn\'t have an empty breadcrumb' + ) from exc if tap_stream_sel: - self.logger.debug('Mark %s tap_stream_id as selected with properties %s', tap_stream_id, - tap_stream_sel) - schema['streams'][stream_idx]['metadata'][stream_table_mdata_idx]['metadata']['selected'] = True + self.logger.debug( + 'Mark %s tap_stream_id as selected with properties %s', + tap_stream_id, + tap_stream_sel, + ) + schema['streams'][stream_idx]['metadata'][stream_table_mdata_idx][ + 'metadata' + ]['selected'] = True if 'replication_method' in tap_stream_sel: - schema['streams'][stream_idx]['metadata'][stream_table_mdata_idx]['metadata'][ - 'replication-method'] = tap_stream_sel['replication_method'] + schema['streams'][stream_idx]['metadata'][ + stream_table_mdata_idx + ]['metadata']['replication-method'] = tap_stream_sel[ + 'replication_method' + ] if 'replication_key' in tap_stream_sel: - schema['streams'][stream_idx]['metadata'][stream_table_mdata_idx]['metadata'][ - 'replication-key'] = tap_stream_sel['replication_key'] + schema['streams'][stream_idx]['metadata'][ + stream_table_mdata_idx + ]['metadata']['replication-key'] = tap_stream_sel[ + 'replication_key' + ] else: - self.logger.debug('Mark %s tap_stream_id as not selected', tap_stream_id) - schema['streams'][stream_idx]['metadata'][stream_table_mdata_idx]['metadata']['selected'] = False + self.logger.debug( + 'Mark %s tap_stream_id as not selected', tap_stream_id + ) + schema['streams'][stream_idx]['metadata'][stream_table_mdata_idx][ + 'metadata' + ]['selected'] = False return schema @@ -599,7 +751,9 @@ def init(self): # Create project dir if not exists if os.path.exists(project_dir): - self.logger.error('Directory exists and cannot create new project: %s', self.args.name) + self.logger.error( + 'Directory exists and cannot create new project: %s', self.args.name + ) sys.exit(1) else: os.mkdir(project_dir) @@ -621,7 +775,13 @@ def test_tap_connection(self): target_id = self.target['id'] target_type = self.target['type'] - self.logger.info('Testing %s (%s) tap connection in %s (%s) target', tap_id, tap_type, target_id, target_type) + self.logger.info( + 'Testing %s (%s) tap connection in %s (%s) target', + tap_id, + tap_type, + target_id, + target_type, + ) # Generate and run the command to run the tap directly # We will use the discover option to test connection @@ -639,15 +799,21 @@ def test_tap_connection(self): returncode, new_schema, tap_output = result if returncode != 0: - self.logger.error('Testing tap connection (%s - %s) FAILED', target_id, tap_id) + self.logger.error( + 'Testing tap connection (%s - %s) FAILED', target_id, tap_id + ) sys.exit(1) # If the connection success then the response needs to be a valid JSON string if not utils.is_json(new_schema): - 
self.logger.error('Schema discovered by %s (%s) is not a valid JSON.', tap_id, tap_type) + self.logger.error( + 'Schema discovered by %s (%s) is not a valid JSON.', tap_id, tap_type + ) sys.exit(1) else: - self.logger.info('Testing tap connection (%s - %s) PASSED', target_id, tap_id) + self.logger.info( + 'Testing tap connection (%s - %s) PASSED', target_id, tap_id + ) # pylint: disable=too-many-locals,inconsistent-return-statements def discover_tap(self, tap=None, target=None): @@ -673,7 +839,13 @@ def discover_tap(self, tap=None, target=None): target_id = target.get('id') target_type = target.get('type') - self.logger.info('Discovering %s (%s) tap in %s (%s) target...', tap_id, tap_type, target_id, target_type) + self.logger.info( + 'Discovering %s (%s) tap in %s (%s) target...', + tap_id, + tap_type, + target_id, + target_type, + ) # Generate and run the command to run the tap directly command = f'{tap_bin} --config {tap_config_file} --discover' @@ -709,25 +881,33 @@ def discover_tap(self, tap=None, target=None): # Make selection from selection.json if exists try: - schema_with_diff = self.make_default_selection(schema_with_diff, tap_selection_file) + schema_with_diff = self.make_default_selection( + schema_with_diff, tap_selection_file + ) schema_with_diff = utils.delete_keys_from_dict( self.make_default_selection(schema_with_diff, tap_selection_file), - # Removing multipleOf json schema validations from properties.json, # that's causing run time issues - ['multipleOf']) + ['multipleOf'], + ) except Exception as exc: return f'Cannot load selection JSON at {tap_selection_file}. {str(exc)}' # Post import checks - post_import_errors = self._run_post_import_tap_checks(tap, schema_with_diff, target_id) + post_import_errors = self._run_post_import_tap_checks( + tap, schema_with_diff, target_id + ) if len(post_import_errors) > 0: - return f'Post import tap checks failed in tap {tap_id}: {post_import_errors}' + return ( + f'Post import tap checks failed in tap {tap_id}: {post_import_errors}' + ) # Save the new catalog into the tap try: - self.logger.info('Writing new properties file with changes into %s', tap_properties_file) + self.logger.info( + 'Writing new properties file with changes into %s', tap_properties_file + ) utils.save_json(schema_with_diff, tap_properties_file) except Exception as exc: return f'Cannot save file. 
{str(exc)}' @@ -739,11 +919,11 @@ def detect_tap_status(self, target_id, tap_id): self.logger.debug('Detecting %s tap status in %s target', tap_id, target_id) tap_dir = self.get_tap_dir(target_id, tap_id) log_dir = self.get_tap_log_dir(target_id, tap_id) - connector_files = self.get_connector_files(tap_dir) + connector_files = Config.get_connector_files(tap_dir) status = { 'currentStatus': 'unknown', 'lastStatus': 'unknown', - 'lastTimestamp': None + 'lastTimestamp': None, } # Tap exists but configuration not completed @@ -751,7 +931,10 @@ def detect_tap_status(self, target_id, tap_id): status['currentStatus'] = 'not-configured' # Tap exists and has log in running status - elif os.path.isdir(log_dir) and len(utils.search_files(log_dir, patterns=['*.log.running'])) > 0: + elif ( + os.path.isdir(log_dir) + and len(utils.search_files(log_dir, patterns=['*.log.running'])) > 0 + ): status['currentStatus'] = 'running' # Configured and not running @@ -760,7 +943,9 @@ def detect_tap_status(self, target_id, tap_id): # Get last run instance if os.path.isdir(log_dir): - log_files = utils.search_files(log_dir, patterns=['*.log.success', '*.log.failed'], sort=True) + log_files = utils.search_files( + log_dir, patterns=['*.log.success', '*.log.failed'], sort=True + ) if len(log_files) > 0: last_log_file = log_files[0] log_attr = utils.extract_log_attributes(last_log_file) @@ -783,7 +968,7 @@ def status(self): 'Enabled', 'Status', 'Last Sync', - 'Last Sync Result' + 'Last Sync Result', ] tab_body = [] pipelines = 0 @@ -791,44 +976,55 @@ def status(self): taps = self.get_taps(target['id']) for tap in taps: - tab_body.append([ - tap.get('id', ''), - tap.get('type', ''), - target.get('id', ''), - target.get('type', ''), - tap.get('enabled', ''), - tap.get('status', {}).get('currentStatus', ''), - tap.get('status', {}).get('lastTimestamp', ''), - tap.get('status', {}).get('lastStatus', '') - ]) + tab_body.append( + [ + tap.get('id', ''), + tap.get('type', ''), + target.get('id', ''), + target.get('type', ''), + tap.get('enabled', ''), + tap.get('status', {}).get('currentStatus', ''), + tap.get('status', {}).get('lastTimestamp', ''), + tap.get('status', {}).get('lastStatus', ''), + ] + ) pipelines += 1 print(tabulate(tab_body, headers=tab_headers, tablefmt='simple')) print(f'{pipelines} pipeline(s)') - def run_tap_singer(self, - tap: TapParams, - target: TargetParams, - transform: TransformParams, - stream_buffer_size: int = 0) -> str: + def run_tap_singer( + self, + tap: TapParams, + target: TargetParams, + transform: TransformParams, + stream_buffer_size: int = 0, + ) -> str: """ Generate and run piped shell command to sync tables using singer taps and targets """ # Build the piped executable command - command = commands.build_singer_command(tap=tap, - target=target, - transform=transform, - stream_buffer_size=stream_buffer_size, - stream_buffer_log_file=self.tap_run_log_file, - profiling_mode=self.profiling_mode, - profiling_dir=self.profiling_dir) + command = commands.build_singer_command( + tap=tap, + target=target, + transform=transform, + stream_buffer_size=stream_buffer_size, + stream_buffer_log_file=self.tap_run_log_file, + profiling_mode=self.profiling_mode, + profiling_dir=self.profiling_dir, + ) # Do not run if another instance is already running log_dir = os.path.dirname(self.tap_run_log_file) - if os.path.isdir(log_dir) and len(utils.search_files(log_dir, patterns=['*.log.running'])) > 0: + if ( + os.path.isdir(log_dir) + and len(utils.search_files(log_dir, patterns=['*.log.running'])) > 0 + ): 
self.logger.info( 'Failed to run. Another instance of the same tap is already running. ' - 'Log file detected in running status at %s', log_dir) + 'Log file detected in running status at %s', + log_dir, + ) sys.exit(1) start = None @@ -842,7 +1038,7 @@ def update_state_file(line: str) -> str: nonlocal start, state if start is None or time() - start >= 2: - with open(tap.state, 'w') as state_file: + with open(tap.state, 'w', encoding='utf-8') as state_file: state_file.write(line) # Update start time to be the current time. @@ -864,36 +1060,47 @@ def update_state_file_with_extra_log(line: str) -> str: # Run command with update_state_file as a callback to call for every stdout line if self.extra_log: - commands.run_command(command, self.tap_run_log_file, update_state_file_with_extra_log) + commands.run_command( + command, self.tap_run_log_file, update_state_file_with_extra_log + ) else: commands.run_command(command, self.tap_run_log_file, update_state_file) # update the state file one last time to make sure it always has the last state message. if state is not None: - with open(tap.state, 'w') as statefile: + with open(tap.state, 'w', encoding='utf-8') as statefile: statefile.write(state) - def run_tap_fastsync(self, tap: TapParams, target: TargetParams, transform: TransformParams): + def run_tap_fastsync( + self, tap: TapParams, target: TargetParams, transform: TransformParams + ): """ Generating and running shell command to sync tables using the native fastsync components """ # Build the fastsync executable command - command = commands.build_fastsync_command(tap=tap, - target=target, - transform=transform, - venv_dir=self.venv_dir, - temp_dir=self.get_temp_dir(), - tables=self.args.tables, - profiling_mode=self.profiling_mode, - profiling_dir=self.profiling_dir, - drop_pg_slot=self.drop_pg_slot) + command = commands.build_fastsync_command( + tap=tap, + target=target, + transform=transform, + venv_dir=self.venv_dir, + temp_dir=self.get_temp_dir(), + tables=self.args.tables, + profiling_mode=self.profiling_mode, + profiling_dir=self.profiling_dir, + drop_pg_slot=self.drop_pg_slot, + ) # Do not run if another instance is already running log_dir = os.path.dirname(self.tap_run_log_file) - if os.path.isdir(log_dir) and len(utils.search_files(log_dir, patterns=['*.log.running'])) > 0: + if ( + os.path.isdir(log_dir) + and len(utils.search_files(log_dir, patterns=['*.log.running'])) > 0 + ): self.logger.info( 'Failed to run. Another instance of the same tap is already running. ' - 'Log file detected in running status at %s', log_dir) + 'Log file detected in running status at %s', + log_dir, + ) sys.exit(1) # Fastsync is running in subprocess. 
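The update_state_file callback above throttles disk writes: a state line coming from the target is persisted at most every couple of seconds, and the very last state message is written once more after the run finishes. A small self-contained sketch of that pattern, with the two-second interval mirroring the code above and the helper name being illustrative only:

from time import time


def make_throttled_writer(path: str, interval_seconds: float = 2.0):
    """Return (write, flush) callbacks that persist lines to `path` at most every `interval_seconds`."""
    last_write = None
    latest_line = None

    def write(line: str) -> None:
        nonlocal last_write, latest_line
        latest_line = line
        if last_write is None or time() - last_write >= interval_seconds:
            with open(path, 'w', encoding='utf-8') as state_file:
                state_file.write(line)
            last_write = time()

    def flush() -> None:
        # Final write so the file always ends up holding the last state message seen
        if latest_line is not None:
            with open(path, 'w', encoding='utf-8') as state_file:
                state_file.write(latest_line)

    return write, flush


write_state, flush_state = make_throttled_writer('state.json')
for message in ('{"bookmarks": {"table_a": {"lsn": 1}}}', '{"bookmarks": {"table_a": {"lsn": 2}}}'):
    write_state(message)
flush_state()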
@@ -905,7 +1112,9 @@ def add_fastsync_output_to_main_logger(line: str) -> str: if self.extra_log: # Run command and copy fastsync output to main logger - commands.run_command(command, self.tap_run_log_file, add_fastsync_output_to_main_logger) + commands.run_command( + command, self.tap_run_log_file, add_fastsync_output_to_main_logger + ) else: # Run command commands.run_command(command, self.tap_run_log_file) @@ -934,7 +1143,9 @@ def run_tap(self): tap_type = self.tap['type'] target_id = self.target['id'] target_type = self.target['type'] - stream_buffer_size = self.tap.get('stream_buffer_size', commands.DEFAULT_STREAM_BUFFER_SIZE) + stream_buffer_size = self.tap.get( + 'stream_buffer_size', commands.DEFAULT_STREAM_BUFFER_SIZE + ) self.logger.info('Running %s tap in %s target', tap_id, target_id) @@ -959,7 +1170,9 @@ def run_tap(self): # Some target attributes can be passed and override by tap (aka. inheritable config) # We merge the two configs and use that with the target - cons_target_config = self.create_consumable_target_config(target_config, tap_inheritable_config) + cons_target_config = self.create_consumable_target_config( + target_config, tap_inheritable_config + ) # Output will be redirected into target and tap specific log directory log_dir = self.get_tap_log_dir(target_id, tap_id) @@ -971,129 +1184,166 @@ def run_tap(self): tap_properties_fastsync, fastsync_stream_ids, tap_properties_singer, - singer_stream_ids + singer_stream_ids, ) = self.create_filtered_tap_properties( - target_type, - tap_type, + ConnectorType(target_type), + ConnectorType(tap_type), tap_properties, - tap_state, { + tap_state, + { 'selected': True, - 'target_type': ['target-snowflake', 'target-redshift', 'target-postgres', 'target-bigquery'], - 'tap_type': ['tap-mysql', 'tap-postgres', 'tap-s3-csv', 'tap-mongodb'], - 'initial_sync_required': True + 'tap_target_pairs': FASTSYNC_PAIRS, + 'initial_sync_required': True, }, - create_fallback=True) + create_fallback=True, + ) start_time = datetime.now() try: with pidfile.PIDFile(self.tap['files']['pidfile']): - target_params = TargetParams(id=target_id, - type=target_type, - bin=self.target_bin, - python_bin=self.target_python_bin, - config=cons_target_config) - - transform_params = TransformParams(bin=self.transform_field_bin, - python_bin=self.transform_field_python_bin, - config=tap_transformation, - tap_id=tap_id, - target_id=target_id) + target_params = TargetParams( + target_id=target_id, + type=target_type, + bin=self.target_bin, + python_bin=self.target_python_bin, + config=cons_target_config, + ) + + transform_params = TransformParams( + bin=self.transform_field_bin, + python_bin=self.transform_field_python_bin, + config=tap_transformation, + tap_id=tap_id, + target_id=target_id, + ) # Run fastsync for FULL_TABLE replication method if len(fastsync_stream_ids) > 0: - self.logger.info('Table(s) selected to sync by fastsync: %s', fastsync_stream_ids) - self.tap_run_log_file = os.path.join(log_dir, f'{target_id}-{tap_id}-{current_time}.fastsync.log') - tap_params = TapParams(id=tap_id, - type=tap_type, - bin=self.tap_bin, - python_bin=self.tap_python_bin, - config=tap_config, - properties=tap_properties_fastsync, - state=tap_state) - - self.run_tap_fastsync(tap=tap_params, - target=target_params, - transform=transform_params) + self.logger.info( + 'Table(s) selected to sync by fastsync: %s', fastsync_stream_ids + ) + self.tap_run_log_file = os.path.join( + log_dir, f'{target_id}-{tap_id}-{current_time}.fastsync.log' + ) + tap_params = TapParams( + 
tap_id=tap_id, + type=tap_type, + bin=self.tap_bin, + python_bin=self.tap_python_bin, + config=tap_config, + properties=tap_properties_fastsync, + state=tap_state, + ) + + self.run_tap_fastsync( + tap=tap_params, target=target_params, transform=transform_params + ) else: - self.logger.info('No table available that needs to be sync by fastsync') + self.logger.info( + 'No table available that needs to be sync by fastsync' + ) # Run singer tap for INCREMENTAL and LOG_BASED replication methods if len(singer_stream_ids) > 0: - self.logger.info('Table(s) selected to sync by singer: %s', singer_stream_ids) - self.tap_run_log_file = os.path.join(log_dir, f'{target_id}-{tap_id}-{current_time}.singer.log') - tap_params = TapParams(id=tap_id, - type=tap_type, - bin=self.tap_bin, - python_bin=self.tap_python_bin, - config=tap_config, - properties=tap_properties_singer, - state=tap_state) - - self.run_tap_singer(tap=tap_params, - target=target_params, - transform=transform_params, - stream_buffer_size=stream_buffer_size) + self.logger.info( + 'Table(s) selected to sync by singer: %s', singer_stream_ids + ) + self.tap_run_log_file = os.path.join( + log_dir, f'{target_id}-{tap_id}-{current_time}.singer.log' + ) + tap_params = TapParams( + tap_id=tap_id, + type=tap_type, + bin=self.tap_bin, + python_bin=self.tap_python_bin, + config=tap_config, + properties=tap_properties_singer, + state=tap_state, + ) + + self.run_tap_singer( + tap=tap_params, + target=target_params, + transform=transform_params, + stream_buffer_size=stream_buffer_size, + ) else: - self.logger.info('No table available that needs to be sync by singer') + self.logger.info( + 'No table available that needs to be sync by singer' + ) except pidfile.AlreadyRunningError: self.logger.error('Another instance of the tap is already running.') - utils.silentremove(cons_target_config) - utils.silentremove(tap_properties_fastsync) - utils.silentremove(tap_properties_singer) sys.exit(1) # Delete temp files if there is any except commands.RunCommandException as exc: self.logger.exception(exc) - utils.silentremove(cons_target_config) - utils.silentremove(tap_properties_fastsync) - utils.silentremove(tap_properties_singer) self._print_tap_run_summary(self.STATUS_FAILED, start_time, datetime.now()) self.send_alert(message=f'{tap_id} tap failed', exc=exc) sys.exit(1) except Exception as exc: - utils.silentremove(cons_target_config) - utils.silentremove(tap_properties_fastsync) - utils.silentremove(tap_properties_singer) self._print_tap_run_summary(self.STATUS_FAILED, start_time, datetime.now()) self.send_alert(message=f'{tap_id} tap failed', exc=exc) raise exc - - utils.silentremove(cons_target_config) - utils.silentremove(tap_properties_fastsync) - utils.silentremove(tap_properties_singer) + finally: + utils.silentremove(cons_target_config) + utils.silentremove(tap_properties_fastsync) + utils.silentremove(tap_properties_singer) self._print_tap_run_summary(self.STATUS_SUCCESS, start_time, datetime.now()) - def stop_tap(self): + # pylint: disable=unused-argument + def stop_tap(self, sig=None, frame=None): """ Stop running tap The command finds the tap specific pidfile that was created by run_tap command and sends - a SIGINT to the process. The SIGINT signal triggers _exit_gracefully function automatically and - the tap stops running. + a SIGTERM to the process. 
""" + self.logger.info('Trying to stop tap gracefully...') pidfile_path = self.tap['files']['pidfile'] try: - with open(pidfile_path) as pidf: + with open(pidfile_path, encoding='utf-8') as pidf: pid = int(pidf.read()) + pgid = os.getpgid(pid) parent = psutil.Process(pid) - # Terminate child processes + # Terminate all the processes in the current process' process group. for child in parent.children(recursive=True): - self.logger.info('Sending SIGINT to child pid %s...', child.pid) - child.send_signal(signal.SIGINT) + if os.getpgid(child.pid) == pgid: + self.logger.info('Sending SIGTERM to child pid %s...', child.pid) + child.terminate() + try: + child.wait(timeout=5) + except psutil.TimeoutExpired: + child.kill() - # Terminate main process - self.logger.info('Sending SIGINT to main pid %s...', parent.pid) - parent.send_signal(signal.SIGINT) except ProcessLookupError: - self.logger.error('Pid %s not found. Is the tap running on this machine? ' - 'Stopping taps remotely is not supported.', pid) + self.logger.error( + 'Pid %s not found. Is the tap running on this machine? ' + 'Stopping taps remotely is not supported.', + pid, + ) sys.exit(1) + except FileNotFoundError: - self.logger.error('No pidfile found at %s. Tap does not seem to be running.', pidfile_path) + self.logger.error( + 'No pidfile found at %s. Tap does not seem to be running.', pidfile_path + ) sys.exit(1) + # Remove pidfile. + os.remove(pidfile_path) + + # Rename log files from running to terminated status + if self.tap_run_log_file: + tap_run_log_file_running = f'{self.tap_run_log_file}.running' + tap_run_log_file_terminated = f'{self.tap_run_log_file}.terminated' + + if os.path.isfile(tap_run_log_file_running): + os.rename(tap_run_log_file_running, tap_run_log_file_terminated) + + sys.exit(1) + # pylint: disable=too-many-locals def sync_tables(self): """ @@ -1110,7 +1360,13 @@ def sync_tables(self): target_type = self.target['type'] fastsync_bin = utils.get_fastsync_bin(self.venv_dir, tap_type, target_type) - self.logger.info('Syncing tables from %s (%s) to %s (%s)...', tap_id, tap_type, target_id, target_type) + self.logger.info( + 'Syncing tables from %s (%s) to %s (%s)...', + tap_id, + tap_type, + target_id, + target_type, + ) # Run only if tap enabled if not self.tap.get('enabled', False): @@ -1120,16 +1376,23 @@ def sync_tables(self): # Run only if tap not running tap_status = self.detect_tap_status(target_id, tap_id) if tap_status['currentStatus'] == 'running': - self.logger.info('Tap %s is currently running and cannot sync. Stop the tap and try again.', - self.tap['name']) + self.logger.info( + 'Tap %s is currently running and cannot sync. Stop the tap and try again.', + self.tap['name'], + ) sys.exit(1) # Tap exists but configuration not completed if not os.path.isfile(fastsync_bin): - self.logger.error('Table sync function is not implemented from %s datasources to %s type of targets', - tap_type, target_type) + self.logger.error( + 'Table sync function is not implemented from %s datasources to %s type of targets', + tap_type, + target_type, + ) sys.exit(1) + self._cleanup_tap_state_file() + # Generate and run the command to run the tap directly tap_config = self.tap['files']['config'] tap_inheritable_config = self.tap['files']['inheritable_config'] @@ -1144,7 +1407,9 @@ def sync_tables(self): # Some target attributes can be passed and override by tap (aka. 
inheritable config) # We merge the two configs and use that with the target - cons_target_config = self.create_consumable_target_config(target_config, tap_inheritable_config) + cons_target_config = self.create_consumable_target_config( + target_config, tap_inheritable_config + ) # Output will be redirected into target and tap specific log directory log_dir = self.get_tap_log_dir(target_id, tap_id) @@ -1153,24 +1418,27 @@ def sync_tables(self): # sync_tables command always using fastsync try: with pidfile.PIDFile(self.tap['files']['pidfile']): - self.tap_run_log_file = os.path.join(log_dir, f'{target_id}-{tap_id}-{current_time}.fastsync.log') + self.tap_run_log_file = os.path.join( + log_dir, f'{target_id}-{tap_id}-{current_time}.fastsync.log' + ) # Create parameters as NamedTuples tap_params = TapParams( - id=tap_id, + tap_id=tap_id, type=tap_type, bin=self.tap_bin, python_bin=self.tap_python_bin, config=tap_config, properties=tap_properties, - state=tap_state) + state=tap_state, + ) target_params = TargetParams( - id=target_id, + target_id=target_id, type=target_type, bin=self.target_bin, python_bin=self.target_python_bin, - config=cons_target_config + config=cons_target_config, ) transform_params = TransformParams( @@ -1178,29 +1446,26 @@ def sync_tables(self): config=tap_transformation, python_bin=self.transform_field_python_bin, tap_id=tap_id, - target_id=target_id + target_id=target_id, ) - self.run_tap_fastsync(tap=tap_params, - target=target_params, - transform=transform_params) + self.run_tap_fastsync( + tap=tap_params, target=target_params, transform=transform_params + ) except pidfile.AlreadyRunningError: self.logger.error('Another instance of the tap is already running.') - utils.silentremove(cons_target_config) sys.exit(1) # Delete temp file if there is any except commands.RunCommandException as exc: self.logger.exception(exc) - utils.silentremove(cons_target_config) self.send_alert(message=f'Failed to sync tables in {tap_id} tap', exc=exc) sys.exit(1) except Exception as exc: - utils.silentremove(cons_target_config) self.send_alert(message=f'Failed to sync tables in {tap_id} tap', exc=exc) raise exc - - utils.silentremove(cons_target_config) + finally: + utils.silentremove(cons_target_config) def validate(self): """ @@ -1209,6 +1474,7 @@ def validate(self): yaml_dir = self.args.dir self.logger.info('Searching YAML config files in %s', yaml_dir) tap_yamls, target_yamls = utils.get_tap_target_names(yaml_dir) + self.logger.info('Detected taps: %s', tap_yamls) self.logger.info('Detected targets: %s', target_yamls) @@ -1217,37 +1483,74 @@ def validate(self): vault_secret = self.args.secret - target_ids = set() + # dictionary of targets ID and type + targets = {} + # Validate target json schemas and that no duplicate IDs exist for yaml_file in target_yamls: - self.logger.info('Started validating %s', yaml_file) - loaded_yaml = utils.load_yaml(os.path.join(yaml_dir, yaml_file), vault_secret) - utils.validate(loaded_yaml, target_schema) + self.logger.info('Started validating target file: %s', yaml_file) - if loaded_yaml['id'] in target_ids: - self.logger.error('Duplicate target found "%s"', loaded_yaml['id']) - sys.exit(1) + # pylint: disable=E1136 # False positive when loading vault encrypted YAML + target_yml = utils.load_yaml(os.path.join(yaml_dir, yaml_file), vault_secret) + utils.validate(target_yml, target_schema) - target_ids.add(loaded_yaml['id']) - self.logger.info('Finished validating %s', yaml_file) + if target_yml['id'] in targets: + raise 
DuplicateConfigException(f'Duplicate target found "{target_yml["id"]}"') + + targets[target_yml['id']] = target_yml['type'] + + self.logger.info('Finished validating target file: %s', yaml_file) tap_ids = set() + # Validate tap json schemas, check that every tap has valid 'target' and that no duplicate IDs exist for yaml_file in tap_yamls: - self.logger.info('Started validating %s', yaml_file) - loaded_yaml = utils.load_yaml(os.path.join(yaml_dir, yaml_file), vault_secret) - utils.validate(loaded_yaml, tap_schema) - - if loaded_yaml['id'] in tap_ids: - self.logger.error('Duplicate tap found "%s"', loaded_yaml['id']) - sys.exit(1) - - if loaded_yaml['target'] not in target_ids: - self.logger.error("Can'f find the target with the ID '%s' referenced in '%s'. Available target IDs: %s", - loaded_yaml['target'], yaml_file, target_ids) - sys.exit(1) + self.logger.info('Started validating %s ...', yaml_file) + + # pylint: disable=E1136 # False positive when loading vault encrypted YAML + tap_yml = utils.load_yaml(os.path.join(yaml_dir, yaml_file), vault_secret) + utils.validate(tap_yml, tap_schema) + + if tap_yml['id'] in tap_ids: + raise DuplicateConfigException(f'Duplicate tap found "{tap_yml["id"]}"') + + if tap_yml['target'] not in targets: + raise InvalidConfigException( + f"Can't find the target with the ID '{tap_yml['target']}' referenced in '{yaml_file}'." + f'Available target IDs: {list(targets.keys())}', + ) + + tap_ids.add(tap_yml['id']) + + # If there is a fastsync component for this tap-target combo and transformations on json properties are + # configured then fail the validation. + # The reason being that at the time of writing this, transformations in Fastsync are done on the + # target side using mostly SQL UPDATE, and transformations on properties in json fields are not + # implemented due to the need of converting XPATH syntax to SQL which has been deemed as not worth it + if self.__does_fastsync_component_exist(targets[tap_yml['target']], tap_yml['type']): + self.logger.debug('FastSync component found for tap %s', tap_yml['id']) + + # Load the transformations + transformations = Config.generate_transformations(tap_yml) + + # check if transformations are using "field_paths" or "field_path" config, fail if so + for transformation in transformations: + if transformation.get('field_paths') is not None: + raise InvalidTransformationException( + 'This tap-target combo has FastSync component and is configuring a transformation on json ' + 'properties which are not supported by FastSync!\n' + f'Please omit "field_paths" from the transformation config of tap "{tap_yml["id"]}"' + ) + + if transformation['when'] is not None: + for condition in transformation['when']: + if condition.get('field_path') is not None: + raise InvalidTransformationException( + 'This tap-target combo has FastSync component and is configuring a transformation ' + 'conditions on json properties which are not supported by FastSync!\n' + f'Please omit "field_path" from the transformation config of tap "{tap_yml["id"]}"' + ) - tap_ids.add(loaded_yaml['id']) self.logger.info('Finished validating %s', yaml_file) self.logger.info('Validation successful') @@ -1285,11 +1588,17 @@ def import_project(self): with parallel_backend('threading', n_jobs=-1): # Discover taps in parallel and return the list of exception of the failed ones - discover_excs.extend(list(filter(None, - Parallel(verbose=100)(delayed(self.discover_tap)( - tap=tap, - target=target - ) for tap in target.get('taps'))))) + discover_excs.extend( + list( + 
filter( + None, + Parallel(verbose=100)( + delayed(self.discover_tap)(tap=tap, target=target) + for tap in target.get('taps') + ), + ) + ) + ) # Log summary end_time = datetime.now() @@ -1310,7 +1619,7 @@ def import_project(self): total_taps, total_taps - len(discover_excs), str(discover_excs), - end_time - start_time + end_time - start_time, ) if len(discover_excs) > 0: sys.exit(1) @@ -1325,7 +1634,9 @@ def encrypt_string(self): print(yaml_text) print('Encryption successful') - def _is_initial_sync_required(self, replication_method: str, stream_bookmark: Dict) -> bool: + def _is_initial_sync_required( + self, replication_method: str, stream_bookmark: Dict + ) -> bool: """ Detects if a stream needs initial sync or not. Initial sync is required for INCREMENTAL and LOG_BASED tables @@ -1343,28 +1654,20 @@ def _is_initial_sync_required(self, replication_method: str, stream_bookmark: Di :param stream_bookmark: stream state bookmark :return: Boolean, True if needs initial sync, False otherwise """ - return replication_method == self.FULL_TABLE \ - or (replication_method == self.INCREMENTAL and - 'replication_key_value' not in stream_bookmark and - 'modified_since' not in stream_bookmark) \ - or (replication_method == self.LOG_BASED and - 'lsn' not in stream_bookmark and - 'log_pos' not in stream_bookmark and - 'token' not in stream_bookmark) - - # pylint: disable=unused-argument - def _exit_gracefully(self, sig, frame, exit_code=1): - self.logger.info('Stopping gracefully...') - - # Rename log files from running to terminated status - if self.tap_run_log_file: - tap_run_log_file_running = f'{self.tap_run_log_file}.running' - tap_run_log_file_terminated = f'{self.tap_run_log_file}.terminated' - - if os.path.isfile(tap_run_log_file_running): - os.rename(tap_run_log_file_running, tap_run_log_file_terminated) - - sys.exit(exit_code) + return ( + replication_method == self.FULL_TABLE + or ( + replication_method == self.INCREMENTAL + and 'replication_key_value' not in stream_bookmark + and 'modified_since' not in stream_bookmark + ) + or ( + replication_method == self.LOG_BASED + and 'lsn' not in stream_bookmark + and 'log_pos' not in stream_bookmark + and 'token' not in stream_bookmark + ) + ) def _print_tap_run_summary(self, status, start_time, end_time): summary = f""" @@ -1393,11 +1696,13 @@ def _print_tap_run_summary(self, status, start_time, end_time): # Append the summary to the right log file if log_file_to_write_summary: - with open(log_file_to_write_summary, 'a') as logfile: + with open(log_file_to_write_summary, 'a', encoding='utf-8') as logfile: logfile.write(summary) # pylint: disable=unused-variable - def _run_post_import_tap_checks(self, tap: Dict, catalog: Dict, target_id: str) -> List: + def _run_post_import_tap_checks( + self, tap: Dict, catalog: Dict, target_id: str + ) -> List: """ Run post import checks on a tap. 
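As a concrete illustration of the bookmark-based rule in _is_initial_sync_required above: FULL_TABLE streams always resync, while INCREMENTAL and LOG_BASED streams only resync when no usable bookmark exists in the state. The standalone version below mirrors the method, with module-level constants standing in for the class attributes and a few example bookmarks:

FULL_TABLE, INCREMENTAL, LOG_BASED = 'FULL_TABLE', 'INCREMENTAL', 'LOG_BASED'


def is_initial_sync_required(replication_method: str, stream_bookmark: dict) -> bool:
    return (
        replication_method == FULL_TABLE
        or (
            replication_method == INCREMENTAL
            and 'replication_key_value' not in stream_bookmark
            and 'modified_since' not in stream_bookmark
        )
        or (
            replication_method == LOG_BASED
            and 'lsn' not in stream_bookmark
            and 'log_pos' not in stream_bookmark
            and 'token' not in stream_bookmark
        )
    )


assert is_initial_sync_required(LOG_BASED, {}) is True                                   # new table, no bookmark yet
assert is_initial_sync_required(LOG_BASED, {'lsn': 12345}) is False                      # WAL position already bookmarked
assert is_initial_sync_required(INCREMENTAL, {'replication_key_value': '2021-01-01'}) is False
assert is_initial_sync_required(FULL_TABLE, {'replication_key_value': '2021-01-01'}) is True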
@@ -1409,10 +1714,8 @@ def _run_post_import_tap_checks(self, tap: Dict, catalog: Dict, target_id: str) errors = [] error = self.__validate_transformations( - tap.get('files', {}).get('transformation'), - catalog, - tap['id'], - target_id) + tap.get('files', {}).get('transformation'), catalog, tap['id'], target_id + ) if error: errors.append(error) @@ -1436,19 +1739,50 @@ def _run_post_import_tap_checks(self, tap: Dict, catalog: Dict, target_id: str) primary_key_required = tap.get('primary_key_required', True) # Check if primary key is set for INCREMENTAL and LOG_BASED replications - if (selected and replication_method in [self.INCREMENTAL, self.LOG_BASED] and - len(table_key_properties) == 0 and primary_key_required): - errors.append(f'No primary key set for {tap_stream_id} stream ({replication_method})') + if ( + selected + and replication_method in [self.INCREMENTAL, self.LOG_BASED] + and len(table_key_properties) == 0 + and primary_key_required + ): + errors.append( + f'No primary key set for {tap_stream_id} stream ({replication_method})' + ) break return errors + def _cleanup_tap_state_file(self) -> None: + tables = self.args.tables + state_file = self.tap['files']['state'] + if tables: + self._clean_tables_from_bookmarks_in_state_file(state_file, tables) + else: + utils.silentremove(state_file) + + @staticmethod + def _clean_tables_from_bookmarks_in_state_file(state_file_to_clean: str, tables: str) -> None: + try: + with open(state_file_to_clean, 'r+', encoding='UTF-8') as state_file: + state_data = json.load(state_file) + bookmarks = state_data.get('bookmarks') + list_of_tables = tables.split(',') + if bookmarks: + for table_name in list_of_tables: + bookmarks.pop(table_name, None) + + state_file.seek(0) + json.dump(state_data, state_file) + state_file.truncate() + + except FileNotFoundError: + pass + except json.JSONDecodeError: + pass + def __validate_transformations( - self, - transformation_file: str, - catalog: Dict, - tap_id: str, - target_id: str) -> Optional[str]: + self, transformation_file: str, catalog: Dict, tap_id: str, target_id: str + ) -> Optional[str]: """ Run validation of transformation config Args: @@ -1463,9 +1797,9 @@ def __validate_transformations( # create a temp file with the content being the given catalog object # we need this file to execute the validation cli command - temp_catalog_file = utils.create_temp_file(dir=self.get_temp_dir(), - prefix='properties_', - suffix='.json')[1] + temp_catalog_file = utils.create_temp_file( + dir=self.get_temp_dir(), prefix='properties_', suffix='.json' + )[1] utils.save_json(catalog, temp_catalog_file) @@ -1474,7 +1808,9 @@ def __validate_transformations( """ if self.profiling_mode: - dump_file = os.path.join(self.profiling_dir, f'transformation_{tap_id}_{target_id}.pstat') + dump_file = os.path.join( + self.profiling_dir, f'transformation_{tap_id}_{target_id}.pstat' + ) command = f'{self.transform_field_python_bin} -m cProfile -o {dump_file} {command}' self.logger.debug('Transformation validation command: %s', command) @@ -1486,3 +1822,16 @@ def __validate_transformations( if returncode != 0: return stderr + + @classmethod + def __does_fastsync_component_exist(cls, target_type: str, tap_type: str) -> bool: + """ + Checks if the given tap-target combo have FastSync + Args: + target_type: type of the target + tap_type: type of tap + + Returns: + Boolean, True if FastSync exists, False otherwise. 
+ """ + return ConnectorType(target_type) in FASTSYNC_PAIRS.get(ConnectorType(tap_type), {}) diff --git a/pipelinewise/cli/samples/tap_kafka.yml.sample b/pipelinewise/cli/samples/tap_kafka.yml.sample index 341d93164..8e156df5d 100644 --- a/pipelinewise/cli/samples/tap_kafka.yml.sample +++ b/pipelinewise/cli/samples/tap_kafka.yml.sample @@ -27,19 +27,34 @@ db_conn: primary_keys: transfer_id: "/transferMetadata/transferId" + #initial_start_time: # (Default: latest) Start time reference of the message consumption if + # no bookmarked position in state.json. One of: latest, earliest or an + # ISO-8601 formatted timestamp string. + # -------------------------------------------------------------------------- - # Kafka Consumer optional parameters + # Kafka Consumer optional parameters. Commented values are default values. # -------------------------------------------------------------------------- #max_runtime_ms: 300000 # The maximum time for the tap to collect new messages from Kafka topic. #consumer_timeout_ms: 10000 # KafkaConsumer setting. Number of milliseconds to block during message iteration before raising StopIteration #session_timeout_ms: 30000 # KafkaConsumer setting. The timeout used to detect failures when using Kafka’s group management facilities. #heartbeat_interval_ms: 10000 # KafkaConsumer setting. The expected time in milliseconds between heartbeats to the consumer coordinator when using Kafka’s group management facilities. #max_poll_interval_ms: 300000 # KafkaConsumer setting. The maximum delay between invocations of poll() when using consumer group management. - #max_poll_records: 500 # KafkaConsumer setting. The maximum number of records returned in a single call to poll(). #commit_interval_ms: 5000 # Number of milliseconds between two commits. This is different than the kafka auto commit feature. Tap-kafka sends commit messages automatically but only when the data consumed successfully and persisted to local store. - #local_store_dir: ./tap-kafka-local-store # Path to the local store with consumed kafka messages - #local_store_batch_size_rows: 1000 # Number of messages to write to disk in one go. This can avoid high I/O issues when messages written to local store disk too frequently. + + # -------------------------------------------------------------------------- + # Protobuf support - Experimental + # -------------------------------------------------------------------------- + #message_format: protobuf # (Default: json) Supported message formats are json and protobuf. + #proto_schema: | # Protobuf message format in .proto syntax. Required if the message_format is protobuf. 
+ # syntax = "proto3"; + # + # message ProtoMessage { + # string query = 1; + # int32 page_number = 2; + # int32 result_per_page = 3; + # } + #proto_classess_dir: # (Default: current working dir) Directory where to store runtime compiled proto classes # ------------------------------------------------------------------------------ # Destination (Target) - Target properties diff --git a/pipelinewise/cli/schemas/tap.json b/pipelinewise/cli/schemas/tap.json index 18ccd1ca4..4bf5a5bea 100644 --- a/pipelinewise/cli/schemas/tap.json +++ b/pipelinewise/cli/schemas/tap.json @@ -38,13 +38,19 @@ "type": "object", "properties": { "column": { - "type": "string" + "type": "string", + "minLength": 1 + }, + "field_path": { + "type": "string", + "minLength": 1 }, "equals": { "type": ["null", "integer", "string", "boolean", "number"] } }, - "required": ["column", "equals"] + "required": ["column", "equals"], + "additionalProperties": false }, { "type": "object", @@ -54,6 +60,10 @@ }, "regex_match": { "type": "string" + }, + "field_path": { + "type": "string", + "minLength": 1 } }, "required": ["column", "regex_match"] @@ -122,9 +132,19 @@ }, "transformation": { "type": "object", + "additionalProperties": false, "properties": { "column": { - "type": "string" + "type": "string", + "minLength": 1 + }, + "field_paths": { + "type": "array", + "items": { + "type": "string", + "minLength": 1 + }, + "minItems": 1 }, "type": { "type": "string", @@ -142,7 +162,16 @@ "HASH-SKIP-FIRST-9", "MASK-DATE", "MASK-NUMBER", - "MASK-HIDDEN" + "MASK-HIDDEN", + "MASK-STRING-SKIP-ENDS-1", + "MASK-STRING-SKIP-ENDS-2", + "MASK-STRING-SKIP-ENDS-3", + "MASK-STRING-SKIP-ENDS-4", + "MASK-STRING-SKIP-ENDS-5", + "MASK-STRING-SKIP-ENDS-6", + "MASK-STRING-SKIP-ENDS-7", + "MASK-STRING-SKIP-ENDS-8", + "MASK-STRING-SKIP-ENDS-9" ] }, "when": { @@ -160,6 +189,7 @@ }, "s3_csv_mapping": { "type": "object", + "additionalProperties": false, "properties": { "search_prefix": { "type": "string" @@ -232,6 +262,67 @@ } }, "required": ["type"] + }, + "is_tap_github": { + "required": ["type"], + "properties": { + "type": { + "enum": ["tap-github"] + } + } + }, + "tap_github": { + "anyOf": [ + { + "not": { + "$ref": "#/definitions/is_tap_github" + } + }, + { + "required": ["db_conn"], + "properties": { + "db_conn": { + "type": "object", + "required": ["access_token", "start_date"], + "properties": { + "access_token": { + "type": "string" + }, + "start_date": { + "type": "string", + "format": "date-time" + }, + "organization": { + "type": "string" + }, + "repos_include": { + "type": "string" + }, + "repos_exclude": { + "type": "string" + }, + "repository": { + "type": "string" + }, + "include_archived": { + "type": "boolean", + "default": false + }, + "include_disabled": { + "type": "boolean", + "default": false + }, + "max_rate_limit_wait_seconds": { + "type": "integer", + "default": 600, + "minimum": 600, + "maximum": 3600 + } + } + } + } + } + ] } }, "type": "object", @@ -293,7 +384,7 @@ "split_file_max_chunks": { "type": "integer", "min": 1, - "max": 100 + "max": 99999 }, "schemas": { "type": "array", @@ -305,6 +396,9 @@ "allOf": [ { "$ref": "#/definitions/tap_mongo_implies_ft_and_lb" + }, + { + "$ref": "#/definitions/tap_github" } ], "required": [ diff --git a/pipelinewise/cli/tap_properties.py b/pipelinewise/cli/tap_properties.py index ae6b29619..12d42e2b7 100644 --- a/pipelinewise/cli/tap_properties.py +++ b/pipelinewise/cli/tap_properties.py @@ -105,31 +105,33 @@ def get_tap_properties(tap=None, temp_dir=None): 'tap_stream_name_pattern': 
'{{schema_name}}-{{table_name}}', 'tap_catalog_argument': '--properties', 'default_replication_method': 'LOG_BASED', - 'default_data_flattening_max_level': 0 + 'default_data_flattening_max_level': 0, }, 'tap-postgres': { 'tap_config_extras': { # Set tap_id to locate the corresponding replication slot - 'tap_id': tap['id'] if tap else None, + 'tap_id': tap['id'] + if tap + else None, }, 'tap_stream_id_pattern': '{{schema_name}}-{{table_name}}', 'tap_stream_name_pattern': '{{schema_name}}-{{table_name}}', 'tap_catalog_argument': '--properties', 'default_replication_method': 'LOG_BASED', - 'default_data_flattening_max_level': 0 + 'default_data_flattening_max_level': 0, }, 'tap-zuora': { 'tap_config_extras': { 'username': tap.get('db_conn', {}).get('username') if tap else None, 'password': tap.get('db_conn', {}).get('password') if tap else None, 'start_date': tap.get('db_conn', {}).get('start_date') if tap else None, - 'api_type': tap.get('db_conn', {}).get('api_type') if tap else None + 'api_type': tap.get('db_conn', {}).get('api_type') if tap else None, }, 'tap_stream_id_pattern': '{{table_name}}', 'tap_stream_name_pattern': '{{table_name}}', 'tap_catalog_argument': '--catalog', 'default_replication_method': 'FULL_TABLE', - 'default_data_flattening_max_level': 10 + 'default_data_flattening_max_level': 10, }, 'tap-oracle': { 'tap_config_extras': {}, @@ -137,18 +139,15 @@ def get_tap_properties(tap=None, temp_dir=None): 'tap_stream_name_pattern': '{{schema_name}}-{{table_name}}', 'tap_catalog_argument': '--catalog', 'default_replication_method': 'LOG_BASED', - 'default_data_flattening_max_level': 0 + 'default_data_flattening_max_level': 0, }, 'tap-kafka': { - 'tap_config_extras': { - 'local_store_dir': temp_dir, - 'encoding': 'utf-8' - }, + 'tap_config_extras': {'local_store_dir': temp_dir, 'encoding': 'utf-8'}, 'tap_stream_id_pattern': '{{table_name}}', 'tap_stream_name_pattern': '{{table_name}}', 'tap_catalog_argument': '--properties', 'default_replication_method': 'LOG_BASED', - 'default_data_flattening_max_level': 0 + 'default_data_flattening_max_level': 0, }, 'tap-zendesk': { 'tap_config_extras': {}, @@ -156,7 +155,7 @@ def get_tap_properties(tap=None, temp_dir=None): 'tap_stream_name_pattern': '{{table_name}}', 'tap_catalog_argument': '--catalog', 'default_replication_method': 'INCREMENTAL', - 'default_data_flattening_max_level': 10 + 'default_data_flattening_max_level': 10, }, 'tap-adwords': { 'tap_config_extras': {}, @@ -164,27 +163,23 @@ def get_tap_properties(tap=None, temp_dir=None): 'tap_stream_name_pattern': '{{table_name}}', 'tap_catalog_argument': '--properties', 'default_replication_method': 'INCREMENTAL', - 'default_data_flattening_max_level': 0 + 'default_data_flattening_max_level': 0, }, 'tap-jira': { - 'tap_config_extras': { - 'user_agent': 'PipelineWise - Tap Jira' - }, + 'tap_config_extras': {'user_agent': 'PipelineWise - Tap Jira'}, 'tap_stream_id_pattern': '{{table_name}}', 'tap_stream_name_pattern': '{{table_name}}', 'tap_catalog_argument': '--properties', 'default_replication_method': 'INCREMENTAL', - 'default_data_flattening_max_level': 0 + 'default_data_flattening_max_level': 0, }, 'tap-s3-csv': { - 'tap_config_extras': { - 'tables': generate_tap_s3_csv_to_table_mappings(tap) - }, + 'tap_config_extras': {'tables': generate_tap_s3_csv_to_table_mappings(tap)}, 'tap_stream_id_pattern': '{{table_name}}', 'tap_stream_name_pattern': '{{table_name}}', 'tap_catalog_argument': '--properties', 'default_replication_method': 'INCREMENTAL', - 'default_data_flattening_max_level': 
0 + 'default_data_flattening_max_level': 0, }, 'tap-snowflake': { 'tap_config_extras': { @@ -195,28 +190,26 @@ def get_tap_properties(tap=None, temp_dir=None): 'tap_stream_name_pattern': '{{schema_name}}-{{table_name}}', 'tap_catalog_argument': '--properties', 'default_replication_method': 'INCREMENTAL', - 'default_data_flattening_max_level': 0 + 'default_data_flattening_max_level': 0, }, 'tap-salesforce': { - 'tap_config_extras': { - 'select_fields_by_default': True - }, + 'tap_config_extras': {'select_fields_by_default': True}, 'tap_stream_id_pattern': '{{table_name}}', 'tap_stream_name_pattern': '{{table_name}}', 'tap_catalog_argument': '--properties', 'default_replication_method': 'INCREMENTAL', - 'default_data_flattening_max_level': 10 + 'default_data_flattening_max_level': 10, }, 'tap-mongodb': { 'tap_config_extras': { 'database': tap.get('db_conn', {}).get('dbname') if tap else None, - 'include_schemas_in_destination_stream_name': 'true' + 'include_schemas_in_destination_stream_name': 'true', }, 'tap_stream_id_pattern': '{{database_name}}-{{table_name}}', 'tap_stream_name_pattern': '{{database_name}}-{{table_name}}', 'tap_catalog_argument': '--catalog', 'default_replication_method': 'LOG_BASED', - 'default_data_flattening_max_level': 0 + 'default_data_flattening_max_level': 0, }, 'tap-google-analytics': { 'tap_config_extras': {}, @@ -224,7 +217,7 @@ def get_tap_properties(tap=None, temp_dir=None): 'tap_stream_name_pattern': '{{table_name}}', 'tap_catalog_argument': '--catalog', 'default_replication_method': 'INCREMENTAL', - 'default_data_flattening_max_level': 0 + 'default_data_flattening_max_level': 0, }, 'tap-github': { 'tap_config_extras': { @@ -236,7 +229,7 @@ def get_tap_properties(tap=None, temp_dir=None): 'tap_stream_name_pattern': '{{table_name}}', 'tap_catalog_argument': '--properties', 'default_replication_method': 'LOG_BASED', - 'default_data_flattening_max_level': 0 + 'default_data_flattening_max_level': 0, }, 'tap-shopify': { 'tap_config_extras': {}, @@ -244,7 +237,7 @@ def get_tap_properties(tap=None, temp_dir=None): 'tap_stream_name_pattern': '{{table_name}}', 'tap_catalog_argument': '--catalog', 'default_replication_method': 'INCREMENTAL', - 'default_data_flattening_max_level': 0 + 'default_data_flattening_max_level': 0, }, 'tap-slack': { 'tap_config_extras': {}, @@ -252,19 +245,23 @@ def get_tap_properties(tap=None, temp_dir=None): 'tap_stream_name_pattern': '{{table_name}}', 'tap_catalog_argument': '--catalog', 'default_replication_method': 'LOG_BASED', - 'default_data_flattening_max_level': 0 + 'default_data_flattening_max_level': 0, }, 'tap-mixpanel': { 'tap_config_extras': { 'user_agent': 'PipelineWise - Tap Mixpanel', # Do not denest properties by default - 'denest_properties': tap.get('db_conn', {}).get('denest_properties', 'false') if tap else None + 'denest_properties': tap.get('db_conn', {}).get( + 'denest_properties', 'false' + ) + if tap + else None, }, 'tap_stream_id_pattern': '{{table_name}}', 'tap_stream_name_pattern': '{{table_name}}', 'tap_catalog_argument': '--catalog', 'default_replication_method': 'LOG_BASED', - 'default_data_flattening_max_level': 0 + 'default_data_flattening_max_level': 0, }, 'tap-twilio': { 'tap_config_extras': {}, @@ -272,7 +269,7 @@ def get_tap_properties(tap=None, temp_dir=None): 'tap_stream_name_pattern': '{{table_name}}', 'tap_catalog_argument': '--catalog', 'default_replication_method': 'INCREMENTAL', - 'default_data_flattening_max_level': 0 + 'default_data_flattening_max_level': 0, }, 'tap-pardot': { 'tap_config_extras': 
{}, @@ -289,6 +286,6 @@ def get_tap_properties(tap=None, temp_dir=None): 'tap_stream_name_pattern': '{{schema_name}}-{{table_name}}', 'tap_catalog_argument': '--catalog', 'default_replication_method': 'LOG_BASED', - 'default_data_flattening_max_level': 0 + 'default_data_flattening_max_level': 0, }, } diff --git a/pipelinewise/cli/utils.py b/pipelinewise/cli/utils.py index e231fc6a2..b6a3eadc6 100644 --- a/pipelinewise/cli/utils.py +++ b/pipelinewise/cli/utils.py @@ -12,7 +12,6 @@ import sys import tempfile import warnings - import jsonschema import yaml @@ -23,14 +22,14 @@ from ansible.module_utils._text import to_text from ansible.module_utils.common._collections_compat import Mapping from ansible.parsing.dataloader import DataLoader -from ansible.parsing.vault import (VaultLib, get_file_vault_secret, is_encrypted_file) +from ansible.parsing.vault import VaultLib, get_file_vault_secret, is_encrypted_file from ansible.parsing.yaml.loader import AnsibleLoader -from ansible.parsing.yaml.objects import AnsibleVaultEncryptedUnicode +from ansible.parsing.yaml.objects import AnsibleMapping, AnsibleVaultEncryptedUnicode from . import tap_properties +from .errors import InvalidConfigException LOGGER = logging.getLogger(__name__) -ENV_VAR_PATTERN = re.compile(r'^env_var\(\'(.*)\'\)(.*)$') class AnsibleJSONEncoder(json.JSONEncoder): @@ -75,7 +74,7 @@ def is_json_file(path): """ try: if os.path.isfile(path): - with open(path) as jsonfile: + with open(path, encoding='utf-8') as jsonfile: if json.load(jsonfile): return True return False @@ -90,7 +89,7 @@ def load_json(path): try: LOGGER.debug('Parsing file at %s', path) if os.path.isfile(path): - with open(path) as jsonfile: + with open(path, encoding='utf-8') as jsonfile: return json.load(jsonfile) else: LOGGER.debug('No file at %s', path) @@ -116,8 +115,10 @@ def save_json(data, path): """ try: LOGGER.debug('Saving JSON %s', path) - with open(path, 'w') as jsonfile: - return json.dump(data, jsonfile, cls=AnsibleJSONEncoder, indent=4, sort_keys=True) + with open(path, 'w', encoding='utf-8') as jsonfile: + return json.dump( + data, jsonfile, cls=AnsibleJSONEncoder, indent=4, sort_keys=True + ) except Exception as exc: raise Exception(f'Cannot save JSON {path} {exc}') from exc @@ -139,7 +140,7 @@ def is_yaml_file(path): """ try: if os.path.isfile(path): - with open(path) as yamlfile: + with open(path, encoding='utf-8') as yamlfile: if yaml.safe_load(yamlfile): return True return False @@ -157,8 +158,12 @@ def get_tap_target_names(yaml_dir): (tap_yamls, target_yamls): tap_yamls is a list of names inside yaml_dir with "tap_*.y(a)ml" pattern. target_yamls is a list of names inside yaml_dir with "target_*.y(a)ml" pattern. 
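The per-connector defaults in tap_properties.py above drive how each connector is invoked: which catalog flag it takes, how stream IDs are composed, and which replication method applies when the tap YAML does not say otherwise. A small usage sketch, assuming the module imports stand-alone and that calling it without a tap dict simply leaves the connection-specific extras unset:

from pipelinewise.cli.tap_properties import get_tap_properties

props = get_tap_properties()                       # no tap dict passed
pg_defaults = props['tap-postgres']

print(pg_defaults['default_replication_method'])   # LOG_BASED
print(pg_defaults['tap_catalog_argument'])         # --properties
print(pg_defaults['tap_stream_id_pattern'])        # {{schema_name}}-{{table_name}}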
""" - yamls = [f for f in os.listdir(yaml_dir) if os.path.isfile(os.path.join(yaml_dir, f)) - and (f.endswith('.yml') or f.endswith('.yaml'))] + yamls = [ + f + for f in os.listdir(yaml_dir) + if os.path.isfile(os.path.join(yaml_dir, f)) + and (f.endswith('.yml') or f.endswith('.yaml')) + ] target_yamls = set(filter(lambda y: y.startswith('target_'), yamls)) tap_yamls = set(filter(lambda y: y.startswith('tap_'), yamls)) @@ -181,14 +186,9 @@ def load_yaml(yaml_file, vault_secret=None): secret_file.load() vault.secrets = [('default', secret_file)] - # YAML ENV VAR - # name: !ENV env_var('FOO')/bar - yaml.add_implicit_resolver("!ENV", ENV_VAR_PATTERN) - yaml.add_constructor('!ENV', env_var_constructor) - data = None if os.path.isfile(yaml_file): - with open(yaml_file, 'r') as stream: + with open(yaml_file, 'r', encoding='utf-8') as stream: # Render environment variables using jinja templates contents = stream.read() template = Template(contents) @@ -196,27 +196,27 @@ def load_yaml(yaml_file, vault_secret=None): try: if is_encrypted_file(stream): file_data = stream.read() - data = yaml.load(vault.decrypt(file_data, None)) + data = yaml.safe_load(vault.decrypt(file_data, None)) else: - file_data = stream.read() - data = yaml.load(file_data, Loader=yaml.Loader) - - ''' - Commenting code below for posterity. We are not using ansible functionality but yaml load should - follow the same code path regardless of the file encryption state. - ''' - #loader = AnsibleLoader(stream, None, vault.secrets) - #try: - # data = loader.get_single_data() - #except Exception as exc: - # raise Exception(f'Error when loading YAML config at {yaml_file} {exc}') from exc - #finally: - # loader.dispose() + loader = AnsibleLoader(stream, None, vault.secrets) + try: + data = loader.get_single_data() + except Exception as exc: + raise Exception( + f'Error when loading YAML config at {yaml_file} {exc}' + ) from exc + finally: + loader.dispose() except yaml.YAMLError as exc: - raise Exception(f'Error when loading YAML config at {yaml_file} {exc}') from exc + raise Exception( + f'Error when loading YAML config at {yaml_file} {exc}' + ) from exc else: LOGGER.debug('No file at %s', yaml_file) + if isinstance(data, AnsibleMapping): + data = dict(data) + return data @@ -277,7 +277,9 @@ def get_sample_file_paths(): Get list of every available sample files (YAML, etc.) 
with absolute paths """ samples_dir = os.path.join(os.path.dirname(__file__), 'samples') - return search_files(samples_dir, patterns=['config.yml', '*.yml.sample', 'README.md'], abs_path=True) + return search_files( + samples_dir, patterns=['config.yml', '*.yml.sample', 'README.md'], abs_path=True + ) def validate(instance, schema): @@ -288,9 +290,8 @@ def validate(instance, schema): # Serialise vault encrypted objects to string schema_safe_inst = json.loads(json.dumps(instance, cls=AnsibleJSONEncoder)) jsonschema.validate(instance=schema_safe_inst, schema=schema) - except jsonschema.exceptions.ValidationError as exc: - LOGGER.critical('Invalid object %s', exc) - sys.exit(1) + except jsonschema.exceptions.ValidationError as ex: + raise InvalidConfigException(f'json object doesn\'t match schema {schema}') from ex def delete_empty_keys(dic): @@ -308,7 +309,12 @@ def delete_keys_from_dict(dic, keys): return dic if isinstance(dic, list): return [v for v in (delete_keys_from_dict(v, keys) for v in dic) if v] - return {k: v for k, v in ((k, delete_keys_from_dict(v, keys)) for k, v in dic.items()) if k not in keys} + # pylint: disable=C0325 # False positive on tuples + return { + k: v + for k, v in ((k, delete_keys_from_dict(v, keys)) for k, v in dic.items()) + if k not in keys + } def silentremove(path): @@ -336,7 +342,9 @@ def search_files(search_dir, patterns=None, sort=False, abs_path=False): # Search files and sort if required p_files = [] for pattern in patterns: - p_files.extend(filter(os.path.isfile, glob.glob(os.path.join(search_dir, pattern)))) + p_files.extend( + filter(os.path.isfile, glob.glob(os.path.join(search_dir, pattern))) + ) if sort: p_files.sort(key=os.path.getmtime, reverse=True) @@ -377,7 +385,7 @@ def extract_log_attributes(log_file): 'tap_id': tap_id, 'timestamp': timestamp, 'sync_engine': sync_engine, - 'status': status + 'status': status, } @@ -421,10 +429,11 @@ def get_tap_stream_id(tap, database_name, schema_name, table_name): """ pattern = get_tap_property(tap, 'tap_stream_id_pattern') - return pattern \ - .replace('{{database_name}}', f'{database_name}') \ - .replace('{{schema_name}}', f'{schema_name}') \ + return ( + pattern.replace('{{database_name}}', f'{database_name}') + .replace('{{schema_name}}', f'{schema_name}') .replace('{{table_name}}', f'{table_name}') + ) def get_tap_stream_name(tap, database_name, schema_name, table_name): @@ -437,10 +446,11 @@ def get_tap_stream_name(tap, database_name, schema_name, table_name): """ pattern = get_tap_property(tap, 'tap_stream_name_pattern') - return pattern \ - .replace('{{database_name}}', f'{database_name}') \ - .replace('{{schema_name}}', f'{schema_name}') \ + return ( + pattern.replace('{{database_name}}', f'{database_name}') + .replace('{{schema_name}}', f'{schema_name}') .replace('{{table_name}}', f'{table_name}') + ) def get_tap_default_replication_method(tap): @@ -482,13 +492,6 @@ def create_temp_file(suffix=None, prefix=None, dir=None, text=None): return tempfile.mkstemp(suffix, prefix, dir, text) -def env_var_constructor(loader, node): - value = loader.construct_scalar(node) - env_var, remaining_path = ENV_VAR_PATTERN.match(value).groups() - - return os.environ[env_var] + remaining_path - - def find_errors_in_log_file(file, max_errors=10, error_pattern=None): """ Find error lines in a log file @@ -514,7 +517,8 @@ def find_errors_in_log_file(file, max_errors=10, error_pattern=None): r'botocore\.exceptions\.|' # Generic python exceptions r'\.[E|e]xception|' - r'\.[E|e]rror') + r'\.[E|e]rror' + ) # Use known 
error patterns by default if not error_pattern: @@ -522,7 +526,7 @@ def find_errors_in_log_file(file, max_errors=10, error_pattern=None): errors = [] if file and os.path.isfile(file): - with open(file) as file_object: + with open(file, encoding='utf-8') as file_object: for line in file_object: if len(re.findall(error_pattern, line)) > 0: errors.append(line) @@ -549,5 +553,6 @@ def generate_random_string(length: int = 8) -> str: if 0 < length < 8: warnings.warn('Length is too small! consider 8 or more characters') - return ''.join(secrets.choice(string.ascii_uppercase + string.digits) - for _ in range(length)) + return ''.join( + secrets.choice(string.ascii_uppercase + string.digits) for _ in range(length) + ) diff --git a/pipelinewise/fastsync/README.md b/pipelinewise/fastsync/README.md index 7c7d650d9..28c44eeeb 100644 --- a/pipelinewise/fastsync/README.md +++ b/pipelinewise/fastsync/README.md @@ -13,17 +13,10 @@ components and uses it automatically whenever it’s possible. ## Supported tap-target routes -* MySQL to Snowflake. -* MySQL to Redshift -* MySQL to Postgres -* Postgres to Snowflake -* Postgres to Redshift -* Postgres to Postgres - -* S3 CSV to Snowflake -* S3 CSV to Redshift -* S3 CSV to Postgres - -* MongoDB to Snowflake -* MongoDB to Postgres +| Source | Destination | +|---|---| +| MySQL/MariaDB | * BigQuery
<br/>* Snowflake<br/>* Postgres<br/>* Redshift |
+| Postgres | * BigQuery<br/>* Snowflake<br/>* Postgres<br/>* Redshift |
+| S3 CSV | * BigQuery<br/>* Snowflake<br/>* Postgres<br/>* Redshift |
+| MongoDB | * BigQuery<br/>* Snowflake<br/>* Postgres<br/>
| diff --git a/pipelinewise/fastsync/commons/errors.py b/pipelinewise/fastsync/commons/errors.py index 405ff4a6e..c21e69796 100644 --- a/pipelinewise/fastsync/commons/errors.py +++ b/pipelinewise/fastsync/commons/errors.py @@ -1,8 +1,14 @@ class ExportError(Exception): """Raised when export fails""" + class TableNotFoundError(Exception): """Raised when configured table doesn't exist in source""" + class MongoDBInvalidDatetimeError(Exception): """Raised when a bson datetime is invalid and cannot be serialized""" + + +class UnsupportedKeyTypeException(Exception): + """Raised if key type is unsupported""" diff --git a/pipelinewise/fastsync/commons/split_gzip.py b/pipelinewise/fastsync/commons/split_gzip.py index 9905d5eb9..f6b557982 100644 --- a/pipelinewise/fastsync/commons/split_gzip.py +++ b/pipelinewise/fastsync/commons/split_gzip.py @@ -16,7 +16,14 @@ # pylint: disable=W0622,R1732 -def open(base_filename, mode='wb', chunk_size_mb=None, max_chunks=None, est_compr_rate=None, compress=True): +def open( + base_filename, + mode='wb', + chunk_size_mb=None, + max_chunks=None, + est_compr_rate=None, + compress=True, +): """Open a gzip-compressed file in binary or text mode. Args: @@ -40,7 +47,9 @@ def open(base_filename, mode='wb', chunk_size_mb=None, max_chunks=None, est_comp raise ValueError('Invalid chunk_size_mb: %d' % (chunk_size_mb,)) if max_chunks is not None and max_chunks < 0: raise ValueError('Invalid max_chunks: %d' % (max_chunks,)) - return SplitGzipFile(base_filename, mode, chunk_size_mb, max_chunks, est_compr_rate, compress) + return SplitGzipFile( + base_filename, mode, chunk_size_mb, max_chunks, est_compr_rate, compress + ) # pylint: disable=R0902 @@ -49,22 +58,27 @@ class SplitGzipFile(io.BufferedIOBase): This class only supports writing files in binary mode. 
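`split_gzip.open()` above follows the shape of the built-in `gzip.open()` but hands back a `SplitGzipFile`, which rotates to a new chunk once the estimated compressed size of the current one passes `chunk_size_mb`. A brief usage sketch with illustrative paths and sizes, mirroring how FastSync's copy_table wires it up to a csv writer:

import csv
from pipelinewise.fastsync.commons import split_gzip

# Write a CSV export, letting the helper split it into at most 20 gzip chunks
# of roughly 1000 MB each (sizes are estimated via the compression-rate heuristic).
with split_gzip.open('/tmp/users.csv.gz', mode='wt',
                     chunk_size_mb=1000, max_chunks=20) as out:
    writer = csv.writer(out, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    writer.writerow(['id', 'name'])
    writer.writerow([1, 'example'])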
""" - def __init__(self, - base_filename, - mode: str = None, - chunk_size_mb: int = None, - max_chunks: int = None, - est_compr_rate: float = None, - compress=True): + + def __init__( + self, + base_filename, + mode: str = None, + chunk_size_mb: int = None, + max_chunks: int = None, + est_compr_rate: float = None, + compress=True, + ): super().__init__() self.base_filename = base_filename self.mode = mode self.chunk_size_mb = chunk_size_mb or DEFAULT_CHUNK_SIZE_MB self.max_chunks = max_chunks if max_chunks is not None else DEFAULT_MAX_CHUNKS - self.compress= compress + self.compress = compress if compress: - self.est_compr_rate = est_compr_rate if est_compr_rate is not None else EST_COMPR_RATE + self.est_compr_rate = ( + est_compr_rate if est_compr_rate is not None else EST_COMPR_RATE + ) else: self.est_compr_rate = 1.0 self.chunk_seq = 1 @@ -85,7 +99,10 @@ def _gen_chunk_filename(self) -> str: if self.max_chunks == 0: chunk_filename = self.base_filename else: - if self.current_chunk_size_mb >= self.chunk_size_mb and self.chunk_seq < self.max_chunks: + if ( + self.current_chunk_size_mb >= self.chunk_size_mb + and self.chunk_seq < self.max_chunks + ): # Increase the chunk sequence and reset size to zero self.chunk_seq += 1 self.current_chunk_size_mb = 0 @@ -109,7 +126,14 @@ def _activate_chunk_file(self): if self.compress: self.chunk_file = gzip.open(self.chunk_filename, self.mode) else: - self.chunk_file = builtins.open(self.chunk_filename, self.mode) + if 'b' in self.mode: + self.chunk_file = builtins.open( # pylint: disable=unspecified-encoding + self.chunk_filename, self.mode + ) + else: + self.chunk_file = builtins.open( + self.chunk_filename, self.mode, encoding='utf-8' + ) @staticmethod def _bytes_to_megabytes(size: int) -> float: @@ -136,7 +160,9 @@ def write(self, _bytes): self._activate_chunk_file() self.chunk_file.write(_bytes) - self.current_chunk_size_mb = SplitGzipFile._bytes_to_megabytes(self.chunk_file.tell() * self.est_compr_rate) + self.current_chunk_size_mb = SplitGzipFile._bytes_to_megabytes( + self.chunk_file.tell() * self.est_compr_rate + ) def close(self): """ diff --git a/pipelinewise/fastsync/commons/tap_mongodb.py b/pipelinewise/fastsync/commons/tap_mongodb.py index 5b908b144..1a6c951a8 100644 --- a/pipelinewise/fastsync/commons/tap_mongodb.py +++ b/pipelinewise/fastsync/commons/tap_mongodb.py @@ -2,97 +2,199 @@ import csv import datetime import gzip -import json +import ujson import logging import os -import ssl import subprocess import uuid import bson import pytz import tzlocal +from urllib import parse -from typing import Tuple, Optional, Dict, Callable +from typing import Tuple, Optional, Dict, Callable, Any from pymongo import MongoClient from pymongo.database import Database from singer.utils import strftime as singer_strftime from . 
import utils, split_gzip -from .errors import ExportError, TableNotFoundError, MongoDBInvalidDatetimeError +from .errors import ( + ExportError, + TableNotFoundError, + MongoDBInvalidDatetimeError, + UnsupportedKeyTypeException, +) LOGGER = logging.getLogger(__name__) DEFAULT_WRITE_BATCH_ROWS = 50000 -class MongoDBJsonEncoder(json.JSONEncoder): +def serialize_document(document: Dict) -> Dict: """ - Custom JSON encoder to be used to serialize data from MongoDB + serialize mongodb Document into a json object + + Args: + document: MongoDB document + + Returns: Dict """ - @staticmethod - def _serialize_datetime(val): - """ - Serialize Bson and python datetime types - Args: - val: datetime value + return { + key: transform_value(val, [key]) + for key, val in document.items() + if not isinstance(val, (bson.min_key.MinKey, bson.max_key.MaxKey)) + } - Returns: serialized datetime value - """ - if isinstance(val, bson.datetime.datetime): - timezone = tzlocal.get_localzone() - try: - local_datetime = timezone.localize(val) - utc_datetime = local_datetime.astimezone(pytz.UTC) - except Exception as exc: - if str(exc) == 'year is out of range' and val.year == 0: - # NB: Since datetimes are persisted as strings, it doesn't - # make sense to blow up on invalid Python datetimes (e.g., - # year=0). In this case we're formatting it as a string and - # passing it along down the pipeline. - return '{:04d}-{:02d}-{:02d}T{:02d}:{:02d}:{:02d}.{:06d}Z'.format(val.year, - val.month, - val.day, - val.hour, - val.minute, - val.second, - val.microsecond) - raise MongoDBInvalidDatetimeError('Found invalid datetime {}'.format(val)) from exc - - return singer_strftime(utc_datetime) - - if isinstance(val, datetime.datetime): +def class_to_string(key_value: Any, key_type: str) -> str: + """ + Converts specific types to string equivalent + The supported types are: datetime, bson Timestamp, bytes, int, Int64, float, ObjectId, str and UUID + Args: + key_value: The value to convert to string + key_type: the value type + + Returns: string equivalent of key value + Raises: UnsupportedKeyTypeException if key_type is not supported + """ + if key_type == 'datetime': + if key_value.tzinfo is None: timezone = tzlocal.get_localzone() - local_datetime = timezone.localize(val) + local_datetime = timezone.localize(key_value) utc_datetime = local_datetime.astimezone(pytz.UTC) - return singer_strftime(utc_datetime) - return None + else: + utc_datetime = key_value.astimezone(pytz.UTC) - def default(self, o): # false positive complaint -> pylint: disable=E0202 - """ - Custom function to serialize several sort of BSON and Python types - Args: - obj: Object to serialize + return singer_strftime(utc_datetime) - Returns: Serialized value - """ - encoding_map = { - bson.objectid.ObjectId: str, - uuid.UUID: str, - bson.int64.Int64: str, - bson.timestamp.Timestamp: lambda value: singer_strftime(value.as_datetime()), - bytes: lambda value: base64.b64encode(value).decode('utf-8'), - bson.decimal128.Decimal128: lambda val: val.to_decimal(), - bson.regex.Regex: lambda val: dict(pattern=val.pattern, flags=val.flags), - bson.code.Code: lambda val: dict(value=str(val), scope=str(val.scope)) if val.scope else str(val), - bson.dbref.DBRef: lambda val: dict(id=str(val.id), collection=val.collection, database=val.database), - datetime.datetime: self._serialize_datetime, - bson.datetime.datetime: self._serialize_datetime - } - - if o.__class__ in encoding_map: - return encoding_map[o.__class__](o) - - return super().default(o) + if key_type == 'Timestamp': + 
return '{}.{}'.format(key_value.time, key_value.inc) + + if key_type == 'bytes': + return base64.b64encode(key_value).decode('utf-8') + + if key_type in ['int', 'Int64', 'float', 'ObjectId', 'str', 'UUID']: + return str(key_value) + + raise UnsupportedKeyTypeException('{} is not a supported key type'.format(key_type)) + + +def safe_transform_datetime(value: datetime.datetime, path) -> str: + """ + Safely transform datetime from local tz to UTC if applicable + Args: + value: datetime value to transform + path: + + Returns: utc datetime as string + + """ + timezone = tzlocal.get_localzone() + try: + local_datetime = timezone.localize(value) + utc_datetime = local_datetime.astimezone(pytz.UTC) + except Exception as ex: + if str(ex) == 'year is out of range' and value.year == 0: + # NB: Since datetimes are persisted as strings, it doesn't + # make sense to blow up on invalid Python datetimes (e.g., + # year=0). In this case we're formatting it as a string and + # passing it along down the pipeline. + return '{:04d}-{:02d}-{:02d}T{:02d}:{:02d}:{:02d}.{:06d}Z'.format( + value.year, + value.month, + value.day, + value.hour, + value.minute, + value.second, + value.microsecond, + ) + raise MongoDBInvalidDatetimeError( + 'Found invalid datetime at [{}]: {}'.format('.'.join(map(str, path)), value) + ) from ex + return singer_strftime(utc_datetime) + + +def transform_value(value: Any, path) -> Any: + """ + transform values to json friendly ones + Args: + value: value to transform + path: + + Returns: transformed value + + """ + conversion = { + list: lambda val, pat: list( + map(lambda v: transform_value(v[1], pat + [v[0]]), enumerate(val)) + ), + dict: lambda val, pat: { + k: transform_value(v, pat + [k]) for k, v in val.items() + }, + uuid.UUID: lambda val, _: class_to_string(val, 'UUID'), + bson.objectid.ObjectId: lambda val, _: class_to_string(val, 'ObjectId'), + bson.datetime.datetime: safe_transform_datetime, + bson.timestamp.Timestamp: lambda val, _: singer_strftime(val.as_datetime()), + bson.int64.Int64: lambda val, _: class_to_string(val, 'Int64'), + bytes: lambda val, _: class_to_string(val, 'bytes'), + datetime.datetime: lambda val, _: class_to_string(val, 'datetime'), + bson.decimal128.Decimal128: lambda val, _: val.to_decimal(), + bson.regex.Regex: lambda val, _: dict(pattern=val.pattern, flags=val.flags), + bson.code.Code: lambda val, _: dict(value=str(val), scope=str(val.scope)) + if val.scope + else str(val), + bson.dbref.DBRef: lambda val, _: dict( + id=str(val.id), collection=val.collection, database=val.database + ), + } + + if isinstance(value, tuple(conversion.keys())): + return conversion[type(value)](value, path) + + return value + + +def get_connection_string(config: Dict): + """ + Generates a MongoClientConnectionString based on configuration + Args: + config: DB config + + Returns: A MongoClient connection string + """ + srv = config.get('srv') == 'true' + + # Default SSL verify mode to true, give option to disable + verify_mode = config.get('verify_mode', 'true') == 'true' + use_ssl = config.get('ssl') == 'true' + + connection_query = { + 'readPreference': 'secondaryPreferred', + 'authSource': config['auth_database'], + } + + if config.get('replica_set'): + connection_query['replicaSet'] = config['replica_set'] + + if use_ssl: + connection_query['ssl'] = 'true' + + # NB: "sslAllowInvalidCertificates" must ONLY be supplied if `SSL` is true. 
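Together, `serialize_document`, `class_to_string`, `safe_transform_datetime` and `transform_value` above replace the old JSON encoder: they walk a document recursively and convert BSON-specific values into plain, JSON-friendly Python types. A small illustration with a hypothetical document (all values are made up):

import datetime
import uuid
from bson import ObjectId
from bson.int64 import Int64
from pipelinewise.fastsync.commons.tap_mongodb import serialize_document

doc = {
    '_id': ObjectId('0123456789ab0123456789ab'),
    'user_ref': uuid.UUID('12345678-1234-5678-1234-567812345678'),
    'tags': ['a', 'b'],
    'created_at': datetime.datetime(2021, 6, 1, 12, 0, 0),
    'nested': {'count': Int64(42)},
}

flat = serialize_document(doc)
# ObjectId, UUID and Int64 become strings, naive datetimes are treated as local
# time and rendered as UTC ISO-8601 strings, lists and dicts are walked
# recursively, and MinKey/MaxKey entries are dropped entirely.
print(flat)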
+ if not verify_mode and use_ssl: + connection_query['tlsAllowInvalidCertificates'] = 'true' + + query_string = parse.urlencode(connection_query) + + connection_string = '{protocol}://{user}:{password}@{host}{port}/{database}?{query_string}'.format( + protocol='mongodb+srv' if srv else 'mongodb', + user=config['user'], + password=config['password'], + host=config['host'], + port='' if srv else ':{port}'.format(port=int(config['port'])), + database=config['database'], + query_string=query_string + ) + + return connection_string class FastSyncTapMongoDB: @@ -108,8 +210,11 @@ def __init__(self, connection_config: Dict, tap_type_to_target_type: Callable): tap_type_to_target_type: Function that maps tap types to target ones """ self.connection_config = connection_config - self.connection_config['write_batch_rows'] = connection_config.get('write_batch_rows', - DEFAULT_WRITE_BATCH_ROWS) + self.connection_config['write_batch_rows'] = connection_config.get( + 'write_batch_rows', DEFAULT_WRITE_BATCH_ROWS + ) + + self.connection_config['connection_string'] = get_connection_string(self.connection_config) self.tap_type_to_target_type = tap_type_to_target_type self.database: Optional[Database] = None @@ -118,21 +223,10 @@ def open_connection(self): """ Open connection """ - # Default SSL verify mode to true, give option to disable - verify_mode = self.connection_config.get('verify_mode', 'true') == 'true' - use_ssl = self.connection_config.get('ssl') == 'true' - - connection_params = dict(host=self.connection_config['host'], port=int(self.connection_config['port']), - username=self.connection_config['user'], password=self.connection_config['password'], - authSource=self.connection_config['auth_database'], ssl=use_ssl, - replicaSet=self.connection_config.get('replica_set', None), - readPreference='secondaryPreferred') - # NB: "ssl_cert_reqs" must ONLY be supplied if `SSL` is true. - if not verify_mode and use_ssl: - connection_params['ssl_cert_reqs'] = ssl.CERT_NONE - - self.database = MongoClient(**connection_params)[self.connection_config['database']] + self.database = MongoClient(self.connection_config['connection_string'])[ + self.connection_config['database'] + ] def close_connection(self): """ @@ -141,15 +235,16 @@ def close_connection(self): self.database.client.close() # pylint: disable=R0914,R0913 - def copy_table(self, - table_name: str, - filepath: str, - temp_dir: str, - split_large_files=False, - split_file_chunk_size_mb=1000, - split_file_max_chunks=20, - compress=True - ): + def copy_table( + self, + table_name: str, + filepath: str, + temp_dir: str, + split_large_files=False, + split_file_chunk_size_mb=1000, + split_file_max_chunks=20, + compress=True, + ): """ Export data from table to a zipped csv Args: @@ -160,6 +255,7 @@ def copy_table(self, with -partXYZ postfix in the filename. (Default: False) split_file_chunk_size_mb: File chunk sizes if `split_large_files` enabled. (Default: 1000) split_file_max_chunks: Max number of chunks if `split_large_files` enabled. 
(Default: 20) + compress: Flag to indicate whether to compress export files """ table_dict = utils.tablename_to_dict(table_name, '.') @@ -173,17 +269,23 @@ def copy_table(self, exported_rows = 0 try: - gzip_splitter = split_gzip.open(filepath, - mode='wt', - chunk_size_mb=split_file_chunk_size_mb, - max_chunks=split_file_max_chunks if split_large_files else 0, - compress=compress) - with gzip.open(export_file_path, 'rb') as export_file, gzip_splitter as gzfile: - writer = csv.DictWriter(gzfile, - fieldnames=[elem[0] for elem in self._get_collection_columns()], - delimiter=',', - quotechar='"', - quoting=csv.QUOTE_MINIMAL) + gzip_splitter = split_gzip.open( + filepath, + mode='wt', + chunk_size_mb=split_file_chunk_size_mb, + max_chunks=split_file_max_chunks if split_large_files else 0, + compress=compress, + ) + with gzip.open( + export_file_path, 'rb' + ) as export_file, gzip_splitter as gzfile: + writer = csv.DictWriter( + gzfile, + fieldnames=[elem[0] for elem in self._get_collection_columns()], + delimiter=',', + quotechar='"', + quoting=csv.QUOTE_MINIMAL, + ) writer.writeheader() rows = [] @@ -192,13 +294,24 @@ def copy_table(self, # bson.decode_file_iter will generate one document at a time from the exported file for document in bson.decode_file_iter(export_file): - rows.append({ - '_ID': str(document['_id']), - 'DOCUMENT': json.dumps(document, cls=MongoDBJsonEncoder, separators=(',', ':')), - utils.SDC_EXTRACTED_AT: extracted_at, - utils.SDC_BATCHED_AT: datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S.%f'), - utils.SDC_DELETED_AT: None - }) + try: + rows.append( + { + '_ID': str(document['_id']), + 'DOCUMENT': ujson.dumps(serialize_document(document)), + utils.SDC_EXTRACTED_AT: extracted_at, + utils.SDC_BATCHED_AT: datetime.datetime.utcnow().strftime( + '%Y-%m-%d %H:%M:%S.%f' + ), + utils.SDC_DELETED_AT: None, + } + ) + except TypeError: + LOGGER.error( + 'TypeError encountered when processing document ID: %s', + document['_id'], + ) + raise exported_rows += 1 @@ -207,7 +320,8 @@ def copy_table(self, LOGGER.info( 'Exporting batch from %s to %s rows from %s...', (exported_rows - write_batch_rows), - exported_rows, table_name + exported_rows, + table_name, ) writer.writerows(rows) @@ -239,7 +353,7 @@ def _get_collection_columns() -> Tuple: (utils.SDC_DELETED_AT, 'string'), ) - def fetch_current_log_pos(self)->Dict: + def fetch_current_log_pos(self) -> Dict: """ Find and returns the latest ChangeStream token. LOG_BASED method uses changes streams. @@ -268,14 +382,12 @@ def fetch_current_log_pos(self)->Dict: # token can contain a property '_typeBits' of type bytes which cannot be json # serialized when saving the state in the function 'utils.save_state_file'. 
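The new `get_connection_string` helper above is used both when opening the PyMongo client and, a bit further down, when shelling out to mongodump, so both paths connect with identical options. A quick sketch of what it returns for a hypothetical non-SRV, non-SSL configuration:

from pipelinewise.fastsync.commons.tap_mongodb import get_connection_string

# Hypothetical connection details
config = {
    'host': 'mongodb.example.com',
    'port': 27017,
    'user': 'pipelinewise',
    'password': 'secret',
    'database': 'analytics',
    'auth_database': 'admin',
}

print(get_connection_string(config))
# mongodb://pipelinewise:secret@mongodb.example.com:27017/analytics?readPreference=secondaryPreferred&authSource=admin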
# '_data' is enough to resume LOG_BASED Singer replication after FastSync - return { - 'token': { - '_data': token['_data'] - } - } + return {'token': {'_data': token['_data']}} # pylint: disable=invalid-name - def fetch_current_incremental_key_pos(self, fully_qualified_table_name: str, replication_key: str): + def fetch_current_incremental_key_pos( + self, fully_qualified_table_name: str, replication_key: str + ): """ No Implemented Args: @@ -293,14 +405,13 @@ def map_column_types_to_target(self): mapped_columns = [] for column_name, column_type in self._get_collection_columns(): - mapped_columns.append(f'{column_name} {self.tap_type_to_target_type(column_type)}') + mapped_columns.append( + f'{column_name} {self.tap_type_to_target_type(column_type)}' + ) - return { - 'columns': mapped_columns, - 'primary_key': ['_ID'] - } + return {'columns': mapped_columns, 'primary_key': ['_ID']} - def _export_collection(self, export_dir: str, collection_name)->str: + def _export_collection(self, export_dir: str, collection_name) -> str: """ Dump a collection data into a compressed bson file and returns the path Args: @@ -312,32 +423,34 @@ def _export_collection(self, export_dir: str, collection_name)->str: """ LOGGER.info('Starting export of table "%s"', collection_name) - url = f'mongodb://{self.connection_config["user"]}:{self.connection_config["password"]}' \ - f'@{self.connection_config["host"]}:{self.connection_config["port"]}/' \ - f'{self.connection_config["database"]}?authSource={self.connection_config["auth_database"]}' \ - f'&readPreference=secondaryPreferred' - - if self.connection_config.get('replica_set', None) is not None: - url += f'&replicaSet={self.connection_config["replica_set"]}' - - if self.connection_config.get('ssl', None) is not None: - url += f'&ssl={self.connection_config["ssl"]}' - - return_code = subprocess.call([ + cmd = [ 'mongodump', - '--uri', f'"{url}"', + '--uri', + f'"{self.connection_config["connection_string"]}"', '--forceTableScan', '--gzip', - '-c', collection_name, - '-o', export_dir - ]) + '-c', + collection_name, + '-o', + export_dir, + ] + + return_code = subprocess.call(cmd) LOGGER.debug('Export command return code %s', return_code) if return_code != 0: raise ExportError(f'Export failed with code {return_code}') - #mongodump creates two files "{collection_name}.metadata.json.gz" & "{collection_name}.bson.gz" + # mongodump creates two files "{collection_name}.metadata.json.gz" & "{collection_name}.bson.gz" # we are only interested in the latter so we delete the former. - os.remove(os.path.join(export_dir, self.connection_config['database'], f'{collection_name}.metadata.json.gz')) - return os.path.join(export_dir, self.connection_config['database'], f'{collection_name}.bson.gz') + os.remove( + os.path.join( + export_dir, + self.connection_config['database'], + f'{collection_name}.metadata.json.gz', + ) + ) + return os.path.join( + export_dir, self.connection_config['database'], f'{collection_name}.bson.gz' + ) diff --git a/pipelinewise/fastsync/commons/tap_mysql.py b/pipelinewise/fastsync/commons/tap_mysql.py index ac9628646..f8c32f16d 100644 --- a/pipelinewise/fastsync/commons/tap_mysql.py +++ b/pipelinewise/fastsync/commons/tap_mysql.py @@ -2,21 +2,24 @@ import datetime import decimal import logging -import pymysql +from typing import Tuple +import pymysql from pymysql import InterfaceError, OperationalError -from . import utils, split_gzip from ...utils import safe_column_name +from . 
import split_gzip, utils LOGGER = logging.getLogger(__name__) DEFAULT_CHARSET = 'utf8' DEFAULT_EXPORT_BATCH_ROWS = 50000 -DEFAULT_SESSION_SQLS = ['SET @@session.time_zone="+0:00"', - 'SET @@session.wait_timeout=28800', - 'SET @@session.net_read_timeout=3600', - 'SET @@session.innodb_lock_wait_timeout=3600'] +DEFAULT_SESSION_SQLS = [ + 'SET @@session.time_zone="+0:00"', + 'SET @@session.wait_timeout=28800', + 'SET @@session.net_read_timeout=3600', + 'SET @@session.innodb_lock_wait_timeout=3600', +] class FastSyncTapMySql: @@ -24,45 +27,76 @@ class FastSyncTapMySql: Common functions for fastsync from a MySQL database """ - def __init__(self, connection_config, tap_type_to_target_type, target_quote=None): + def __init__(self, connection_config: dict, tap_type_to_target_type, target_quote=None): self.connection_config = connection_config - self.connection_config['charset'] = connection_config.get('charset', DEFAULT_CHARSET) - self.connection_config['export_batch_rows'] = connection_config.get('export_batch_rows', - DEFAULT_EXPORT_BATCH_ROWS) - self.connection_config['session_sqls'] = connection_config.get('session_sqls', DEFAULT_SESSION_SQLS) + self.connection_config['charset'] = connection_config.get( + 'charset', DEFAULT_CHARSET + ) + self.connection_config['export_batch_rows'] = connection_config.get( + 'export_batch_rows', DEFAULT_EXPORT_BATCH_ROWS + ) + self.connection_config['session_sqls'] = connection_config.get( + 'session_sqls', DEFAULT_SESSION_SQLS + ) self.tap_type_to_target_type = tap_type_to_target_type self.target_quote = target_quote self.conn = None self.conn_unbuffered = None + self.is_replica = False + + def get_connection_parameters(self) -> Tuple[dict, bool]: + """ + Method to get connection parameters + Connection is either to the primary or a replica if its credentials are given + + Args: + connection_config: dictionary containing the db connection details + Returns: + dict with credentials + """ + + is_replica = False + + if 'replica_host' in self.connection_config: + is_replica = True + + host = self.connection_config.get('replica_host', self.connection_config['host']) + port = int(self.connection_config.get('replica_port', self.connection_config['port'])) + user = self.connection_config.get('replica_user', self.connection_config['user']) + password = self.connection_config.get('replica_password', self.connection_config['password']) + charset = self.connection_config['charset'] + + return ({ + 'host': host, + 'port': port, + 'user': user, + 'password': password, + 'charset': charset, + }, is_replica) def open_connections(self): """ Open connection """ + + # Fastsync is using replica_{host|port|user|password} values from the config by default + # to avoid making heavy load on the primary source database when syncing large tables + # + # If replica_{host|port|user|password} values are not defined in the config then it's + # using the normal credentials to connect + + conn_params, is_replica = self.get_connection_parameters() + + self.is_replica = is_replica + self.conn = pymysql.connect( - # Fastsync is using bulk_sync_{host|port|user|password} values from the config by default - # to avoid making heavy load on the primary source database when syncing large tables - # - # If bulk_sync_{host|port|user|password} values are not defined in the config then it's - # using the normal credentials to connect - host=self.connection_config.get('bulk_sync_host', self.connection_config['host']), - port=int(self.connection_config.get('bulk_sync_port', self.connection_config['port'])), 
- user=self.connection_config.get('bulk_sync_user', self.connection_config['user']), - password=self.connection_config.get('bulk_sync_password', self.connection_config['password']), - charset=self.connection_config['charset'], - cursorclass=pymysql.cursors.DictCursor) + **conn_params, + cursorclass=pymysql.cursors.DictCursor, + ) self.conn_unbuffered = pymysql.connect( - # Fastsync is using bulk_sync_{host|port|user|password} values from the config by default - # to avoid making heavy load on the primary source database when syncing large tables - # - # If bulk_sync_{host|port|user|password} values are not defined in the config then it's - # using the normal credentials to connect - host=self.connection_config.get('bulk_sync_host', self.connection_config['host']), - port=int(self.connection_config.get('bulk_sync_port', self.connection_config['port'])), - user=self.connection_config.get('bulk_sync_user', self.connection_config['user']), - password=self.connection_config.get('bulk_sync_password', self.connection_config['password']), - charset=self.connection_config['charset'], - cursorclass=pymysql.cursors.SSCursor) + **conn_params, + cursorclass=pymysql.cursors.SSCursor, + ) # Set session variables by running a list of SQLs which is defined # in the optional session_sqls connection parameters @@ -84,7 +118,9 @@ def run_session_sqls(self): warnings.append(f'Could not set session variable: {sql}') if warnings: - LOGGER.warning('Encountered non-fatal errors when configuring session that could impact performance:') + LOGGER.warning( + 'Encountered non-fatal errors when configuring session that could impact performance:' + ) for warning in warnings: LOGGER.warning(warning) @@ -110,7 +146,7 @@ def query(self, query, conn=None, params=None, return_as_cursor=False, n_retry=1 conn = self.conn try: - with conn as cur: + with conn.cursor() as cur: cur.execute(query, params) if return_as_cursor: @@ -121,16 +157,22 @@ def query(self, query, conn=None, params=None, return_as_cursor=False, n_retry=1 return [] except (InterfaceError, OperationalError) as exc: - LOGGER.exception('Exception happened during running a query. Number of retries: %s. %s', n_retry, exc) + LOGGER.exception( + 'Exception happened during running a query. Number of retries: %s. 
%s', + n_retry, + exc, + ) if n_retry > 0: LOGGER.info('Reopening the connections.') self.close_connections(silent=True) self.open_connections() LOGGER.info('Retrying to run a query.') - return self.query(query, - params=params, - return_as_cursor=return_as_cursor, - n_retry=n_retry - 1) + return self.query( + query, + params=params, + return_as_cursor=return_as_cursor, + n_retry=n_retry - 1, + ) raise exc @@ -138,16 +180,28 @@ def fetch_current_log_pos(self): """ Get the actual binlog position in MySQL """ - result = self.query('SHOW MASTER STATUS') - if len(result) == 0: - raise Exception('MySQL binary logging is not enabled.') + if self.is_replica: + result = self.query('SHOW SLAVE STATUS') + if len(result) == 0: + raise Exception('MySQL binary logging is not enabled.') + binlog_pos = result[0] + log_file = binlog_pos.get('Master_Log_File') + log_pos = binlog_pos.get('Read_Master_Log_Pos') + version = binlog_pos.get('version', 1) - binlog_pos = result[0] + else: + result = self.query('SHOW MASTER STATUS') + if len(result) == 0: + raise Exception('MySQL binary logging is not enabled.') + binlog_pos = result[0] + log_file = binlog_pos.get('File') + log_pos = binlog_pos.get('Position') + version = binlog_pos.get('version', 1) return { - 'log_file': binlog_pos.get('File'), - 'log_pos': binlog_pos.get('Position'), - 'version': binlog_pos.get('version', 1) + 'log_file': log_file, + 'log_pos': log_pos, + 'version': version, } # pylint: disable=invalid-name @@ -155,9 +209,13 @@ def fetch_current_incremental_key_pos(self, table, replication_key): """ Get the actual incremental key position in the table """ - result = self.query('SELECT MAX({}) AS key_value FROM {}'.format(replication_key, table)) + result = self.query( + 'SELECT MAX({}) AS key_value FROM {}'.format(replication_key, table) + ) if len(result) == 0: - raise Exception('Cannot get replication key value for table: {}'.format(table)) + raise Exception( + 'Cannot get replication key value for table: {}'.format(table) + ) mysql_key_value = result[0].get('key_value') key_value = mysql_key_value @@ -175,7 +233,7 @@ def fetch_current_incremental_key_pos(self, table, replication_key): return { 'replication_key': replication_key, 'replication_key_value': key_value, - 'version': 1 + 'version': 1, } def get_primary_keys(self, table_name): @@ -183,11 +241,15 @@ def get_primary_keys(self, table_name): Get the primary key of a table """ table_dict = utils.tablename_to_dict(table_name) - sql = "SHOW KEYS FROM `{}`.`{}` WHERE Key_name = 'PRIMARY'".format(table_dict['schema_name'], - table_dict['table_name']) + sql = "SHOW KEYS FROM `{}`.`{}` WHERE Key_name = 'PRIMARY'".format( + table_dict['schema_name'], table_dict['table_name'] + ) pk_specs = self.query(sql) if len(pk_specs) > 0: - return [safe_column_name(k.get('Column_name'), self.target_quote) for k in pk_specs] + return [ + safe_column_name(k.get('Column_name'), self.target_quote) + for k in pk_specs + ] return None @@ -215,10 +277,10 @@ def get_table_columns(self, table_name, max_num=None, date_type='date'): table_name = table_dict.get('table_name') sql = f""" - SELECT column_name, - data_type, - column_type, - safe_sql_value + SELECT column_name AS column_name, + data_type AS data_type, + column_type AS column_type, + safe_sql_value AS safe_sql_value FROM (SELECT column_name, data_type, column_type, @@ -251,7 +313,7 @@ def get_table_columns(self, table_name, max_num=None, date_type='date'): AND table_name = '{table_name}') x ORDER BY ordinal_position - """ + """ # noqa: E501 return 
self.query(sql) def map_column_types_to_target(self, table_name): @@ -260,26 +322,32 @@ def map_column_types_to_target(self, table_name): """ mysql_columns = self.get_table_columns(table_name) mapped_columns = [ - '{} {}'.format(safe_column_name(pc.get('column_name'), self.target_quote), - self.tap_type_to_target_type(pc.get('data_type'), pc.get('column_type'))) - for pc in mysql_columns] + '{} {}'.format( + safe_column_name(pc.get('column_name'), self.target_quote), + self.tap_type_to_target_type( + pc.get('data_type'), pc.get('column_type') + ), + ) + for pc in mysql_columns + ] return { 'columns': mapped_columns, - 'primary_key': self.get_primary_keys(table_name) + 'primary_key': self.get_primary_keys(table_name), } # pylint: disable=too-many-locals - def copy_table(self, - table_name, - path, - max_num=None, - date_type='date', - split_large_files=False, - split_file_chunk_size_mb=1000, - split_file_max_chunks=20, - compress=True, - ): + def copy_table( + self, + table_name, + path, + max_num=None, + date_type='date', + split_large_files=False, + split_file_chunk_size_mb=1000, + split_file_max_chunks=20, + compress=True, + ): """ Export data from table to a zipped csv Args: @@ -303,24 +371,30 @@ def copy_table(self, ,CONVERT_TZ( NOW(),@@session.time_zone,'+00:00') AS _SDC_BATCHED_AT ,null AS _SDC_DELETED_AT FROM `{}`.`{}` - """.format(','.join(column_safe_sql_values), - table_dict['schema_name'], - table_dict['table_name']) + """.format( + ','.join(column_safe_sql_values), + table_dict['schema_name'], + table_dict['table_name'], + ) export_batch_rows = self.connection_config['export_batch_rows'] exported_rows = 0 - with self.conn_unbuffered as cur: + with self.conn_unbuffered.cursor() as cur: cur.execute(sql) - gzip_splitter = split_gzip.open(path, - mode='wt', - chunk_size_mb=split_file_chunk_size_mb, - max_chunks=split_file_max_chunks if split_large_files else 0, - compress=compress) + gzip_splitter = split_gzip.open( + path, + mode='wt', + chunk_size_mb=split_file_chunk_size_mb, + max_chunks=split_file_max_chunks if split_large_files else 0, + compress=compress, + ) with gzip_splitter as split_gzip_files: - writer = csv.writer(split_gzip_files, - delimiter=',', - quotechar='"', - quoting=csv.QUOTE_MINIMAL) + writer = csv.writer( + split_gzip_files, + delimiter=',', + quotechar='"', + quoting=csv.QUOTE_MINIMAL, + ) while True: rows = cur.fetchmany(export_batch_rows) @@ -335,9 +409,14 @@ def copy_table(self, # Then we believe this to be just an interim batch and not the final one so report on progress LOGGER.info( - 'Exporting batch from %s to %s rows from %s...', (exported_rows - export_batch_rows), - exported_rows, table_name) + 'Exporting batch from %s to %s rows from %s...', + (exported_rows - export_batch_rows), + exported_rows, + table_name, + ) # Write rows to file in one go writer.writerows(rows) - LOGGER.info('Exported total of %s rows from %s...', exported_rows, table_name) + LOGGER.info( + 'Exported total of %s rows from %s...', exported_rows, table_name + ) diff --git a/pipelinewise/fastsync/commons/tap_postgres.py b/pipelinewise/fastsync/commons/tap_postgres.py index b5e7285d1..66b094e72 100644 --- a/pipelinewise/fastsync/commons/tap_postgres.py +++ b/pipelinewise/fastsync/commons/tap_postgres.py @@ -52,7 +52,12 @@ def generate_replication_slot_name(dbname, tap_id=None, prefix='pipelinewise'): return re.sub('[^a-z0-9_]', '_', slot_name) @classmethod - def __get_slot_name(cls, connection, dbname: str, tap_id: str,) -> str: + def __get_slot_name( + cls, + connection, + 
dbname: str, + tap_id: str, + ) -> str: """ Finds the right slot name to use and returns it @@ -74,7 +79,9 @@ def __get_slot_name(cls, connection, dbname: str, tap_id: str,) -> str: try: # Backward compatibility: try to locate existing v15 slot first. PPW <= 0.15.0 with connection.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur: - cur.execute(f"SELECT * FROM pg_replication_slots WHERE slot_name = '{slot_name_v15}';") + cur.execute( + f"SELECT * FROM pg_replication_slots WHERE slot_name = '{slot_name_v15}';" + ) v15_slots_count = cur.rowcount except psycopg2.Error: @@ -102,13 +109,17 @@ def drop_slot(cls, connection_config: Dict) -> None: LOGGER.debug('Connection to Primary server created.') try: - slot_name = cls.__get_slot_name(connection, connection_config['dbname'], connection_config['tap_id']) + slot_name = cls.__get_slot_name( + connection, connection_config['dbname'], connection_config['tap_id'] + ) LOGGER.info('Dropping the slot "%s"', slot_name) # drop the replication host with connection.cursor() as cur: - cur.execute(f'SELECT pg_drop_replication_slot(slot_name) ' - f"FROM pg_replication_slots WHERE slot_name = '{slot_name}';") + cur.execute( + f'SELECT pg_drop_replication_slot(slot_name) ' + f"FROM pg_replication_slots WHERE slot_name = '{slot_name}';" + ) LOGGER.info('Number of dropped slots: %s', cur.rowcount) finally: @@ -135,7 +146,8 @@ def get_connection(cls, connection_config: Dict, prioritize_primary: bool = Fals connection_config['port'], connection_config['user'], connection_config['password'], - connection_config['dbname']) + connection_config['dbname'], + ) else: LOGGER.info('Connecting to replica') conn_string = template.format( @@ -147,8 +159,11 @@ def get_connection(cls, connection_config: Dict, prioritize_primary: bool = Fals connection_config.get('replica_host', connection_config['host']), connection_config.get('replica_port', connection_config['port']), connection_config.get('replica_user', connection_config['user']), - connection_config.get('replica_password', connection_config['password']), - connection_config['dbname']) + connection_config.get( + 'replica_password', connection_config['password'] + ), + connection_config['dbname'], + ) if 'ssl' in connection_config and connection_config['ssl'] == 'true': conn_string += " sslmode='require'" @@ -166,7 +181,9 @@ def open_connection(self): """ Open connection """ - self.conn = self.get_connection(self.connection_config, prioritize_primary=False) + self.conn = self.get_connection( + self.connection_config, prioritize_primary=False + ) self.curr = self.conn.cursor() def close_connection(self): @@ -221,12 +238,16 @@ def create_replication_slot(self): replication slot and full-resync the new taps. 
""" try: - slot_name = self.__get_slot_name(self.primary_host_conn, - self.connection_config['dbname'], - self.connection_config['tap_id']) + slot_name = self.__get_slot_name( + self.primary_host_conn, + self.connection_config['dbname'], + self.connection_config['tap_id'], + ) # Create the replication host - self.primary_host_query(f"SELECT * FROM pg_create_logical_replication_slot('{slot_name}', 'wal2json')") + self.primary_host_query( + f"SELECT * FROM pg_create_logical_replication_slot('{slot_name}', 'wal2json')" + ) except Exception as exc: # ERROR: replication slot already exists SQL state: 42710 if hasattr(exc, 'pgcode') and exc.pgcode == '42710': @@ -241,12 +262,15 @@ def fetch_current_log_pos(self): """ # Create replication slot dedicated connection # Always use Primary server for creating replication_slot - self.primary_host_conn = self.get_connection(self.connection_config, prioritize_primary=True) + self.primary_host_conn = self.get_connection( + self.connection_config, prioritize_primary=True + ) self.primary_host_curr = self.primary_host_conn.cursor() # Make sure PostgreSQL version is 9.4 or higher result = self.primary_host_query( - "SELECT setting::int AS version FROM pg_settings WHERE name='server_version_num'") + "SELECT setting::int AS version FROM pg_settings WHERE name='server_version_num'" + ) version = result[0].get('version') # Do not allow minor versions with PostgreSQL BUG #15114 @@ -275,9 +299,13 @@ def fetch_current_log_pos(self): if version >= 100000: result = self.query('SELECT pg_last_wal_replay_lsn() AS current_lsn') elif version >= 90400: - result = self.query('SELECT pg_last_xlog_replay_location() AS current_lsn') + result = self.query( + 'SELECT pg_last_xlog_replay_location() AS current_lsn' + ) else: - raise Exception('Logical replication not supported before PostgreSQL 9.4') + raise Exception( + 'Logical replication not supported before PostgreSQL 9.4' + ) else: # Get current lsn from primary host if version >= 100000: @@ -285,16 +313,15 @@ def fetch_current_log_pos(self): elif version >= 90400: result = self.query('SELECT pg_current_xlog_location() AS current_lsn') else: - raise Exception('Logical replication not supported before PostgreSQL 9.4') + raise Exception( + 'Logical replication not supported before PostgreSQL 9.4' + ) current_lsn = result[0].get('current_lsn') file, index = current_lsn.split('/') lsn = (int(file, 16) << 32) + int(index, 16) - return { - 'lsn': lsn, - 'version': 1 - } + return {'lsn': lsn, 'version': 1} # pylint: disable=invalid-name def fetch_current_incremental_key_pos(self, table, replication_key): @@ -302,9 +329,13 @@ def fetch_current_incremental_key_pos(self, table, replication_key): Get the actual incremental key position in the table """ schema_name, table_name = table.split('.') - result = self.query(f'SELECT MAX({replication_key}) AS key_value FROM {schema_name}."{table_name}"') + result = self.query( + f'SELECT MAX({replication_key}) AS key_value FROM {schema_name}."{table_name}"' + ) if len(result) == 0: - raise Exception('Cannot get replication key value for table: {}'.format(table)) + raise Exception( + 'Cannot get replication key value for table: {}'.format(table) + ) postgres_key_value = result[0].get('key_value') key_value = postgres_key_value @@ -322,7 +353,7 @@ def fetch_current_incremental_key_pos(self, table, replication_key): return { 'replication_key': replication_key, 'replication_key_value': key_value, - 'version': 1 + 'version': 1, } def get_primary_keys(self, table): @@ -339,7 +370,9 @@ def 
get_primary_keys(self, table): pg_class.relnamespace = pg_namespace.oid AND pg_attribute.attrelid = pg_class.oid AND pg_attribute.attnum = any(pg_index.indkey) - AND indisprimary""".format(schema_name, table_name) + AND indisprimary""".format( + schema_name, table_name + ) pk_specs = self.query(sql) if len(pk_specs) > 0: return [safe_column_name(k[0], self.target_quote) for k in pk_specs] @@ -356,7 +389,7 @@ def get_table_columns(self, table_name, max_num=None, date_type='date'): decimals = len(max_num.split('.')[1]) if '.' in max_num else 0 decimal_format = f""" 'CASE WHEN "' || column_name || '" IS NULL THEN NULL ELSE GREATEST(LEAST({max_num}, ROUND("' || column_name || '"::numeric , {decimals})), -{max_num}) END' - """ + """ # noqa E501 integer_format = """ '"' || column_name || '"' """ @@ -380,7 +413,10 @@ def get_table_columns(self, table_name, max_num=None, date_type='date'): data_type, CASE WHEN data_type = 'ARRAY' THEN 'array_to_json("' || column_name || '") AS ' || column_name - WHEN data_type = 'date' THEN column_name || '::{date_type} AS ' || column_name + WHEN data_type = 'date' THEN + 'CASE WHEN "' ||column_name|| E'" < \\'0001-01-01\\' ' + 'OR "' ||column_name|| E'" > \\'9999-12-31\\' THEN \\'9999-12-31\\' ' + 'ELSE "' ||column_name|| '"::{date_type} END AS "' ||column_name|| '"' WHEN udt_name = 'time' THEN 'replace("' || column_name || E'"::varchar,\\\'24:00:00\\\',\\\'00:00:00\\\') AS ' || column_name WHEN udt_name = 'timetz' THEN 'replace(("' || column_name || E'" at time zone \'\'UTC\'\')::time::varchar,\\\'24:00:00\\\',\\\'00:00:00\\\') AS ' || column_name WHEN udt_name in ('timestamp', 'timestamptz') THEN @@ -397,7 +433,7 @@ def get_table_columns(self, table_name, max_num=None, date_type='date'): AND table_name = '{table_name}' ORDER BY ordinal_position ) AS x - """ + """ # noqa: E501 return self.query(sql) def map_column_types_to_target(self, table_name): @@ -412,25 +448,28 @@ def map_column_types_to_target(self, table_name): # most targets would want to map length 1 to boolean and the rest to number if isinstance(column_type, list): column_type = column_type[1 if pc[3] > 1 else 0] - mapping = '{} {}'.format(safe_column_name(pc[0], self.target_quote), column_type) + mapping = '{} {}'.format( + safe_column_name(pc[0], self.target_quote), column_type + ) mapped_columns.append(mapping) return { 'columns': mapped_columns, - 'primary_key': self.get_primary_keys(table_name) + 'primary_key': self.get_primary_keys(table_name), } # pylint: disable=too-many-arguments - def copy_table(self, - table_name, - path, - max_num=None, - date_type='date', - split_large_files=False, - split_file_chunk_size_mb=1000, - split_file_max_chunks=20, - compress=True - ): + def copy_table( + self, + table_name, + path, + max_num=None, + date_type='date', + split_large_files=False, + split_file_chunk_size_mb=1000, + split_file_max_chunks=20, + compress=True, + ): """ Export data from table to a zipped csv Args: @@ -455,14 +494,18 @@ def copy_table(self, ,now() AT TIME ZONE 'UTC' ,null FROM {}."{}") TO STDOUT with CSV DELIMITER ',' - """.format(','.join(column_safe_sql_values), schema_name, table_name) + """.format( + ','.join(column_safe_sql_values), schema_name, table_name + ) LOGGER.info('Exporting data: %s', sql) - gzip_splitter = split_gzip.open(path, - mode='wb', - chunk_size_mb=split_file_chunk_size_mb, - max_chunks=split_file_max_chunks if split_large_files else 0, - compress=compress) + gzip_splitter = split_gzip.open( + path, + mode='wb', + chunk_size_mb=split_file_chunk_size_mb, + 
max_chunks=split_file_max_chunks if split_large_files else 0, + compress=compress, + ) with gzip_splitter as split_gzip_files: self.curr.copy_expert(sql, split_gzip_files, size=131072) diff --git a/pipelinewise/fastsync/commons/tap_s3_csv.py b/pipelinewise/fastsync/commons/tap_s3_csv.py index e23f60e42..772982a17 100644 --- a/pipelinewise/fastsync/commons/tap_s3_csv.py +++ b/pipelinewise/fastsync/commons/tap_s3_csv.py @@ -9,7 +9,14 @@ from datetime import datetime from time import struct_time from typing import Callable, Dict, List, Optional, Set -from messytables import (CSVTableSet, headers_guess, headers_processor, jts, offset_processor, type_guess) +from messytables import ( + CSVTableSet, + headers_guess, + headers_processor, + jts, + offset_processor, + type_guess, +) from singer.utils import strptime_with_tz from singer_encodings import csv as singer_encodings_csv @@ -26,7 +33,12 @@ class FastSyncTapS3Csv: """ # pylint: disable=bare-except - def __init__(self, connection_config: Dict, tap_type_to_target_type: Callable, target_quote=None): + def __init__( + self, + connection_config: Dict, + tap_type_to_target_type: Callable, + target_quote=None, + ): """ Constructor :param connection_config: tap connection config @@ -34,10 +46,16 @@ def __init__(self, connection_config: Dict, tap_type_to_target_type: Callable, t """ try: # Check if bucket can be accessed without credentials/assuming role - list(S3Helper.list_files_in_bucket(connection_config['bucket'], - connection_config.get('aws_endpoint_url', None))) - LOGGER.info('I have direct access to the bucket without assuming the configured role.') - except: + list( + S3Helper.list_files_in_bucket( + connection_config['bucket'], + connection_config.get('aws_endpoint_url', None), + ) + ) + LOGGER.info( + 'I have direct access to the bucket without assuming the configured role.' 
+ ) + except Exception: # Setup AWS session S3Helper.setup_aws_client(connection_config) @@ -48,7 +66,12 @@ def __init__(self, connection_config: Dict, tap_type_to_target_type: Callable, t def _find_table_spec_by_name(self, table_name: str) -> Dict: # look in tables array for the full specs dict of given table - return next(filter(lambda x: x['table_name'] == table_name, self.connection_config['tables'])) + return next( + filter( + lambda x: x['table_name'] == table_name, + self.connection_config['tables'], + ) + ) def copy_table(self, table_name: str, file_path: str) -> None: """ @@ -67,7 +90,9 @@ def copy_table(self, table_name: str, file_path: str) -> None: modified_since = strptime_with_tz(self.connection_config['start_date']) # get all the files in the bucket that match the criteria and were modified after start date - s3_files = S3Helper.get_input_files_for_table(self.connection_config, table_spec, modified_since) + s3_files = S3Helper.get_input_files_for_table( + self.connection_config, table_spec, modified_since + ) # variable to hold all the records from all matching files records = [] @@ -84,7 +109,10 @@ def copy_table(self, table_name: str, file_path: str) -> None: self._get_file_records(s3_file['key'], table_spec, records, headers) # check if the current file has the most recent modification date - if max_last_modified is None or max_last_modified < s3_file['last_modified']: + if ( + max_last_modified is None + or max_last_modified < s3_file['last_modified'] + ): max_last_modified = s3_file['last_modified'] # add the found last modified date to the dictionary @@ -93,19 +121,23 @@ def copy_table(self, table_name: str, file_path: str) -> None: # write to the given compressed csv file with gzip.open(file_path, 'wt') as gzfile: - writer = csv.DictWriter(gzfile, - fieldnames=sorted(list(headers)), - # we need to sort the headers so that copying into snowflake works - delimiter=',', - quotechar='"', - quoting=csv.QUOTE_MINIMAL) + writer = csv.DictWriter( + gzfile, + fieldnames=sorted(list(headers)), + # we need to sort the headers so that copying into snowflake works + delimiter=',', + quotechar='"', + quoting=csv.QUOTE_MINIMAL, + ) # write the header writer.writeheader() # write all records at once writer.writerows(records) # pylint: disable=too-many-locals - def _get_file_records(self, s3_path: str, table_spec: Dict, records: List[Dict], headers: Set) -> None: + def _get_file_records( + self, s3_path: str, table_spec: Dict, records: List[Dict], headers: Set + ) -> None: """ Reads the file in s3_path and inserts the rows in records :param config: tap connection configuration @@ -129,7 +161,9 @@ def _get_file_records(self, s3_path: str, table_spec: Dict, records: List[Dict], csv.field_size_limit(sys.maxsize) # pylint:disable=protected-access - iterator = singer_encodings_csv.get_row_iterator(s3_file_handle._raw_stream, table_spec) + iterator = singer_encodings_csv.get_row_iterator( + s3_file_handle._raw_stream, table_spec + ) records_copied = len(records) @@ -141,7 +175,7 @@ def _get_file_records(self, s3_path: str, table_spec: Dict, records: List[Dict], S3Helper.SDC_SOURCE_LINENO_COLUMN: records_copied + 1, '_SDC_EXTRACTED_AT': now_datetime, '_SDC_BATCHED_AT': now_datetime, - '_SDC_DELETED_AT': None + '_SDC_DELETED_AT': None, } new_row = {} @@ -171,20 +205,26 @@ def map_column_types_to_target(self, filepath: str, table: str): # use timestamp as a type instead if column is set in date_overrides configuration mapped_columns = [] - date_overrides = None if 'date_overrides' not in specs 
\ - else {safe_column_name(c, self.target_quote) for c in specs['date_overrides']} + date_overrides = ( + None + if 'date_overrides' not in specs + else { + safe_column_name(c, self.target_quote) for c in specs['date_overrides'] + } + ) for column_name, column_type in csv_columns: if date_overrides and column_name in date_overrides: - mapped_columns.append(f'{column_name} {self.tap_type_to_target_type("date_override")}') + mapped_columns.append( + f'{column_name} {self.tap_type_to_target_type("date_override")}' + ) else: - mapped_columns.append(f'{column_name} {self.tap_type_to_target_type(column_type)}') + mapped_columns.append( + f'{column_name} {self.tap_type_to_target_type(column_type)}' + ) - return { - 'columns': mapped_columns, - 'primary_key': self._get_primary_keys(specs) - } + return {'columns': mapped_columns, 'primary_key': self._get_primary_keys(specs)} def _get_table_columns(self, csv_file_path: str) -> zip: """ @@ -203,12 +243,15 @@ def _get_table_columns(self, csv_file_path: str) -> zip: row_set.register_processor(offset_processor(offset + 1)) - types = list(map(jts.celltype_as_string, type_guess(row_set.sample, strict=True))) + types = list( + map(jts.celltype_as_string, type_guess(row_set.sample, strict=True)) + ) return zip(headers, types) # pylint: disable=invalid-name - def fetch_current_incremental_key_pos(self, table: str, - replication_key: Optional[str] = 'modified_since') -> Optional[Dict]: + def fetch_current_incremental_key_pos( + self, table: str, replication_key: Optional[str] = 'modified_since' + ) -> Optional[Dict]: """ Returns the last time a the table has been modified in ISO format. :param table: table name @@ -217,9 +260,11 @@ def fetch_current_incremental_key_pos(self, table: str, """ replication_key = 'modified_since' - return { - replication_key: self.tables_last_modified[table].isoformat() - } if table in self.tables_last_modified else {} + return ( + {replication_key: self.tables_last_modified[table].isoformat()} + if table in self.tables_last_modified + else {} + ) def _get_primary_keys(self, table_specs: Dict) -> Optional[List]: """ @@ -229,7 +274,10 @@ def _get_primary_keys(self, table_specs: Dict) -> Optional[List]: :return: the keys concatenated and separated by comma if keys are given, otherwise None """ if table_specs.get('key_properties', False): - return [safe_column_name(k, self.target_quote) for k in table_specs['key_properties']] + return [ + safe_column_name(k, self.target_quote) + for k in table_specs['key_properties'] + ] return None @@ -240,6 +288,7 @@ class S3Helper: """ S3 helper methods """ + SDC_SOURCE_BUCKET_COLUMN = '_sdc_source_bucket' SDC_SOURCE_FILE_COLUMN = '_sdc_source_file' SDC_SOURCE_LINENO_COLUMN = '_sdc_source_lineno' @@ -254,9 +303,15 @@ def setup_aws_client(cls, config: Dict) -> None: LOGGER.info('Attempting to create AWS session') # Get the required parameters from config file and/or environment variables - aws_access_key_id = config.get('aws_access_key_id') or os.environ.get('AWS_ACCESS_KEY_ID') - aws_secret_access_key = config.get('aws_secret_access_key') or os.environ.get('AWS_SECRET_ACCESS_KEY') - aws_session_token = config.get('aws_session_token') or os.environ.get('AWS_SESSION_TOKEN') + aws_access_key_id = config.get('aws_access_key_id') or os.environ.get( + 'AWS_ACCESS_KEY_ID' + ) + aws_secret_access_key = config.get('aws_secret_access_key') or os.environ.get( + 'AWS_SECRET_ACCESS_KEY' + ) + aws_session_token = config.get('aws_session_token') or os.environ.get( + 'AWS_SESSION_TOKEN' + ) aws_profile = 
config.get('aws_profile') or os.environ.get('AWS_PROFILE') # AWS credentials based authentication @@ -264,14 +319,16 @@ def setup_aws_client(cls, config: Dict) -> None: boto3.setup_default_session( aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key, - aws_session_token=aws_session_token + aws_session_token=aws_session_token, ) # AWS Profile based authentication, will use IAM role if no profile is found else: boto3.setup_default_session(profile_name=aws_profile) @classmethod - def get_input_files_for_table(cls, config: Dict, table_spec: Dict, modified_since: struct_time = None): + def get_input_files_for_table( + cls, config: Dict, table_spec: Dict, modified_since: struct_time = None + ): bucket = config['bucket'] prefix = table_spec.get('search_prefix') pattern = table_spec['search_pattern'] @@ -279,10 +336,14 @@ def get_input_files_for_table(cls, config: Dict, table_spec: Dict, modified_sinc try: matcher = re.compile(pattern) except re.error as exc: - raise ValueError((f'search_pattern for table `{table_spec["table_name"]}` is not a valid regular ' - 'expression. See ' - 'https://docs.python.org/3.5/library/re.html#regular-expression-syntax'), - pattern) from exc + raise ValueError( + ( + f'search_pattern for table `{table_spec["table_name"]}` is not a valid regular ' + 'expression. See ' + 'https://docs.python.org/3.5/library/re.html#regular-expression-syntax' + ), + pattern, + ) from exc LOGGER.info('Checking bucket "%s" for keys matching "%s"', bucket, pattern) @@ -290,7 +351,9 @@ def get_input_files_for_table(cls, config: Dict, table_spec: Dict, modified_sinc unmatched_files_count = 0 max_files_before_log = 30000 - for s3_object in cls.list_files_in_bucket(bucket, prefix, aws_endpoint_url=config.get('aws_endpoint_url')): + for s3_object in cls.list_files_in_bucket( + bucket, prefix, aws_endpoint_url=config.get('aws_endpoint_url') + ): key = s3_object['Key'] last_modified = s3_object['LastModified'] @@ -302,29 +365,48 @@ def get_input_files_for_table(cls, config: Dict, table_spec: Dict, modified_sinc if matcher.search(key): matched_files_count += 1 if modified_since is None or modified_since < last_modified: - LOGGER.info('Will download key "%s" as it was last modified %s', key, last_modified) + LOGGER.info( + 'Will download key "%s" as it was last modified %s', + key, + last_modified, + ) yield {'key': key, 'last_modified': last_modified} else: unmatched_files_count += 1 - if (unmatched_files_count + matched_files_count) % max_files_before_log == 0: + if ( + unmatched_files_count + matched_files_count + ) % max_files_before_log == 0: # Are we skipping greater than 50% of the files? # pylint: disable=old-division - if (unmatched_files_count / (matched_files_count + unmatched_files_count)) > 0.5: - LOGGER.info('Found %s matching files and %s non-matching files. ' - 'You should consider adding a `search_prefix` to the config ' - 'or removing non-matching files from the bucket.', - matched_files_count, unmatched_files_count) + if ( + unmatched_files_count + / (matched_files_count + unmatched_files_count) + ) > 0.5: + LOGGER.info( + 'Found %s matching files and %s non-matching files. 
' + 'You should consider adding a `search_prefix` to the config ' + 'or removing non-matching files from the bucket.', + matched_files_count, + unmatched_files_count, + ) else: - LOGGER.info('Found %s matching files and %s non-matching files', - matched_files_count, unmatched_files_count) + LOGGER.info( + 'Found %s matching files and %s non-matching files', + matched_files_count, + unmatched_files_count, + ) if matched_files_count == 0: if prefix: - raise Exception(f'No files found in bucket "{bucket}" ' - f'that matches prefix "{prefix}" and pattern "{pattern}"') + raise Exception( + f'No files found in bucket "{bucket}" ' + f'that matches prefix "{prefix}" and pattern "{pattern}"' + ) - raise Exception(f'No files found in bucket "{bucket}" that matches pattern "{pattern}"') + raise Exception( + f'No files found in bucket "{bucket}" that matches pattern "{pattern}"' + ) @classmethod @retry_pattern() @@ -356,7 +438,11 @@ def list_files_in_bucket(cls, bucket, search_prefix=None, aws_endpoint_url=None) if s3_object_count > 0: LOGGER.info('Found %s files.', s3_object_count) else: - LOGGER.info('Found no files for bucket "%s" that match prefix "%s"', bucket, search_prefix) + LOGGER.info( + 'Found no files for bucket "%s" that match prefix "%s"', + bucket, + search_prefix, + ) @classmethod @retry_pattern() diff --git a/pipelinewise/fastsync/commons/target_bigquery.py b/pipelinewise/fastsync/commons/target_bigquery.py index 60ceb226d..aa5a10a1e 100644 --- a/pipelinewise/fastsync/commons/target_bigquery.py +++ b/pipelinewise/fastsync/commons/target_bigquery.py @@ -33,6 +33,7 @@ class FastSyncTargetBigquery: """ Common functions for fastsync to BigQuery """ + def __init__(self, connection_config, transformation_config=None): self.connection_config = connection_config self.transformation_config = transformation_config @@ -48,7 +49,7 @@ def to_query_parameter(value): value_type = 'INT64' elif isinstance(value, float): value_type = 'NUMERIC' - #TODO: repeated float here and in target + # TODO: repeated float here and in target elif isinstance(value, float): value_type = 'FLOAT64' elif isinstance(value, bool): @@ -91,27 +92,45 @@ def create_schema(self, schema_name): def drop_table(self, target_schema, table_name, is_temporary=False): table_dict = utils.tablename_to_dict(table_name) - target_table = safe_name(table_dict.get('table_name' if not is_temporary else 'temp_table_name')) + target_table = safe_name( + table_dict.get('table_name' if not is_temporary else 'temp_table_name') + ) sql = 'DROP TABLE IF EXISTS {}.{}'.format(target_schema, target_table.lower()) self.query(sql) - def create_table(self, target_schema: str, table_name: str, columns: List[str], - is_temporary: bool = False, sort_columns=False): + def create_table( + self, + target_schema: str, + table_name: str, + columns: List[str], + is_temporary: bool = False, + sort_columns=False, + ): table_dict = utils.tablename_to_dict(table_name) - target_table = safe_name(table_dict.get('table_name' if not is_temporary else 'temp_table_name').lower()) + target_table = safe_name( + table_dict.get( + 'table_name' if not is_temporary else 'temp_table_name' + ).lower() + ) # skip the EXTRACTED, BATCHED and DELETED columns in case they exist because they gonna be added later - columns = [c for c in columns if not ( - c.upper().startswith(utils.SDC_EXTRACTED_AT.upper()) or - c.upper().startswith(utils.SDC_BATCHED_AT.upper()) or - c.upper().startswith(utils.SDC_DELETED_AT.upper()))] - - columns += [f'{utils.SDC_EXTRACTED_AT} TIMESTAMP', - 
f'{utils.SDC_BATCHED_AT} TIMESTAMP', - f'{utils.SDC_DELETED_AT} TIMESTAMP' - ] + columns = [ + c + for c in columns + if not ( + c.upper().startswith(utils.SDC_EXTRACTED_AT.upper()) + or c.upper().startswith(utils.SDC_BATCHED_AT.upper()) + or c.upper().startswith(utils.SDC_DELETED_AT.upper()) + ) + ] + + columns += [ + f'{utils.SDC_EXTRACTED_AT} TIMESTAMP', + f'{utils.SDC_BATCHED_AT} TIMESTAMP', + f'{utils.SDC_DELETED_AT} TIMESTAMP', + ] # We need the sort the columns for some taps( for now tap-s3-csv) # because later on when copying a csv file into Snowflake @@ -121,18 +140,33 @@ def create_table(self, target_schema: str, table_name: str, columns: List[str], columns = [c.lower() for c in columns] - sql = f'CREATE OR REPLACE TABLE {target_schema}.{target_table} (' \ - f'{",".join(columns)})' + sql = ( + f'CREATE OR REPLACE TABLE {target_schema}.{target_table} (' + f'{",".join(columns)})' + ) self.query(sql) # pylint: disable=R0913,R0914 - def copy_to_table(self, filepath, target_schema, table_name, size_bytes, is_temporary, - skip_csv_header=False, allow_quoted_newlines=True, write_truncate=True): + def copy_to_table( + self, + filepath, + target_schema, + table_name, + size_bytes, + is_temporary, + skip_csv_header=False, + allow_quoted_newlines=True, + write_truncate=True, + ): LOGGER.info('BIGQUERY - Loading %s into Bigquery...', filepath) table_dict = utils.tablename_to_dict(table_name) - target_table = safe_name(table_dict.get('table_name' if not is_temporary else 'temp_table_name').lower(), - quotes=False) + target_table = safe_name( + table_dict.get( + 'table_name' if not is_temporary else 'temp_table_name' + ).lower(), + quotes=False, + ) client = self.open_connection() dataset_ref = client.dataset(target_schema) @@ -141,11 +175,15 @@ def copy_to_table(self, filepath, target_schema, table_name, size_bytes, is_temp job_config = bigquery.LoadJobConfig() job_config.source_format = bigquery.SourceFormat.CSV job_config.schema = table_schema - job_config.write_disposition = 'WRITE_TRUNCATE' if write_truncate else 'WRITE_APPEND' + job_config.write_disposition = ( + 'WRITE_TRUNCATE' if write_truncate else 'WRITE_APPEND' + ) job_config.allow_quoted_newlines = allow_quoted_newlines job_config.skip_leading_rows = 1 if skip_csv_header else 0 with open(filepath, 'rb') as exported_data: - job = client.load_table_from_file(exported_data, table_ref, job_config=job_config) + job = client.load_table_from_file( + exported_data, table_ref, job_config=job_config + ) try: job.result() except exceptions.BadRequest as exc: @@ -156,10 +194,12 @@ def copy_to_table(self, filepath, target_schema, table_name, size_bytes, is_temp LOGGER.info('Job %s', job) LOGGER.info('Job.output_rows %s', job.output_rows) inserts = job.output_rows - LOGGER.info('Loading into %s."%s": %s', - target_schema, - target_table, - json.dumps({'inserts': inserts, 'updates': 0, 'size_bytes': size_bytes})) + LOGGER.info( + 'Loading into %s."%s": %s', + target_schema, + target_table, + json.dumps({'inserts': inserts, 'updates': 0, 'size_bytes': size_bytes}), + ) LOGGER.info(job.errors) @@ -167,12 +207,18 @@ def copy_to_table(self, filepath, target_schema, table_name, size_bytes, is_temp # "to_group" is not used here but exists for compatibility reasons with other database types # "to_group" is for databases that can grant to users and groups separately like Amazon Redshift # pylint: disable=unused-argument - def grant_select_on_table(self, target_schema, table_name, role, is_temporary, to_group=False): + def grant_select_on_table( + self, 
target_schema, table_name, role, is_temporary, to_group=False + ): # Grant role is not mandatory parameter, do nothing if not specified if role: table_dict = utils.tablename_to_dict(table_name) - target_table = safe_name(table_dict.get('table_name' if not is_temporary else 'temp_table_name')) - sql = 'GRANT SELECT ON {}.{} TO ROLE {}'.format(target_schema, target_table, role) + target_table = safe_name( + table_dict.get('table_name' if not is_temporary else 'temp_table_name') + ) + sql = 'GRANT SELECT ON {}.{} TO ROLE {}'.format( + target_schema, target_table, role + ) self.query(sql) # pylint: disable=unused-argument @@ -186,7 +232,9 @@ def grant_usage_on_schema(self, target_schema, role, to_group=False): def grant_select_on_schema(self, target_schema, role, to_group=False): # Grant role is not mandatory parameter, do nothing if not specified if role: - sql = 'GRANT SELECT ON ALL TABLES IN SCHEMA {} TO ROLE {}'.format(target_schema, role) + sql = 'GRANT SELECT ON ALL TABLES IN SCHEMA {} TO ROLE {}'.format( + target_schema, role + ) self.query(sql) def obfuscate_columns(self, target_schema: str, table_name: str): @@ -207,21 +255,22 @@ def obfuscate_columns(self, target_schema: str, table_name: str): # # We need to convert to the same format to find the transformation # has that has to be applied - tap_stream_name_by_table_name = '{}-{}'.format(table_dict['schema_name'], table_dict['table_name']) \ - if table_dict['schema_name'] is not None else table_dict['table_name'] + tap_stream_name_by_table_name = ( + '{}-{}'.format(table_dict['schema_name'], table_dict['table_name']) + if table_dict['schema_name'] is not None + else table_dict['table_name'] + ) # Find obfuscation rules for the current table # trans_map = self.__get_stream_transformation_map(tap_stream_name_by_table_name, transformations) trans_map = TransformationHelper.get_trans_in_sql_flavor( - tap_stream_name_by_table_name, - transformations, - SQLFlavor('bigquery')) + tap_stream_name_by_table_name, transformations, SQLFlavor('bigquery') + ) self.__apply_transformations(trans_map, target_schema, temp_table) LOGGER.info('Obfuscation rules applied.') - def swap_tables(self, schema, table_name): project_id = self.connection_config['project_id'] table_dict = utils.tablename_to_dict(table_name) @@ -243,7 +292,9 @@ def swap_tables(self, schema, table_name): # delete the temp table client.delete_table(temp_table_id) - def __apply_transformations(self, transformations: List[Dict], target_schema: str, table_name: str) -> None: + def __apply_transformations( + self, transformations: List[Dict], target_schema: str, table_name: str + ) -> None: """ Generate and execute the SQL queries based on the given transformations. 
Args: @@ -251,7 +302,9 @@ def __apply_transformations(self, transformations: List[Dict], target_schema: st target_schema: name of the target schema where the table lives table_name: the table name on which we want to apply the transformations """ - full_qual_table_name = '{}.{}'.format(safe_name(target_schema), safe_name(table_name)) + full_qual_table_name = '{}.{}'.format( + safe_name(target_schema), safe_name(table_name) + ) if transformations: all_cols_update_sql = '' @@ -263,8 +316,10 @@ def __apply_transformations(self, transformations: List[Dict], target_schema: st # If we have conditions, then we need to construct the query and execute it to transform the # single column conditionally if trans_item['conditions']: - sql = f'UPDATE {full_qual_table_name} ' \ - f'SET {trans_item["trans"]} WHERE {trans_item["conditions"]};' + sql = ( + f'UPDATE {full_qual_table_name} ' + f'SET {trans_item["trans"]} WHERE {trans_item["conditions"]};' + ) self.query(sql) @@ -276,7 +331,9 @@ def __apply_transformations(self, transformations: List[Dict], target_schema: st if not all_cols_update_sql: all_cols_update_sql = trans_item['trans'] else: - all_cols_update_sql = f'{all_cols_update_sql}, {trans_item["trans"]}' + all_cols_update_sql = ( + f'{all_cols_update_sql}, {trans_item["trans"]}' + ) # If we have some non-conditional transformations then construct and execute a query if all_cols_update_sql: diff --git a/pipelinewise/fastsync/commons/target_postgres.py b/pipelinewise/fastsync/commons/target_postgres.py index 0b7f201ae..c8b025b87 100644 --- a/pipelinewise/fastsync/commons/target_postgres.py +++ b/pipelinewise/fastsync/commons/target_postgres.py @@ -32,7 +32,7 @@ def open_connection(self): self.connection_config['dbname'], self.connection_config['user'], self.connection_config['password'], - self.connection_config['port'] + self.connection_config['port'], ) if 'ssl' in self.connection_config and self.connection_config['ssl'] == 'true': @@ -62,25 +62,48 @@ def create_schemas(self, tables): def drop_table(self, target_schema, table_name, is_temporary=False): table_dict = utils.tablename_to_dict(table_name) - target_table = table_dict.get('table_name') if not is_temporary else table_dict.get('temp_table_name') + target_table = ( + table_dict.get('table_name') + if not is_temporary + else table_dict.get('temp_table_name') + ) sql = 'DROP TABLE IF EXISTS {}."{}"'.format(target_schema, target_table.lower()) self.query(sql) - def create_table(self, target_schema: str, table_name: str, columns: List[str], primary_key: List[str], - is_temporary: bool = False, sort_columns=False): + def create_table( + self, + target_schema: str, + table_name: str, + columns: List[str], + primary_key: List[str], + is_temporary: bool = False, + sort_columns=False, + ): table_dict = utils.tablename_to_dict(table_name) - target_table = table_dict.get('table_name') if not is_temporary else table_dict.get('temp_table_name') + target_table = ( + table_dict.get('table_name') + if not is_temporary + else table_dict.get('temp_table_name') + ) # skip the EXTRACTED, BATCHED and DELETED columns in case they exist because they gonna be added later - columns = [c for c in columns if not (c.startswith(self.EXTRACTED_AT_COLUMN) or - c.startswith(self.BATCHED_AT_COLUMN) or - c.startswith(self.DELETED_AT_COLUMN))] - - columns += [f'{self.EXTRACTED_AT_COLUMN} TIMESTAMP WITHOUT TIME ZONE', - f'{self.BATCHED_AT_COLUMN} TIMESTAMP WITHOUT TIME ZONE', - f'{self.DELETED_AT_COLUMN} CHARACTER VARYING'] + columns = [ + c + for c in columns + if not ( + 
c.startswith(self.EXTRACTED_AT_COLUMN) + or c.startswith(self.BATCHED_AT_COLUMN) + or c.startswith(self.DELETED_AT_COLUMN) + ) + ] + + columns += [ + f'{self.EXTRACTED_AT_COLUMN} TIMESTAMP WITHOUT TIME ZONE', + f'{self.BATCHED_AT_COLUMN} TIMESTAMP WITHOUT TIME ZONE', + f'{self.DELETED_AT_COLUMN} CHARACTER VARYING', + ] # We need the sort the columns for some taps( for now tap-s3-csv) # because later on when copying a csv file into Snowflake @@ -90,17 +113,30 @@ def create_table(self, target_schema: str, table_name: str, columns: List[str], sql_columns = ','.join(columns).lower() sql_primary_keys = ','.join(primary_key).lower() if primary_key else None - sql = f'CREATE TABLE IF NOT EXISTS {target_schema}."{target_table.lower()}" (' \ - f'{sql_columns}' \ - f'{f", PRIMARY KEY ({sql_primary_keys}))" if primary_key else ")"}' + sql = ( + f'CREATE TABLE IF NOT EXISTS {target_schema}."{target_table.lower()}" (' + f'{sql_columns}' + f'{f", PRIMARY KEY ({sql_primary_keys}))" if primary_key else ")"}' + ) self.query(sql) - def copy_to_table(self, filepath, target_schema: str, table_name: str, size_bytes: int, - is_temporary: bool = False, skip_csv_header: bool = False): + def copy_to_table( + self, + filepath, + target_schema: str, + table_name: str, + size_bytes: int, + is_temporary: bool = False, + skip_csv_header: bool = False, + ): LOGGER.info('Loading %s into Postgres...', filepath) table_dict = utils.tablename_to_dict(table_name) - target_table = table_dict.get('table_name') if not is_temporary else table_dict.get('temp_table_name') + target_table = ( + table_dict.get('table_name') + if not is_temporary + else table_dict.get('temp_table_name') + ) with self.open_connection() as connection: with connection.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur: @@ -114,21 +150,33 @@ def copy_to_table(self, filepath, target_schema: str, table_name: str, size_byte cur.copy_expert(copy_sql, file) inserts = cur.rowcount - LOGGER.info('Loading into %s."%s": %s', - target_schema, - target_table.lower(), - json.dumps({'inserts': inserts, 'updates': 0, 'size_bytes': size_bytes})) + LOGGER.info( + 'Loading into %s."%s": %s', + target_schema, + target_table.lower(), + json.dumps( + {'inserts': inserts, 'updates': 0, 'size_bytes': size_bytes} + ), + ) # grant_... 
functions are common functions called by utils.py: grant_privilege function # "to_group" is not used here but exists for compatibility reasons with other database types # "to_group" is for databases that can grant to users and groups separately like Amazon Redshift # pylint: disable=unused-argument - def grant_select_on_table(self, target_schema, table_name, role, is_temporary, to_group=False): + def grant_select_on_table( + self, target_schema, table_name, role, is_temporary, to_group=False + ): # Grant role is not mandatory parameter, do nothing if not specified if role: table_dict = utils.tablename_to_dict(table_name) - target_table = table_dict.get('table_name') if not is_temporary else table_dict.get('temp_table_name') - sql = 'GRANT SELECT ON {}."{}" TO GROUP {}'.format(target_schema, target_table.lower(), role) + target_table = ( + table_dict.get('table_name') + if not is_temporary + else table_dict.get('temp_table_name') + ) + sql = 'GRANT SELECT ON {}."{}" TO GROUP {}'.format( + target_schema, target_table.lower(), role + ) self.query(sql) # pylint: disable=unused-argument @@ -142,10 +190,14 @@ def grant_usage_on_schema(self, target_schema, role, to_group=False): def grant_select_on_schema(self, target_schema, role, to_group=False): # Grant role is not mandatory parameter, do nothing if not specified if role: - sql = 'GRANT SELECT ON ALL TABLES IN SCHEMA {} TO GROUP {}'.format(target_schema, role) + sql = 'GRANT SELECT ON ALL TABLES IN SCHEMA {} TO GROUP {}'.format( + target_schema, role + ) self.query(sql) - def obfuscate_columns(self, target_schema: str, table_name: str, is_temporary: bool = False): + def obfuscate_columns( + self, target_schema: str, table_name: str, is_temporary: bool = False + ): """ Apply any configured transformations to the given table Args: @@ -155,7 +207,11 @@ def obfuscate_columns(self, target_schema: str, table_name: str, is_temporary: b LOGGER.info('Starting obfuscation rules...') table_dict = utils.tablename_to_dict(table_name) - target_table = table_dict.get('table_name') if not is_temporary else table_dict.get('temp_table_name') + target_table = ( + table_dict.get('table_name') + if not is_temporary + else table_dict.get('temp_table_name') + ) transformations = self.transformation_config.get('transformations', []) # Input table_name is formatted as {{schema}}.{{table}} @@ -163,12 +219,13 @@ def obfuscate_columns(self, target_schema: str, table_name: str, is_temporary: b # # We need to convert to the same format to find the transformation # has that has to be applied - tap_stream_name_by_table_name = '{}-{}'.format(table_dict.get('schema_name'), table_dict.get('table_name')) + tap_stream_name_by_table_name = '{}-{}'.format( + table_dict.get('schema_name'), table_dict.get('table_name') + ) trans_cols = TransformationHelper.get_trans_in_sql_flavor( - tap_stream_name_by_table_name, - transformations, - SQLFlavor('postgres')) + tap_stream_name_by_table_name, transformations, SQLFlavor('postgres') + ) self.__apply_transformations(trans_cols, target_schema, target_table) @@ -181,7 +238,11 @@ def swap_tables(self, schema, table_name): # Swap tables and drop the temp tamp self.query('DROP TABLE IF EXISTS {}."{}"'.format(schema, target_table.lower())) - self.query('ALTER TABLE {}."{}" RENAME TO "{}"'.format(schema, temp_table.lower(), target_table.lower())) + self.query( + 'ALTER TABLE {}."{}" RENAME TO "{}"'.format( + schema, temp_table.lower(), target_table.lower() + ) + ) def __apply_transformations(self, transformations, target_schema, table_name): """ 
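For readers following the `__apply_transformations` hunks above and below: the control flow the reformatting preserves is that transformation rules with a `conditions` clause each run as their own scoped UPDATE, while unconditional rules are batched into a single UPDATE per table. A minimal standalone sketch of that pattern, under the assumption that `run_query` and `trans_items` are illustrative stand-ins (they are not names from the patch):

def apply_transformations_sketch(run_query, full_qual_table_name, trans_items):
    # Conditional rules must be applied one by one, scoped by their WHERE clause.
    batched = []
    for item in trans_items:
        if item['conditions']:
            run_query(
                f'UPDATE {full_qual_table_name} '
                f'SET {item["trans"]} WHERE {item["conditions"]};'
            )
        else:
            # Unconditional rules are collected and applied in one pass.
            batched.append(item['trans'])
    if batched:
        run_query(f'UPDATE {full_qual_table_name} SET {", ".join(batched)};')

Grouping the unconditional rules this way keeps the obfuscation step to a single full-table UPDATE, which is presumably why the patch keeps accumulating them into `all_cols_update_sql` rather than issuing one statement per column.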
@@ -203,8 +264,10 @@ def __apply_transformations(self, transformations, target_schema, table_name): # If we have conditions, then we need to construct the query and execute it to transform the # single column conditionally if trans_item['conditions']: - sql = f'UPDATE {full_qual_table_name} ' \ - f'SET {trans_item["trans"]} WHERE {trans_item["conditions"]};' + sql = ( + f'UPDATE {full_qual_table_name} ' + f'SET {trans_item["trans"]} WHERE {trans_item["conditions"]};' + ) self.query(sql) @@ -216,10 +279,14 @@ def __apply_transformations(self, transformations, target_schema, table_name): if not all_cols_update_sql: all_cols_update_sql = trans_item['trans'] else: - all_cols_update_sql = f'{all_cols_update_sql}, {trans_item["trans"]}' + all_cols_update_sql = ( + f'{all_cols_update_sql}, {trans_item["trans"]}' + ) # If we have some non-conditional transformations then construct and execute a query if all_cols_update_sql: - all_cols_update_sql = f'UPDATE {full_qual_table_name} SET {all_cols_update_sql};' + all_cols_update_sql = ( + f'UPDATE {full_qual_table_name} SET {all_cols_update_sql};' + ) self.query(all_cols_update_sql) diff --git a/pipelinewise/fastsync/commons/target_redshift.py b/pipelinewise/fastsync/commons/target_redshift.py index ff71fb58a..f4190a41b 100644 --- a/pipelinewise/fastsync/commons/target_redshift.py +++ b/pipelinewise/fastsync/commons/target_redshift.py @@ -28,11 +28,18 @@ def __init__(self, connection_config, transformation_config=None): self.transformation_config = transformation_config # Get the required parameters from config file and/or environment variables - aws_profile = self.connection_config.get('aws_profile') or os.environ.get('AWS_PROFILE') - aws_access_key_id = self.connection_config.get('aws_access_key_id') or os.environ.get('AWS_ACCESS_KEY_ID') - aws_secret_access_key = self.connection_config.get('aws_secret_access_key') or \ - os.environ.get('AWS_SECRET_ACCESS_KEY') - aws_session_token = self.connection_config.get('aws_session_token') or os.environ.get('AWS_SESSION_TOKEN') + aws_profile = self.connection_config.get('aws_profile') or os.environ.get( + 'AWS_PROFILE' + ) + aws_access_key_id = self.connection_config.get( + 'aws_access_key_id' + ) or os.environ.get('AWS_ACCESS_KEY_ID') + aws_secret_access_key = self.connection_config.get( + 'aws_secret_access_key' + ) or os.environ.get('AWS_SECRET_ACCESS_KEY') + aws_session_token = self.connection_config.get( + 'aws_session_token' + ) or os.environ.get('AWS_SESSION_TOKEN') # Init S3 client # Conditionally pass keys as this seems to affect whether instance credentials @@ -41,7 +48,7 @@ def __init__(self, connection_config, transformation_config=None): aws_session = boto3.session.Session( aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key, - aws_session_token=aws_session_token + aws_session_token=aws_session_token, ) credentials = aws_session.get_credentials().get_frozen_credentials() @@ -60,7 +67,7 @@ def open_connection(self): self.connection_config['dbname'], self.connection_config['user'], self.connection_config['password'], - self.connection_config['port'] + self.connection_config['port'], ) return psycopg2.connect(conn_string) @@ -84,7 +91,12 @@ def upload_to_s3(self, file): extra_args = {'ACL': s3_acl} if s3_acl else None - LOGGER.info('Uploading to S3 bucket: %s, local file: %s, S3 key: %s', bucket, file, s3_key) + LOGGER.info( + 'Uploading to S3 bucket: %s, local file: %s, S3 key: %s', + bucket, + file, + s3_key, + ) self.s3.upload_file(file, bucket, s3_key, 
ExtraArgs=extra_args) @@ -101,25 +113,48 @@ def create_schemas(self, tables): def drop_table(self, target_schema, table_name, is_temporary=False): table_dict = utils.tablename_to_dict(table_name) - target_table = table_dict.get('table_name') if not is_temporary else table_dict.get('temp_table_name') + target_table = ( + table_dict.get('table_name') + if not is_temporary + else table_dict.get('temp_table_name') + ) sql = 'DROP TABLE IF EXISTS {}."{}"'.format(target_schema, target_table.upper()) self.query(sql) - def create_table(self, target_schema: str, table_name: str, columns: List[str], primary_key: List[str], - is_temporary: bool = False, sort_columns=False): + def create_table( + self, + target_schema: str, + table_name: str, + columns: List[str], + primary_key: List[str], + is_temporary: bool = False, + sort_columns=False, + ): table_dict = utils.tablename_to_dict(table_name) - target_table = table_dict.get('table_name') if not is_temporary else table_dict.get('temp_table_name') + target_table = ( + table_dict.get('table_name') + if not is_temporary + else table_dict.get('temp_table_name') + ) # skip the EXTRACTED, BATCHED and DELETED columns in case they exist because they gonna be added later - columns = [c for c in columns if not (c.startswith(self.EXTRACTED_AT_COLUMN) or - c.startswith(self.BATCHED_AT_COLUMN) or - c.startswith(self.DELETED_AT_COLUMN))] + columns = [ + c + for c in columns + if not ( + c.startswith(self.EXTRACTED_AT_COLUMN) + or c.startswith(self.BATCHED_AT_COLUMN) + or c.startswith(self.DELETED_AT_COLUMN) + ) + ] - columns += [f'{self.EXTRACTED_AT_COLUMN} TIMESTAMP WITHOUT TIME ZONE', - f'{self.BATCHED_AT_COLUMN} TIMESTAMP WITHOUT TIME ZONE', - f'{self.DELETED_AT_COLUMN} CHARACTER VARYING'] + columns += [ + f'{self.EXTRACTED_AT_COLUMN} TIMESTAMP WITHOUT TIME ZONE', + f'{self.BATCHED_AT_COLUMN} TIMESTAMP WITHOUT TIME ZONE', + f'{self.DELETED_AT_COLUMN} CHARACTER VARYING', + ] # We need the sort the columns for some taps( for now tap-s3-csv) # because later on when copying a csv file into Snowflake @@ -129,80 +164,121 @@ def create_table(self, target_schema: str, table_name: str, columns: List[str], sql_columns = ','.join(columns) sql_primary_keys = ','.join(primary_key) if primary_key else None - sql = f'CREATE TABLE IF NOT EXISTS {target_schema}."{target_table.upper()}" (' \ - f'{sql_columns}' \ - f'{f", PRIMARY KEY ({sql_primary_keys}))" if primary_key else ")"}' + sql = ( + f'CREATE TABLE IF NOT EXISTS {target_schema}."{target_table.upper()}" (' + f'{sql_columns}' + f'{f", PRIMARY KEY ({sql_primary_keys}))" if primary_key else ")"}' + ) self.query(sql) - def copy_to_table(self, s3_key, target_schema, table_name, size_bytes, is_temporary, skip_csv_header=False): + def copy_to_table( + self, + s3_key, + target_schema, + table_name, + size_bytes, + is_temporary, + skip_csv_header=False, + ): LOGGER.info('Loading %s into Redshift...', s3_key) table_dict = utils.tablename_to_dict(table_name) - target_table = table_dict.get('table_name') if not is_temporary else table_dict.get('temp_table_name') + target_table = ( + table_dict.get('table_name') + if not is_temporary + else table_dict.get('temp_table_name') + ) inserts = 0 bucket = self.connection_config['s3_bucket'] # Step 1: Generate copy credentials - prefer role if provided, otherwise use access and secret keys - copy_credentials = """ + copy_credentials = ( + """ iam_role '{aws_role_arn}' - """.format(aws_role_arn=self.connection_config['aws_redshift_copy_role_arn']) \ - if 
self.connection_config.get('aws_redshift_copy_role_arn') else """ + """.format( + aws_role_arn=self.connection_config['aws_redshift_copy_role_arn'] + ) + if self.connection_config.get('aws_redshift_copy_role_arn') + else """ ACCESS_KEY_ID '{aws_access_key_id}' SECRET_ACCESS_KEY '{aws_secret_access_key}' {aws_session_token} """.format( - aws_access_key_id=self.connection_config['aws_access_key_id'], - aws_secret_access_key=self.connection_config['aws_secret_access_key'], - aws_session_token="SESSION_TOKEN '{}'".format(self.connection_config['aws_session_token']) \ - if self.connection_config.get('aws_session_token') else '', + aws_access_key_id=self.connection_config['aws_access_key_id'], + aws_secret_access_key=self.connection_config['aws_secret_access_key'], + aws_session_token="SESSION_TOKEN '{}'".format( + self.connection_config['aws_session_token'] + ) + if self.connection_config.get('aws_session_token') + else '', + ) ) # Step 2: Generate copy options - Override defaults from config.json if defined - copy_options = self.connection_config.get('copy_options', f""" + copy_options = self.connection_config.get( + 'copy_options', + f""" EMPTYASNULL BLANKSASNULL TRIMBLANKS TRUNCATECOLUMNS IGNOREHEADER {int(skip_csv_header)} TIMEFORMAT 'auto' - """) + """, + ) # Step3: Using the built-in CSV COPY option to load - copy_sql = f'COPY {target_schema}."{target_table.upper()}" FROM \'s3://{bucket}/{s3_key}\'' \ - f'{copy_credentials}' \ - f'{copy_options}' \ - f'CSV GZIP' + copy_sql = ( + f'COPY {target_schema}."{target_table.upper()}" FROM \'s3://{bucket}/{s3_key}\'' + f'{copy_credentials}' + f'{copy_options}' + f'CSV GZIP' + ) # Get number of inserted records - COPY does insert only results = self.query(copy_sql) if len(results) > 0: inserts = results[0].get('rows_loaded', 0) - LOGGER.info('Loading into %s."%s": %s', - target_schema, - target_table.upper(), - json.dumps({'inserts': inserts, 'updates': 0, 'size_bytes': size_bytes})) + LOGGER.info( + 'Loading into %s."%s": %s', + target_schema, + target_table.upper(), + json.dumps({'inserts': inserts, 'updates': 0, 'size_bytes': size_bytes}), + ) LOGGER.info('Deleting %s from S3...', s3_key) self.s3.delete_object(Bucket=bucket, Key=s3_key) - def grant_select_on_table(self, target_schema, table_name, grantee, is_temporary, to_group=False): + def grant_select_on_table( + self, target_schema, table_name, grantee, is_temporary, to_group=False + ): # Grant role is not mandatory parameter, do nothing if not specified if grantee: table_dict = utils.tablename_to_dict(table_name) - target_table = table_dict.get('table_name') if not is_temporary else table_dict.get('temp_table_name') - sql = 'GRANT SELECT ON {}."{}" TO {} {}'.format(target_schema, - target_table.upper(), 'GROUP' if to_group else '', - grantee) + target_table = ( + table_dict.get('table_name') + if not is_temporary + else table_dict.get('temp_table_name') + ) + sql = 'GRANT SELECT ON {}."{}" TO {} {}'.format( + target_schema, + target_table.upper(), + 'GROUP' if to_group else '', + grantee, + ) self.query(sql) def grant_usage_on_schema(self, target_schema, grantee, to_group=False): # Grant role is not mandatory parameter, do nothing if not specified if grantee: - sql = 'GRANT USAGE ON SCHEMA {} TO {} {}'.format(target_schema, 'GROUP' if to_group else '', grantee) + sql = 'GRANT USAGE ON SCHEMA {} TO {} {}'.format( + target_schema, 'GROUP' if to_group else '', grantee + ) self.query(sql) def grant_select_on_schema(self, target_schema, grantee, to_group=False): # Grant role is not mandatory 
parameter, do nothing if not specified if grantee: - sql = 'GRANT SELECT ON ALL TABLES IN SCHEMA {} TO {} {}'.format(target_schema, 'GROUP' if to_group else '', - grantee) + sql = 'GRANT SELECT ON ALL TABLES IN SCHEMA {} TO {} {}'.format( + target_schema, 'GROUP' if to_group else '', grantee + ) self.query(sql) # pylint: disable=duplicate-string-formatting-argument @@ -220,7 +296,9 @@ def obfuscate_columns(self, target_schema, table_name): # # We need to convert to the same format to find the transformation # has that has to be applied - tap_stream_name_by_table_name = '{}-{}'.format(table_dict.get('schema_name'), table_dict.get('table_name')) + tap_stream_name_by_table_name = '{}-{}'.format( + table_dict.get('schema_name'), table_dict.get('table_name') + ) if trans.get('tap_stream_name') == tap_stream_name_by_table_name: column = trans.get('field_id') transform_type = trans.get('type') @@ -230,10 +308,17 @@ def obfuscate_columns(self, target_schema, table_name): trans_cols.append('"{}" = FUNC_SHA1("{}")'.format(column, column)) elif 'HASH-SKIP-FIRST' in transform_type: skip_first_n = transform_type[-1] - trans_cols.append('"{}" = CONCAT(SUBSTRING("{}", 1, {}), FUNC_SHA1(SUBSTRING("{}", {} + 1)))' - .format(column, column, skip_first_n, column, skip_first_n)) + trans_cols.append( + '"{}" = CONCAT(SUBSTRING("{}", 1, {}), FUNC_SHA1(SUBSTRING("{}", {} + 1)))'.format( + column, column, skip_first_n, column, skip_first_n + ) + ) elif transform_type == 'MASK-DATE': - trans_cols.append('"{}" = TO_CHAR("{}"::DATE, \'YYYY-01-01\')::DATE'.format(column, column)) + trans_cols.append( + '"{}" = TO_CHAR("{}"::DATE, \'YYYY-01-01\')::DATE'.format( + column, column + ) + ) elif transform_type == 'MASK-NUMBER': trans_cols.append('"{}" = 0'.format(column)) @@ -252,4 +337,8 @@ def swap_tables(self, schema, table_name): # Swap tables and drop the temp tamp self.query('DROP TABLE IF EXISTS {}."{}"'.format(schema, target_table.upper())) - self.query('ALTER TABLE {}."{}" RENAME TO "{}"'.format(schema, temp_table.upper(), target_table.upper())) + self.query( + 'ALTER TABLE {}."{}" RENAME TO "{}"'.format( + schema, temp_table.upper(), target_table.upper() + ) + ) diff --git a/pipelinewise/fastsync/commons/target_snowflake.py b/pipelinewise/fastsync/commons/target_snowflake.py index f5ee65d3c..d1f495087 100644 --- a/pipelinewise/fastsync/commons/target_snowflake.py +++ b/pipelinewise/fastsync/commons/target_snowflake.py @@ -6,8 +6,7 @@ from typing import List, Dict from snowflake.connector.encryption_util import SnowflakeEncryptionUtil -from snowflake.connector.remote_storage_util import \ - SnowflakeFileEncryptionMaterial +from snowflake.connector.remote_storage_util import SnowflakeFileEncryptionMaterial from . 
import utils from .transform_utils import TransformationHelper, SQLFlavor @@ -30,27 +29,36 @@ def __init__(self, connection_config, transformation_config=None): self.transformation_config = transformation_config # Get the required parameters from config file and/or environment variables - aws_profile = self.connection_config.get('aws_profile') or os.environ.get('AWS_PROFILE') - aws_access_key_id = self.connection_config.get('aws_access_key_id') or os.environ.get('AWS_ACCESS_KEY_ID') - aws_secret_access_key = self.connection_config.get('aws_secret_access_key') or \ - os.environ.get('AWS_SECRET_ACCESS_KEY') - aws_session_token = self.connection_config.get('aws_session_token') or os.environ.get('AWS_SESSION_TOKEN') + aws_profile = self.connection_config.get('aws_profile') or os.environ.get( + 'AWS_PROFILE' + ) + aws_access_key_id = self.connection_config.get( + 'aws_access_key_id' + ) or os.environ.get('AWS_ACCESS_KEY_ID') + aws_secret_access_key = self.connection_config.get( + 'aws_secret_access_key' + ) or os.environ.get('AWS_SECRET_ACCESS_KEY') + aws_session_token = self.connection_config.get( + 'aws_session_token' + ) or os.environ.get('AWS_SESSION_TOKEN') # AWS credentials based authentication if aws_access_key_id and aws_secret_access_key: aws_session = boto3.session.Session( aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key, - aws_session_token=aws_session_token + aws_session_token=aws_session_token, ) # AWS Profile based authentication else: aws_session = boto3.session.Session(profile_name=aws_profile) # Create the s3 client - self.s3 = aws_session.client('s3', - region_name=self.connection_config.get('s3_region_name'), - endpoint_url=self.connection_config.get('s3_endpoint_url')) + self.s3 = aws_session.client( + 's3', + region_name=self.connection_config.get('s3_region_name'), + endpoint_url=self.connection_config.get('s3_endpoint_url'), + ) def create_query_tag(self, query_tag_props: dict = None) -> str: schema = None @@ -60,24 +68,30 @@ def create_query_tag(self, query_tag_props: dict = None) -> str: schema = query_tag_props.get('schema') table = query_tag_props.get('table') - return json.dumps({'ppw_component': 'fastsync', - 'tap_id': self.connection_config.get('tap_id'), - 'database': self.connection_config['dbname'], - 'schema': schema, - 'table': table}) + return json.dumps( + { + 'ppw_component': 'fastsync', + 'tap_id': self.connection_config.get('tap_id'), + 'database': self.connection_config['dbname'], + 'schema': schema, + 'table': table, + } + ) def open_connection(self, query_tag_props=None): - return snowflake.connector.connect(user=self.connection_config['user'], - password=self.connection_config['password'], - account=self.connection_config['account'], - database=self.connection_config['dbname'], - warehouse=self.connection_config['warehouse'], - autocommit=True, - session_parameters={ - # Quoted identifiers should be case sensitive - 'QUOTED_IDENTIFIERS_IGNORE_CASE': 'FALSE', - 'QUERY_TAG': self.create_query_tag(query_tag_props) - }) + return snowflake.connector.connect( + user=self.connection_config['user'], + password=self.connection_config['password'], + account=self.connection_config['account'], + database=self.connection_config['dbname'], + warehouse=self.connection_config['warehouse'], + autocommit=True, + session_parameters={ + # Quoted identifiers should be case sensitive + 'QUOTED_IDENTIFIERS_IGNORE_CASE': 'FALSE', + 'QUERY_TAG': self.create_query_tag(query_tag_props), + }, + ) def query(self, query, params=None, 
query_tag_props=None): LOGGER.debug('Running query: %s', query) @@ -96,7 +110,12 @@ def upload_to_s3(self, file, tmp_dir=None): s3_key_prefix = self.connection_config.get('s3_key_prefix', '') s3_key = '{}{}'.format(s3_key_prefix, os.path.basename(file)) - LOGGER.info('Uploading to S3 bucket: %s, local file: %s, S3 key: %s', bucket, file, s3_key) + LOGGER.info( + 'Uploading to S3 bucket: %s, local file: %s, S3 key: %s', + bucket, + file, + s3_key, + ) # Encrypt csv if client side encryption enabled master_key = self.connection_config.get('client_side_encryption_master_key', '') @@ -104,23 +123,19 @@ def upload_to_s3(self, file, tmp_dir=None): # Encrypt the file LOGGER.info('Encrypting file %s...', file) encryption_material = SnowflakeFileEncryptionMaterial( - query_stage_master_key=master_key, - query_id='', - smk_id=0 + query_stage_master_key=master_key, query_id='', smk_id=0 ) encryption_metadata, encrypted_file = SnowflakeEncryptionUtil.encrypt_file( - encryption_material, - file, - tmp_dir=tmp_dir + encryption_material, file, tmp_dir=tmp_dir ) # Upload to s3 - extra_args = {'ACL': s3_acl} if s3_acl else dict() + extra_args = {'ACL': s3_acl} if s3_acl else {} # Send key and iv in the metadata, that will be required to decrypt and upload the encrypted file extra_args['Metadata'] = { 'x-amz-key': encryption_metadata.key, - 'x-amz-iv': encryption_metadata.iv + 'x-amz-iv': encryption_metadata.iv, } self.s3.upload_file(encrypted_file, bucket, s3_key, ExtraArgs=extra_args) @@ -144,28 +159,43 @@ def copy_to_archive(self, source_s3_key, tap_id, table): archive_file_basename = os.path.basename(source_s3_key) # Get archive s3 prefix from config, defaulting to 'archive' if not specified - archive_s3_prefix = self.connection_config.get('archive_load_files_s3_prefix', 'archive') + archive_s3_prefix = self.connection_config.get( + 'archive_load_files_s3_prefix', 'archive' + ) source_s3_bucket = self.connection_config.get('s3_bucket') # Combine existing metadata with archive related headers - metadata = self.s3.head_object(Bucket=source_s3_bucket, Key=source_s3_key).get('Metadata', {}) - metadata.update({ - 'tap': tap_id, - 'schema': archive_schema, - 'table': archive_table, - 'archived-by': 'pipelinewise_fastsync_postgres_to_snowflake' - }) + metadata = self.s3.head_object(Bucket=source_s3_bucket, Key=source_s3_key).get( + 'Metadata', {} + ) + metadata.update( + { + 'tap': tap_id, + 'schema': archive_schema, + 'table': archive_table, + 'archived-by': 'pipelinewise_fastsync_postgres_to_snowflake', + } + ) # Get archive s3 bucket from config, defaulting to same bucket used for Snowflake imports if not specified - archive_s3_bucket = self.connection_config.get('archive_load_files_s3_bucket', source_s3_bucket) + archive_s3_bucket = self.connection_config.get( + 'archive_load_files_s3_bucket', source_s3_bucket + ) - archive_key = '{}/{}/{}/{}'.format(archive_s3_prefix, tap_id, archive_table, archive_file_basename) + archive_key = '{}/{}/{}/{}'.format( + archive_s3_prefix, tap_id, archive_table, archive_file_basename + ) copy_source = '{}/{}'.format(source_s3_bucket, source_s3_key) LOGGER.info('Archiving %s to %s', copy_source, archive_key) - self.s3.copy_object(CopySource=copy_source, Bucket=archive_s3_bucket, Key=archive_key, - Metadata=metadata, MetadataDirective='REPLACE') + self.s3.copy_object( + CopySource=copy_source, + Bucket=archive_s3_bucket, + Key=archive_key, + Metadata=metadata, + MetadataDirective='REPLACE', + ) def create_schema(self, schema): sql = 'CREATE SCHEMA IF NOT EXISTS 
{}'.format(schema) @@ -173,25 +203,48 @@ def create_schema(self, schema): def drop_table(self, target_schema, table_name, is_temporary=False): table_dict = utils.tablename_to_dict(table_name) - target_table = table_dict.get('table_name') if not is_temporary else table_dict.get('temp_table_name') + target_table = ( + table_dict.get('table_name') + if not is_temporary + else table_dict.get('temp_table_name') + ) sql = 'DROP TABLE IF EXISTS {}."{}"'.format(target_schema, target_table.upper()) self.query(sql, query_tag_props={'schema': target_schema, 'table': table_name}) - def create_table(self, target_schema: str, table_name: str, columns: List[str], primary_key: List[str], - is_temporary: bool = False, sort_columns=False): + def create_table( + self, + target_schema: str, + table_name: str, + columns: List[str], + primary_key: List[str], + is_temporary: bool = False, + sort_columns=False, + ): table_dict = utils.tablename_to_dict(table_name) - target_table = table_dict.get('table_name') if not is_temporary else table_dict.get('temp_table_name') + target_table = ( + table_dict.get('table_name') + if not is_temporary + else table_dict.get('temp_table_name') + ) # skip the EXTRACTED, BATCHED and DELETED columns in case they exist because they gonna be added later - columns = [c for c in columns if not (c.startswith(utils.SDC_EXTRACTED_AT) or - c.startswith(utils.SDC_BATCHED_AT) or - c.startswith(utils.SDC_DELETED_AT))] + columns = [ + c + for c in columns + if not ( + c.startswith(utils.SDC_EXTRACTED_AT) + or c.startswith(utils.SDC_BATCHED_AT) + or c.startswith(utils.SDC_DELETED_AT) + ) + ] - columns += [f'{utils.SDC_EXTRACTED_AT} TIMESTAMP_NTZ', - f'{utils.SDC_BATCHED_AT} TIMESTAMP_NTZ', - f'{utils.SDC_DELETED_AT} VARCHAR'] + columns += [ + f'{utils.SDC_EXTRACTED_AT} TIMESTAMP_NTZ', + f'{utils.SDC_BATCHED_AT} TIMESTAMP_NTZ', + f'{utils.SDC_DELETED_AT} VARCHAR', + ] # We need the sort the columns for some taps( for now tap-s3-csv) # because later on when copying a csv file into Snowflake @@ -201,49 +254,85 @@ def create_table(self, target_schema: str, table_name: str, columns: List[str], sql_columns = ','.join(columns) sql_primary_keys = ','.join(primary_key) if primary_key else None - sql = f'CREATE OR REPLACE TABLE {target_schema}."{target_table.upper()}" (' \ - f'{sql_columns}' \ - f'{f", PRIMARY KEY ({sql_primary_keys}))" if primary_key else ")"}' + sql = ( + f'CREATE OR REPLACE TABLE {target_schema}."{target_table.upper()}" (' + f'{sql_columns}' + f'{f", PRIMARY KEY ({sql_primary_keys}))" if primary_key else ")"}' + ) - self.query(sql, query_tag_props={'schema': target_schema, 'table': target_table}) + self.query( + sql, query_tag_props={'schema': target_schema, 'table': target_table} + ) # pylint: disable=too-many-locals - def copy_to_table(self, s3_key, target_schema, table_name, size_bytes, is_temporary, skip_csv_header=False): + def copy_to_table( + self, + s3_key, + target_schema, + table_name, + size_bytes, + is_temporary, + skip_csv_header=False, + ): LOGGER.info('Loading %s into Snowflake...', s3_key) table_dict = utils.tablename_to_dict(table_name) - target_table = table_dict.get('table_name') if not is_temporary else table_dict.get('temp_table_name') + target_table = ( + table_dict.get('table_name') + if not is_temporary + else table_dict.get('temp_table_name') + ) inserts = 0 stage = self.connection_config['stage'] - sql = f'COPY INTO {target_schema}."{target_table.upper()}" FROM \'@{stage}/{s3_key}\'' \ - f' FILE_FORMAT = (type=CSV escape=\'\\x1e\' 
escape_unenclosed_field=\'\\x1e\'' \ - f' field_optionally_enclosed_by=\'\"\' skip_header={int(skip_csv_header)}' \ - f' compression=GZIP binary_format=HEX)' + sql = ( + f'COPY INTO {target_schema}."{target_table.upper()}" FROM \'@{stage}/{s3_key}\'' + f' FILE_FORMAT = (type=CSV escape=\'\\x1e\' escape_unenclosed_field=\'\\x1e\'' + f' field_optionally_enclosed_by=\'\"\' skip_header={int(skip_csv_header)}' + f' compression=GZIP binary_format=HEX)' + ) # Get number of inserted records - COPY does insert only - results = self.query(sql, query_tag_props={'schema': target_schema, 'table': target_table}) + results = self.query( + sql, query_tag_props={'schema': target_schema, 'table': target_table} + ) if len(results) > 0: inserts = sum([file_part.get('rows_loaded', 0) for file_part in results]) - LOGGER.info('Loading into %s."%s": %s', - target_schema, - target_table.upper(), - json.dumps({'inserts': inserts, - 'updates': 0, - 'file_parts': len(results), - 'size_bytes': size_bytes})) + LOGGER.info( + 'Loading into %s."%s": %s', + target_schema, + target_table.upper(), + json.dumps( + { + 'inserts': inserts, + 'updates': 0, + 'file_parts': len(results), + 'size_bytes': size_bytes, + } + ), + ) # grant_... functions are common functions called by utils.py: grant_privilege function # "to_group" is not used here but exists for compatibility reasons with other database types # "to_group" is for databases that can grant to users and groups separately like Amazon Redshift # pylint: disable=unused-argument - def grant_select_on_table(self, target_schema, table_name, role, is_temporary, to_group=False): + def grant_select_on_table( + self, target_schema, table_name, role, is_temporary, to_group=False + ): # Grant role is not mandatory parameter, do nothing if not specified if role: table_dict = utils.tablename_to_dict(table_name) - target_table = table_dict.get('table_name') if not is_temporary else table_dict.get('temp_table_name') - sql = 'GRANT SELECT ON {}."{}" TO ROLE {}'.format(target_schema, target_table.upper(), role) - self.query(sql, query_tag_props={'schema': target_schema, 'table': table_name}) + target_table = ( + table_dict.get('table_name') + if not is_temporary + else table_dict.get('temp_table_name') + ) + sql = 'GRANT SELECT ON {}."{}" TO ROLE {}'.format( + target_schema, target_table.upper(), role + ) + self.query( + sql, query_tag_props={'schema': target_schema, 'table': table_name} + ) # pylint: disable=unused-argument def grant_usage_on_schema(self, target_schema, role, to_group=False): @@ -256,7 +345,9 @@ def grant_usage_on_schema(self, target_schema, role, to_group=False): def grant_select_on_schema(self, target_schema, role, to_group=False): # Grant role is not mandatory parameter, do nothing if not specified if role: - sql = 'GRANT SELECT ON ALL TABLES IN SCHEMA {} TO ROLE {}'.format(target_schema, role) + sql = 'GRANT SELECT ON ALL TABLES IN SCHEMA {} TO ROLE {}'.format( + target_schema, role + ) self.query(sql, query_tag_props={'schema': target_schema}) def obfuscate_columns(self, target_schema: str, table_name: str): @@ -277,15 +368,17 @@ def obfuscate_columns(self, target_schema: str, table_name: str): # # We need to convert to the same format to find the transformation # has that has to be applied - tap_stream_name_by_table_name = '{}-{}'.format(table_dict['schema_name'], table_dict['table_name']) \ - if table_dict['schema_name'] is not None else table_dict['table_name'] + tap_stream_name_by_table_name = ( + '{}-{}'.format(table_dict['schema_name'], 
table_dict['table_name']) + if table_dict['schema_name'] is not None + else table_dict['table_name'] + ) # Find obfuscation rules for the current table # trans_map = self.__get_stream_transformation_map(tap_stream_name_by_table_name, transformations) trans_map = TransformationHelper.get_trans_in_sql_flavor( - tap_stream_name_by_table_name, - transformations, - SQLFlavor('snowflake')) + tap_stream_name_by_table_name, transformations, SQLFlavor('snowflake') + ) self.__apply_transformations(trans_map, target_schema, temp_table) @@ -304,13 +397,19 @@ def swap_tables(self, schema, table_name) -> None: temp_table = table_dict.get('temp_table_name') # Swap tables and drop the temp tamp - self.query(f'ALTER TABLE {schema}."{temp_table.upper()}" SWAP WITH {schema}."{target_table.upper()}"', - query_tag_props={'schema': schema, 'table': target_table}) - - self.query(f'DROP TABLE IF EXISTS {schema}."{temp_table.upper()}"', - query_tag_props={'schema': schema, 'table': temp_table}) - - def __apply_transformations(self, transformations: List[Dict], target_schema: str, table_name: str) -> None: + self.query( + f'ALTER TABLE {schema}."{temp_table.upper()}" SWAP WITH {schema}."{target_table.upper()}"', + query_tag_props={'schema': schema, 'table': target_table}, + ) + + self.query( + f'DROP TABLE IF EXISTS {schema}."{temp_table.upper()}"', + query_tag_props={'schema': schema, 'table': temp_table}, + ) + + def __apply_transformations( + self, transformations: List[Dict], target_schema: str, table_name: str + ) -> None: """ Generate and execute the SQL queries based on the given transformations. Args: @@ -330,10 +429,15 @@ def __apply_transformations(self, transformations: List[Dict], target_schema: st # If we have conditions, then we need to construct the query and execute it to transform the # single column conditionally if trans_item['conditions']: - sql = f'UPDATE {full_qual_table_name} ' \ - f'SET {trans_item["trans"]} WHERE {trans_item["conditions"]};' + sql = ( + f'UPDATE {full_qual_table_name} ' + f'SET {trans_item["trans"]} WHERE {trans_item["conditions"]};' + ) - self.query(sql, query_tag_props={'schema': target_schema, 'table': table_name}) + self.query( + sql, + query_tag_props={'schema': target_schema, 'table': table_name}, + ) # Otherwise, we can add this column to a general UPDATE query with no predicates else: @@ -343,10 +447,17 @@ def __apply_transformations(self, transformations: List[Dict], target_schema: st if not all_cols_update_sql: all_cols_update_sql = trans_item['trans'] else: - all_cols_update_sql = f'{all_cols_update_sql}, {trans_item["trans"]}' + all_cols_update_sql = ( + f'{all_cols_update_sql}, {trans_item["trans"]}' + ) # If we have some non-conditional transformations then construct and execute a query if all_cols_update_sql: - all_cols_update_sql = f'UPDATE {full_qual_table_name} SET {all_cols_update_sql};' - - self.query(all_cols_update_sql, query_tag_props={'schema': target_schema, 'table': table_name}) + all_cols_update_sql = ( + f'UPDATE {full_qual_table_name} SET {all_cols_update_sql};' + ) + + self.query( + all_cols_update_sql, + query_tag_props={'schema': target_schema, 'table': table_name}, + ) diff --git a/pipelinewise/fastsync/commons/transform_utils.py b/pipelinewise/fastsync/commons/transform_utils.py index 164af4b35..d0cbd1950 100644 --- a/pipelinewise/fastsync/commons/transform_utils.py +++ b/pipelinewise/fastsync/commons/transform_utils.py @@ -7,6 +7,7 @@ class TransformationType(Enum): """ List of supported transformation types """ + SET_NULL = 'SET-NULL' 
MASK_HIDDEN = 'MASK-HIDDEN' MASK_DATE = 'MASK-DATE' @@ -21,6 +22,15 @@ class TransformationType(Enum): HASH_SKIP_FIRST_7 = 'HASH-SKIP-FIRST-7' HASH_SKIP_FIRST_8 = 'HASH-SKIP-FIRST-8' HASH_SKIP_FIRST_9 = 'HASH-SKIP-FIRST-9' + MASK_STRING_SKIP_ENDS_1 = 'MASK-STRING-SKIP-ENDS-1' + MASK_STRING_SKIP_ENDS_2 = 'MASK-STRING-SKIP-ENDS-2' + MASK_STRING_SKIP_ENDS_3 = 'MASK-STRING-SKIP-ENDS-3' + MASK_STRING_SKIP_ENDS_4 = 'MASK-STRING-SKIP-ENDS-4' + MASK_STRING_SKIP_ENDS_5 = 'MASK-STRING-SKIP-ENDS-5' + MASK_STRING_SKIP_ENDS_6 = 'MASK-STRING-SKIP-ENDS-6' + MASK_STRING_SKIP_ENDS_7 = 'MASK-STRING-SKIP-ENDS-7' + MASK_STRING_SKIP_ENDS_8 = 'MASK-STRING-SKIP-ENDS-8' + MASK_STRING_SKIP_ENDS_9 = 'MASK-STRING-SKIP-ENDS-9' @unique @@ -28,6 +38,7 @@ class SQLFlavor(Enum): """ List of supported sql flavors """ + SNOWFLAKE = 'snowflake' POSTGRES = 'postgres' BIGQUERY = 'bigquery' @@ -41,10 +52,8 @@ class TransformationHelper: @classmethod def get_trans_in_sql_flavor( - cls, - stream_name: str, - transformations: List[Dict], - sql_flavor: SQLFlavor) -> List[Dict]: + cls, stream_name: str, transformations: List[Dict], sql_flavor: SQLFlavor + ) -> List[Dict]: """ Find the transformations to apply to the given stream and does proper formatting and mapping @@ -78,51 +87,67 @@ def get_trans_in_sql_flavor( conditions = cls.__conditions_to_sql(transform_conditions, sql_flavor) if transform_type == TransformationType.SET_NULL: - trans_map.append({ - 'trans': f'{column} = NULL', - 'conditions': conditions - }) + trans_map.append( + {'trans': f'{column} = NULL', 'conditions': conditions} + ) elif transform_type == TransformationType.HASH: - trans_map.append({ - 'trans': cls.__hash_to_sql(column, sql_flavor), - 'conditions': conditions - }) + trans_map.append( + { + 'trans': cls.__hash_to_sql(column, sql_flavor), + 'conditions': conditions, + } + ) elif transform_type.value.startswith('HASH-SKIP-FIRST-'): - trans_map.append({ - 'trans': cls.__hash_skip_first_to_sql(transform_type, column, sql_flavor), - 'conditions': conditions - }) + trans_map.append( + { + 'trans': cls.__hash_skip_first_to_sql( + transform_type, column, sql_flavor + ), + 'conditions': conditions, + } + ) elif transform_type == TransformationType.MASK_DATE: - trans_map.append({ - 'trans': cls.__mask_date_to_sql(column, sql_flavor), - 'conditions': conditions - }) + trans_map.append( + { + 'trans': cls.__mask_date_to_sql(column, sql_flavor), + 'conditions': conditions, + } + ) elif transform_type == TransformationType.MASK_NUMBER: - trans_map.append({ - 'trans': f'{column} = 0', - 'conditions': conditions - }) + trans_map.append( + {'trans': f'{column} = 0', 'conditions': conditions} + ) + + elif transform_type.value.startswith('MASK-STRING-SKIP-ENDS-'): + + trans_map.append( + { + 'trans': cls.__mask_string_skip_ends_to_sql( + transform_type, column, sql_flavor + ), + 'conditions': conditions, + } + ) elif transform_type == TransformationType.MASK_HIDDEN: - trans_map.append({ - 'trans': f"{column} = 'hidden'", - 'conditions': conditions - }) + trans_map.append( + {'trans': f"{column} = 'hidden'", 'conditions': conditions} + ) return trans_map @classmethod + # pylint: disable=W0238 # False positive when it is used by another classmethod def __conditions_to_sql( - cls, - transform_conditions: List[Dict], - sql_flavor: SQLFlavor) -> Optional[str]: + cls, transform_conditions: List[Dict], sql_flavor: SQLFlavor + ) -> Optional[str]: """ Convert the conditional transformations into equivalent form in SF SQL. 
Args: @@ -159,7 +184,11 @@ def __conditions_to_sql( else: operator = '=' - value = f"'{condition['equals']}'" if isinstance(condition['equals'], str) else condition['equals'] + value = ( + f"'{condition['equals']}'" + if isinstance(condition['equals'], str) + else condition['equals'] + ) elif 'regex_match' in condition: @@ -172,21 +201,28 @@ def __conditions_to_sql( operator = '~' elif sql_flavor == SQLFlavor.BIGQUERY: - conditions.append(f"REGEXP_CONTAINS({cls.__safe_column(condition['column'], sql_flavor)}, {value})") + conditions.append( + f"REGEXP_CONTAINS({cls.__safe_column(condition['column'], sql_flavor)}, {value})" + ) continue else: - raise NotImplementedError(f'regex_match conditional transformation in {sql_flavor.value} SQL ' - f'flavor not implemented!') + raise NotImplementedError( + f'regex_match conditional transformation in {sql_flavor.value} SQL ' + f'flavor not implemented!' + ) else: continue - conditions.append(f"({cls.__safe_column(condition['column'], sql_flavor)} {operator} {value})") + conditions.append( + f"({cls.__safe_column(condition['column'], sql_flavor)} {operator} {value})" + ) return ' AND '.join(conditions) @classmethod + # pylint: disable=W0238 # False positive when it is used by another classmethod def __safe_column(cls, col: str, sql_flavor: SQLFlavor): # Make the field id safe in case it's a reserved word if sql_flavor == SQLFlavor.SNOWFLAKE: @@ -204,6 +240,7 @@ def __safe_column(cls, col: str, sql_flavor: SQLFlavor): return column @classmethod + # pylint: disable=W0238 # False positive when it is used by another classmethod def __hash_to_sql(cls, column: str, sql_flavor: SQLFlavor) -> str: """ convert HASH transformation into the right sql string @@ -226,12 +263,16 @@ def __hash_to_sql(cls, column: str, sql_flavor: SQLFlavor) -> str: else: raise NotImplementedError( - f'HASH transformation in {sql_flavor.value} SQL flavor not implemented!') + f'HASH transformation in {sql_flavor.value} SQL flavor not implemented!' + ) return trans @classmethod - def __hash_skip_first_to_sql(cls, transform_type: TransformationType, column: str, sql_flavor: SQLFlavor) -> str: + # pylint: disable=W0238 # False positive when it is used by another classmethod + def __hash_skip_first_to_sql( + cls, transform_type: TransformationType, column: str, sql_flavor: SQLFlavor + ) -> str: """ convert HASH-SKIP-FIRST-n transformation into the right sql string Args: @@ -247,20 +288,27 @@ def __hash_skip_first_to_sql(cls, transform_type: TransformationType, column: st if sql_flavor == SQLFlavor.SNOWFLAKE: trans = '{0} = CONCAT(SUBSTRING({0}, 1, {1}), SHA2(SUBSTRING({0}, {1} + 1), 256))'.format( - column, skip_first_n) + column, skip_first_n + ) elif sql_flavor == SQLFlavor.POSTGRES: - trans = '{0} = CONCAT(SUBSTRING({0}, 1, {1}), ENCODE(DIGEST(SUBSTRING({0}, {1} + 1), ' \ - '\'sha256\'), \'hex\'))'.format(column, skip_first_n) + trans = ( + '{0} = CONCAT(SUBSTRING({0}, 1, {1}), ENCODE(DIGEST(SUBSTRING({0}, {1} + 1), ' + '\'sha256\'), \'hex\'))'.format(column, skip_first_n) + ) elif sql_flavor == SQLFlavor.BIGQUERY: trans = '{0} = CONCAT(SUBSTRING({0}, 1, {1}), TO_BASE64(SHA256(SUBSTRING({0}, {1} + 1))))'.format( - column, skip_first_n) + column, skip_first_n + ) else: - raise NotImplementedError(f'HASH-SKIP-FIRST-{skip_first_n} transformation in {sql_flavor.value} SQL flavor ' - f'not implemented!') + raise NotImplementedError( + f'HASH-SKIP-FIRST-{skip_first_n} transformation in {sql_flavor.value} SQL flavor ' + f'not implemented!' 
+ ) return trans @classmethod + # pylint: disable=W0238 # False positive when it is used by another classmethod def __mask_date_to_sql(cls, column: str, sql_flavor: SQLFlavor) -> str: """ convert MASK-DATE transformation into the right sql string @@ -273,24 +321,70 @@ def __mask_date_to_sql(cls, column: str, sql_flavor: SQLFlavor) -> str: Returns: sql string equivalent of the mask date """ if sql_flavor == SQLFlavor.SNOWFLAKE: - trans = f'{column} = TIMESTAMP_NTZ_FROM_PARTS(' \ - f'DATE_FROM_PARTS(YEAR({column}), 1, 1),' \ - f'TO_TIME({column}))' + trans = ( + f'{column} = TIMESTAMP_NTZ_FROM_PARTS(' + f'DATE_FROM_PARTS(YEAR({column}), 1, 1),' + f'TO_TIME({column}))' + ) elif sql_flavor == SQLFlavor.POSTGRES: - trans = '{0} = MAKE_TIMESTAMP(' \ - 'DATE_PART(\'year\', {0})::int, ' \ - '1, ' \ - '1, ' \ - 'DATE_PART(\'hour\', {0})::int, ' \ - 'DATE_PART(\'minute\', {0})::int, ' \ - 'DATE_PART(\'second\', {0})::double precision)'.format(column) + trans = ( + '{0} = MAKE_TIMESTAMP(' + 'DATE_PART(\'year\', {0})::int, ' + '1, ' + '1, ' + 'DATE_PART(\'hour\', {0})::int, ' + 'DATE_PART(\'minute\', {0})::int, ' + 'DATE_PART(\'second\', {0})::double precision)'.format(column) + ) + elif sql_flavor == SQLFlavor.BIGQUERY: + trans = ( + f'{column} = TIMESTAMP(DATETIME(' + f'DATE(EXTRACT(YEAR FROM {column}), 1, 1),' + f'TIME({column})))' + ) + else: + raise NotImplementedError( + f'MASK-DATE transformation in {sql_flavor.value} SQL flavor ' + f'not implemented!' + ) + + return trans + + @classmethod + # pylint: disable=W0238 # False positive when it is used by another classmethod + def __mask_string_skip_ends_to_sql( + cls, transform_type: TransformationType, column: str, sql_flavor: SQLFlavor + ) -> str: + """ + convert MASK-STRING-SKIP-ENDS-n transformation into the right sql string + Args: + column: column to apply the masking to + sql_flavor: the sql flavor to use + + Raises: NotImplementedError if mask-string-skip-ends is not implemented for the given sql flavor + + Returns: sql string equivalent of the mask-string-skip-ends + """ + skip_ends_n = int(transform_type.value[-1]) + + if sql_flavor == SQLFlavor.SNOWFLAKE: + trans = '{0} = CASE WHEN LENGTH({0}) > 2 * {1} THEN ' \ + 'CONCAT(SUBSTRING({0}, 1, {1}), REPEAT(\'*\', LENGTH({0})-(2 * {1})), ' \ + 'SUBSTRING({0}, LENGTH({0})-{1}+1, {1})) ' \ + 'ELSE REPEAT(\'*\', LENGTH({0})) END'.format(column, skip_ends_n) + elif sql_flavor == SQLFlavor.POSTGRES: + trans = '{0} = CASE WHEN LENGTH({0}) > 2 * {1} THEN ' \ + 'CONCAT(SUBSTRING({0}, 1, {1}), REPEAT(\'*\', LENGTH({0})-(2 * {1})), ' \ + 'SUBSTRING({0}, LENGTH({0})-{1}+1, {1})) ' \ + 'ELSE REPEAT(\'*\', LENGTH({0})) END'.format(column, skip_ends_n) elif sql_flavor == SQLFlavor.BIGQUERY: - trans = f'{column} = TIMESTAMP(DATETIME(' \ - f'DATE(EXTRACT(YEAR FROM {column}), 1, 1),' \ - f'TIME({column})))' + trans = '{0} = CASE WHEN LENGTH({0}) > 2 * {1} THEN ' \ + 'CONCAT(SUBSTRING({0}, 1, {1}), REPEAT(\'*\', LENGTH({0})-(2 * {1})), ' \ + 'SUBSTRING({0}, LENGTH({0})-{1}+1, {1})) ' \ + 'ELSE REPEAT(\'*\', LENGTH({0})) END'.format(column, skip_ends_n) else: - raise NotImplementedError(f'MASK-DATE transformation in {sql_flavor.value} SQL flavor ' + raise NotImplementedError(f'MASK-STRING-SKIP-ENDS transformation in {sql_flavor.value} SQL flavor ' f'not implemented!') return trans diff --git a/pipelinewise/fastsync/commons/utils.py b/pipelinewise/fastsync/commons/utils.py index 20729c66a..761101c97 100644 --- a/pipelinewise/fastsync/commons/utils.py +++ b/pipelinewise/fastsync/commons/utils.py @@ -19,6 +19,7 @@ 
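
# Illustrative note (not part of the patch above): a minimal plain-Python sketch of the
# masking rule that the new MASK-STRING-SKIP-ENDS-n transformation generates in SQL via
# __mask_string_skip_ends_to_sql for Snowflake, Postgres and BigQuery. The column value
# and n used below are hypothetical; the rule itself is taken from the generated CASE/
# CONCAT/REPEAT/SUBSTRING expression: keep the first and last n characters when the value
# is longer than 2*n, otherwise mask the whole value.
def mask_string_skip_ends(value: str, skip_ends_n: int) -> str:
    if len(value) > 2 * skip_ends_n:
        masked_middle = '*' * (len(value) - 2 * skip_ends_n)
        return value[:skip_ends_n] + masked_middle + value[-skip_ends_n:]
    return '*' * len(value)

# e.g. mask_string_skip_ends('sensitive', 2) -> 'se*****ve'
#      mask_string_skip_ends('abc', 2)       -> '***'
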
class NotSelectedTableException(Exception): """ Exception to raise when a table is not selected for resync """ + def __init__(self, table_name, selected_tables): self.message = f'Cannot Resync unselected table "{table_name}"! Selected tables are: {selected_tables}' super().__init__(self, self.message) @@ -35,13 +36,13 @@ def get_cpu_cores(): def load_json(path): - with open(path) as fil: + with open(path, encoding='utf-8') as fil: return json.load(fil) def save_dict_to_json(path, data): LOGGER.info('Saving new state file to %s', path) - with open(path, 'w') as fil: + with open(path, 'w', encoding='utf-8') as fil: fil.write(json.dumps(data)) @@ -70,7 +71,7 @@ def tablename_to_dict(table, separator='.'): 'catalog_name': catalog_name, 'schema_name': schema_name, 'table_name': table_name, - 'temp_table_name': '{}_temp'.format(table_name) + 'temp_table_name': '{}_temp'.format(table_name), } @@ -84,8 +85,14 @@ def get_tables_from_properties(properties: Dict) -> set: metadata = stream.get('metadata', []) table_name = stream.get('table_name', stream['stream']) - table_meta = next((i for i in metadata if isinstance(i, dict) and len(i.get('breadcrumb', [])) == 0), - {}).get('metadata') + table_meta = next( + ( + i + for i in metadata + if isinstance(i, dict) and len(i.get('breadcrumb', [])) == 0 + ), + {}, + ).get('metadata') selected = table_meta.get('selected', False) schema_name = table_meta.get('schema-name') db_name = table_meta.get('database-name') @@ -100,11 +107,7 @@ def get_tables_from_properties(properties: Dict) -> set: return tables -def get_bookmark_for_table( - table, - properties, - db_engine, - dbname=None): +def get_bookmark_for_table(table, properties, db_engine, dbname=None): """Get actual bookmark for a specific table used for LOG_BASED or INCREMENTAL replications """ @@ -116,24 +119,37 @@ def get_bookmark_for_table( table_name = stream.get('table_name', stream['stream']) # Get table specific metadata i.e. replication method, replication key, etc. 
- table_meta = next((i for i in metadata if isinstance(i, dict) and len(i.get('breadcrumb', [])) == 0), - {}).get('metadata') + table_meta = next( + ( + i + for i in metadata + if isinstance(i, dict) and len(i.get('breadcrumb', [])) == 0 + ), + {}, + ).get('metadata') db_name = table_meta.get('database-name') schema_name = table_meta.get('schema-name') replication_method = table_meta.get('replication-method') replication_key = table_meta.get('replication-key') - fully_qualified_table_name = '{}.{}'.format(schema_name or db_name, table_name) \ - if schema_name is not None or db_name is not None else table_name + fully_qualified_table_name = ( + '{}.{}'.format(schema_name or db_name, table_name) + if schema_name is not None or db_name is not None + else table_name + ) - if (dbname is None or db_name == dbname) and fully_qualified_table_name == table: + if ( + dbname is None or db_name == dbname + ) and fully_qualified_table_name == table: # Log based replication: get mysql binlog position if replication_method == 'LOG_BASED': bookmark = db_engine.fetch_current_log_pos() # Key based incremental replication: Get max replication key from source elif replication_method == 'INCREMENTAL': - bookmark = db_engine.fetch_current_incremental_key_pos(fully_qualified_table_name, replication_key) + bookmark = db_engine.fetch_current_incremental_key_pos( + fully_qualified_table_name, replication_key + ) break @@ -155,7 +171,9 @@ def get_target_schema(target_config, table): } """ target_schema = None - config_default_target_schema = target_config.get('default_target_schema', '').strip() + config_default_target_schema = target_config.get( + 'default_target_schema', '' + ).strip() config_schema_mapping = target_config.get('schema_mapping', {}) table_dict = tablename_to_dict(table) @@ -168,7 +186,8 @@ def get_target_schema(target_config, table): if not target_schema: raise Exception( "Target schema name not defined in config. Neither 'default_target_schema' (string) nor 'schema_mapping' " - '(object) defines target schema for {} stream. '.format(table)) + '(object) defines target schema for {} stream. 
'.format(table) + ) return target_schema @@ -202,13 +221,17 @@ def get_grantees(target_config, table): } """ grantees = [] - config_default_target_schema_select_permissions = target_config.get('default_target_schema_select_permissions', []) + config_default_target_schema_select_permissions = target_config.get( + 'default_target_schema_select_permissions', [] + ) config_schema_mapping = target_config.get('schema_mapping', {}) table_dict = tablename_to_dict(table) table_schema = table_dict['schema_name'] if config_schema_mapping and table_schema in config_schema_mapping: - grantees = config_schema_mapping[table_schema].get('target_schema_select_permissions', []) + grantees = config_schema_mapping[table_schema].get( + 'target_schema_select_permissions', [] + ) elif config_default_target_schema_select_permissions: grantees = config_default_target_schema_select_permissions @@ -248,9 +271,13 @@ def grant_privilege(schema, grantees, grant_method, to_group=False): def save_state_file(path, table, bookmark, dbname=None): table_dict = tablename_to_dict(table) if dbname: - stream_id = '{}-{}-{}'.format(dbname, table_dict.get('schema_name'), table_dict.get('table_name')) + stream_id = '{}-{}-{}'.format( + dbname, table_dict.get('schema_name'), table_dict.get('table_name') + ) elif table_dict['schema_name']: - stream_id = '{}-{}'.format(table_dict['schema_name'], table_dict.get('table_name')) + stream_id = '{}-{}'.format( + table_dict['schema_name'], table_dict.get('table_name') + ) else: stream_id = table_dict['table_name'] @@ -275,7 +302,6 @@ def save_state_file(path, table, bookmark, dbname=None): save_dict_to_json(path, state) - def parse_args(required_config_keys: Dict) -> argparse.Namespace: """Parse standard command-line args. @@ -299,8 +325,14 @@ def parse_args(required_config_keys: Dict) -> argparse.Namespace: parser.add_argument('--target', help='Target Config file', required=True) parser.add_argument('--transform', help='Transformations Config file') parser.add_argument('--tables', help='Sync only specific tables') - parser.add_argument('--temp_dir', help='Temporary directory required for CSV exports') - parser.add_argument('--drop_pg_slot', help='Drop pg replication slot before starting resync', action='store_true') + parser.add_argument( + '--temp_dir', help='Temporary directory required for CSV exports' + ) + parser.add_argument( + '--drop_pg_slot', + help='Drop pg replication slot before starting resync', + action='store_true', + ) args: argparse.Namespace = parser.parse_args() @@ -348,15 +380,20 @@ def retry_pattern(): import backoff from botocore.exceptions import ClientError - return backoff.on_exception(backoff.expo, - ClientError, - max_tries=5, - on_backoff=log_backoff_attempt, - factor=10) + return backoff.on_exception( + backoff.expo, + ClientError, + max_tries=5, + on_backoff=log_backoff_attempt, + factor=10, + ) def log_backoff_attempt(details): - LOGGER.error('Error detected communicating with Amazon, triggering backoff: %s try', details.get('tries')) + LOGGER.error( + 'Error detected communicating with Amazon, triggering backoff: %s try', + details.get('tries'), + ) def get_pool_size(tap: Dict) -> int: @@ -377,11 +414,9 @@ def get_pool_size(tap: Dict) -> int: return min(fastsync_parallelism, cpu_cores) -def gen_export_filename(tap_id: str, - table: str, - suffix: str = None, - postfix: str = None, - ext: str = None) -> str: +def gen_export_filename( + tap_id: str, table: str, suffix: str = None, postfix: str = None, ext: str = None +) -> str: """ Generates a unique filename 
used for exported fastsync data that avoids file name collision @@ -407,8 +442,6 @@ def gen_export_filename(tap_id: str, if not ext: ext = 'csv.gz' - return 'pipelinewise_{}_{}_{}_fastsync_{}.{}'.format(tap_id, - table, - suffix, - postfix, - ext) + return 'pipelinewise_{}_{}_{}_fastsync_{}.{}'.format( + tap_id, table, suffix, postfix, ext + ) diff --git a/pipelinewise/fastsync/mongodb_to_bigquery.py b/pipelinewise/fastsync/mongodb_to_bigquery.py index 9fc88b343..c82dd3cbc 100644 --- a/pipelinewise/fastsync/mongodb_to_bigquery.py +++ b/pipelinewise/fastsync/mongodb_to_bigquery.py @@ -25,15 +25,13 @@ 'auth_database', 'dbname', ], - 'target': [ - 'project_id' - ] + 'target': ['project_id'], } LOCK = multiprocessing.Lock() -def tap_type_to_target_type(mongo_type): +def tap_type_to_target_type(mongo_type, *_): """Data type mapping from MongoDB to Bigquery""" return { 'string': 'STRING', @@ -53,7 +51,9 @@ def sync_table(table: str, args: Namespace) -> Union[bool, str]: try: dbname = args.tap.get('dbname') - filename = 'pipelinewise_fastsync_{}_{}_{}.csv'.format(dbname, table, time.strftime('%Y%m%d-%H%M%S')) + filename = 'pipelinewise_fastsync_{}_{}_{}.csv'.format( + dbname, table, time.strftime('%Y%m%d-%H%M%S') + ) filepath = os.path.join(args.temp_dir, filename) target_schema = utils.get_target_schema(args.target, table) @@ -61,7 +61,9 @@ def sync_table(table: str, args: Namespace) -> Union[bool, str]: mongodb.open_connection() # Get bookmark - LSN position or Incremental Key value - bookmark = utils.get_bookmark_for_table(table, args.properties, mongodb, dbname=dbname) + bookmark = utils.get_bookmark_for_table( + table, args.properties, mongodb, dbname=dbname + ) # Exporting table data, get table definitions and close connection to avoid timeouts mongodb.copy_table(table, filepath, args.temp_dir) @@ -75,7 +77,14 @@ def sync_table(table: str, args: Namespace) -> Union[bool, str]: bigquery.create_table(target_schema, table, bigquery_columns, is_temporary=True) # Load into Bigquery table - bigquery.copy_to_table(filepath, target_schema, table, size_bytes, is_temporary=True, skip_csv_header=True) + bigquery.copy_to_table( + filepath, + target_schema, + table, + size_bytes, + is_temporary=True, + skip_csv_header=True, + ) os.remove(filepath) # Obfuscate columns @@ -113,7 +122,8 @@ def main_impl(): table_sync_excs = [] # Log start info - LOGGER.info(""" + LOGGER.info( + """ ------------------------------------------------------- STARTING SYNC ------------------------------------------------------- @@ -121,16 +131,25 @@ def main_impl(): Total tables selected to sync : %s Pool size : %s ------------------------------------------------------- - """, args.tables, len(args.tables), pool_size) + """, + args.tables, + len(args.tables), + pool_size, + ) # Start loading tables in parallel in spawning processes with multiprocessing.Pool(pool_size) as proc: table_sync_excs = list( - filter(lambda x: not isinstance(x, bool), proc.map(partial(sync_table, args=args), args.tables))) + filter( + lambda x: not isinstance(x, bool), + proc.map(partial(sync_table, args=args), args.tables), + ) + ) # Log summary end_time = datetime.now() - LOGGER.info(""" + LOGGER.info( + """ ------------------------------------------------------- SYNC FINISHED - SUMMARY ------------------------------------------------------- @@ -141,8 +160,13 @@ def main_impl(): Pool size : %s Runtime : %s ------------------------------------------------------- - """, len(args.tables), len(args.tables) - len(table_sync_excs), str(table_sync_excs), - 
pool_size, end_time - start_time) + """, + len(args.tables), + len(args.tables) - len(table_sync_excs), + str(table_sync_excs), + pool_size, + end_time - start_time, + ) if len(table_sync_excs) > 0: sys.exit(1) diff --git a/pipelinewise/fastsync/mongodb_to_postgres.py b/pipelinewise/fastsync/mongodb_to_postgres.py index 80003dde3..b8039dbb6 100644 --- a/pipelinewise/fastsync/mongodb_to_postgres.py +++ b/pipelinewise/fastsync/mongodb_to_postgres.py @@ -24,18 +24,13 @@ 'auth_database', 'dbname', ], - 'target': [ - 'host', - 'port', - 'user', - 'password' - ] + 'target': ['host', 'port', 'user', 'password'], } LOCK = multiprocessing.Lock() -def tap_type_to_target_type(mongo_type): +def tap_type_to_target_type(mongo_type, *_): """Data type mapping from MongoDB to Postgres""" return { 'string': 'CHARACTER VARYING', @@ -54,7 +49,9 @@ def sync_table(table: str, args: Namespace) -> Union[bool, str]: postgres = FastSyncTargetPostgres(args.target, args.transform) try: - filename = utils.gen_export_filename(tap_id=args.target.get('tap_id'), table=table) + filename = utils.gen_export_filename( + tap_id=args.target.get('tap_id'), table=table + ) filepath = os.path.join(args.temp_dir, filename) target_schema = utils.get_target_schema(args.target, table) @@ -62,7 +59,9 @@ def sync_table(table: str, args: Namespace) -> Union[bool, str]: mongodb.open_connection() # Get bookmark - token of the most recent ChangeStream for logbased - bookmark = utils.get_bookmark_for_table(table, args.properties, mongodb, dbname=args.tap.get('dbname')) + bookmark = utils.get_bookmark_for_table( + table, args.properties, mongodb, dbname=args.tap.get('dbname') + ) # Exporting table data, get table definitions and close connection to avoid timeouts mongodb.copy_table(table, filepath, args.temp_dir) @@ -74,10 +73,19 @@ def sync_table(table: str, args: Namespace) -> Union[bool, str]: # Creating temp table in Postgres postgres.drop_table(target_schema, table, is_temporary=True) - postgres.create_table(target_schema, table, postgres_columns, primary_key, is_temporary=True) + postgres.create_table( + target_schema, table, postgres_columns, primary_key, is_temporary=True + ) # Load into Postgres table - postgres.copy_to_table(filepath, target_schema, table, size_bytes, is_temporary=True, skip_csv_header=True) + postgres.copy_to_table( + filepath, + target_schema, + table, + size_bytes, + is_temporary=True, + skip_csv_header=True, + ) os.remove(filepath) # Obfuscate columns @@ -114,7 +122,8 @@ def main_impl(): table_sync_excs = [] # Log start info - LOGGER.info(""" + LOGGER.info( + """ ------------------------------------------------------- STARTING SYNC ------------------------------------------------------- @@ -122,7 +131,11 @@ def main_impl(): Total tables selected to sync : %s Pool size : %s ------------------------------------------------------- - """, args.tables, len(args.tables), pool_size) + """, + args.tables, + len(args.tables), + pool_size, + ) # Create target schemas sequentially, Postgres doesn't like it running in parallel postgres_target = FastSyncTargetPostgres(args.target, args.transform) @@ -131,11 +144,16 @@ def main_impl(): # Start loading tables in parallel in spawning processes with multiprocessing.Pool(pool_size) as proc: table_sync_excs = list( - filter(lambda x: not isinstance(x, bool), proc.map(partial(sync_table, args=args), args.tables))) + filter( + lambda x: not isinstance(x, bool), + proc.map(partial(sync_table, args=args), args.tables), + ) + ) # Log summary end_time = datetime.now() - LOGGER.info(""" 
+ LOGGER.info( + """ ------------------------------------------------------- SYNC FINISHED - SUMMARY ------------------------------------------------------- @@ -146,8 +164,13 @@ def main_impl(): Pool size : %s Runtime : %s ------------------------------------------------------- - """, len(args.tables), len(args.tables) - len(table_sync_excs), str(table_sync_excs), - pool_size, end_time - start_time) + """, + len(args.tables), + len(args.tables) - len(table_sync_excs), + str(table_sync_excs), + pool_size, + end_time - start_time, + ) if len(table_sync_excs) > 0: sys.exit(1) diff --git a/pipelinewise/fastsync/mongodb_to_snowflake.py b/pipelinewise/fastsync/mongodb_to_snowflake.py index 2ffe06cc0..9a662cc4c 100644 --- a/pipelinewise/fastsync/mongodb_to_snowflake.py +++ b/pipelinewise/fastsync/mongodb_to_snowflake.py @@ -32,14 +32,14 @@ 'warehouse', 's3_bucket', 'stage', - 'file_format' - ] + 'file_format', + ], } LOCK = multiprocessing.Lock() -def tap_type_to_target_type(mongo_type): +def tap_type_to_target_type(mongo_type, *_): """Data type mapping from MongoDB to Snowflake""" return { 'string': 'TEXT', @@ -69,7 +69,9 @@ def sync_table(table: str, args: Namespace) -> Union[bool, str]: mongodb.open_connection() # Get bookmark - LSN position or Incremental Key value - bookmark = utils.get_bookmark_for_table(table, args.properties, mongodb, dbname=dbname) + bookmark = utils.get_bookmark_for_table( + table, args.properties, mongodb, dbname=dbname + ) # Exporting table data, get table definitions and close connection to avoid timeouts mongodb.copy_table(table, filepath, args.temp_dir) @@ -85,10 +87,19 @@ def sync_table(table: str, args: Namespace) -> Union[bool, str]: # Creating temp table in Snowflake snowflake.create_schema(target_schema) - snowflake.create_table(target_schema, table, snowflake_columns, primary_key, is_temporary=True) + snowflake.create_table( + target_schema, table, snowflake_columns, primary_key, is_temporary=True + ) # Load into Snowflake table - snowflake.copy_to_table(s3_key, target_schema, table, size_bytes, is_temporary=True, skip_csv_header=True) + snowflake.copy_to_table( + s3_key, + target_schema, + table, + size_bytes, + is_temporary=True, + skip_csv_header=True, + ) if archive_load_files: # Copy load file to archive @@ -132,7 +143,8 @@ def main_impl(): table_sync_excs = [] # Log start info - LOGGER.info(""" + LOGGER.info( + """ ------------------------------------------------------- STARTING SYNC ------------------------------------------------------- @@ -140,16 +152,25 @@ def main_impl(): Total tables selected to sync : %s Pool size : %s ------------------------------------------------------- - """, args.tables, len(args.tables), pool_size) + """, + args.tables, + len(args.tables), + pool_size, + ) # Start loading tables in parallel in spawning processes with multiprocessing.Pool(pool_size) as proc: table_sync_excs = list( - filter(lambda x: not isinstance(x, bool), proc.map(partial(sync_table, args=args), args.tables))) + filter( + lambda x: not isinstance(x, bool), + proc.map(partial(sync_table, args=args), args.tables), + ) + ) # Log summary end_time = datetime.now() - LOGGER.info(""" + LOGGER.info( + """ ------------------------------------------------------- SYNC FINISHED - SUMMARY ------------------------------------------------------- @@ -160,8 +181,13 @@ def main_impl(): Pool size : %s Runtime : %s ------------------------------------------------------- - """, len(args.tables), len(args.tables) - len(table_sync_excs), str(table_sync_excs), - pool_size, 
end_time - start_time) + """, + len(args.tables), + len(args.tables) - len(table_sync_excs), + str(table_sync_excs), + pool_size, + end_time - start_time, + ) if len(table_sync_excs) > 0: sys.exit(1) diff --git a/pipelinewise/fastsync/mysql_to_bigquery.py b/pipelinewise/fastsync/mysql_to_bigquery.py index 4fa2e92ad..b8529f39d 100644 --- a/pipelinewise/fastsync/mysql_to_bigquery.py +++ b/pipelinewise/fastsync/mysql_to_bigquery.py @@ -14,20 +14,15 @@ from .commons.tap_mysql import FastSyncTapMySql from .commons.target_bigquery import FastSyncTargetBigquery -MAX_NUM='99999999999999999999999999999.999999999' +MAX_NUM = '99999999999999999999999999999.999999999' LOGGER = logging.getLogger(__name__) REQUIRED_CONFIG_KEYS = { - 'tap': [ - 'host', - 'port', - 'user', - 'password' - ], + 'tap': ['host', 'port', 'user', 'password'], 'target': [ 'project_id', - ] + ], } LOCK = multiprocessing.Lock() @@ -36,35 +31,35 @@ def tap_type_to_target_type(mysql_type, mysql_column_type): """Data type mapping from MySQL to Bigquery""" return { - 'char':'STRING', - 'varchar':'STRING', - 'binary':'STRING', - 'varbinary':'STRING', - 'blob':'STRING', - 'tinyblob':'STRING', - 'mediumblob':'STRING', - 'longblob':'STRING', - 'geometry':'STRING', - 'text':'STRING', - 'tinytext':'STRING', - 'mediumtext':'STRING', - 'longtext':'STRING', - 'enum':'STRING', - 'int':'INT64', - 'tinyint':'BOOL' if mysql_column_type == 'tinyint(1)' else 'INT64', - 'smallint':'INT64', - 'mediumint':'INT64', - 'bigint':'INT64', - 'bit':'BOOL', - 'decimal':'NUMERIC', - 'double':'NUMERIC', - 'float':'NUMERIC', - 'bool':'BOOL', - 'boolean':'BOOL', - 'date':'TIMESTAMP', - 'datetime':'TIMESTAMP', - 'timestamp':'TIMESTAMP', - 'time':'TIME' + 'char': 'STRING', + 'varchar': 'STRING', + 'binary': 'STRING', + 'varbinary': 'STRING', + 'blob': 'STRING', + 'tinyblob': 'STRING', + 'mediumblob': 'STRING', + 'longblob': 'STRING', + 'geometry': 'STRING', + 'text': 'STRING', + 'tinytext': 'STRING', + 'mediumtext': 'STRING', + 'longtext': 'STRING', + 'enum': 'STRING', + 'int': 'INT64', + 'tinyint': 'BOOL' if mysql_column_type == 'tinyint(1)' else 'INT64', + 'smallint': 'INT64', + 'mediumint': 'INT64', + 'bigint': 'INT64', + 'bit': 'BOOL', + 'decimal': 'NUMERIC', + 'double': 'NUMERIC', + 'float': 'NUMERIC', + 'bool': 'BOOL', + 'boolean': 'BOOL', + 'date': 'TIMESTAMP', + 'datetime': 'TIMESTAMP', + 'timestamp': 'TIMESTAMP', + 'time': 'TIME', }.get(mysql_type, 'STRING') @@ -75,7 +70,9 @@ def sync_table(table: str, args: Namespace) -> Union[bool, str]: bigquery = FastSyncTargetBigquery(args.target, args.transform) try: - filename = 'pipelinewise_fastsync_{}_{}.csv'.format(table, time.strftime('%Y%m%d-%H%M%S')) + filename = 'pipelinewise_fastsync_{}_{}.csv'.format( + table, time.strftime('%Y%m%d-%H%M%S') + ) filepath = os.path.join(args.temp_dir, filename) target_schema = utils.get_target_schema(args.target, table) @@ -86,11 +83,9 @@ def sync_table(table: str, args: Namespace) -> Union[bool, str]: bookmark = utils.get_bookmark_for_table(table, args.properties, mysql) # Exporting table data, get table definitions and close connection to avoid timeouts - mysql.copy_table(table, - filepath, - compress=False, - max_num=MAX_NUM, - date_type='datetime') + mysql.copy_table( + table, filepath, compress=False, max_num=MAX_NUM, date_type='datetime' + ) file_parts = glob.glob(f'{filepath}*') size_bytes = sum([os.path.getsize(file_part) for file_part in file_parts]) bigquery_types = mysql.map_column_types_to_target(table) @@ -110,7 +105,8 @@ def sync_table(table: str, args: 
Namespace) -> Union[bool, str]: table, size_bytes, is_temporary=True, - write_truncate=write_truncate) + write_truncate=write_truncate, + ) os.remove(file_part) # Obfuscate columns @@ -148,7 +144,8 @@ def main_impl(): table_sync_excs = [] # Log start info - LOGGER.info(""" + LOGGER.info( + """ ------------------------------------------------------- STARTING SYNC ------------------------------------------------------- @@ -156,16 +153,25 @@ def main_impl(): Total tables selected to sync : %s Pool size : %s ------------------------------------------------------- - """, args.tables, len(args.tables), pool_size) + """, + args.tables, + len(args.tables), + pool_size, + ) # Start loading tables in parallel in spawning processes with multiprocessing.Pool(pool_size) as proc: table_sync_excs = list( - filter(lambda x: not isinstance(x, bool), proc.map(partial(sync_table, args=args), args.tables))) + filter( + lambda x: not isinstance(x, bool), + proc.map(partial(sync_table, args=args), args.tables), + ) + ) # Log summary end_time = datetime.now() - LOGGER.info(""" + LOGGER.info( + """ ------------------------------------------------------- SYNC FINISHED - SUMMARY ------------------------------------------------------- @@ -176,8 +182,13 @@ def main_impl(): Pool size : %s Runtime : %s ------------------------------------------------------- - """, len(args.tables), len(args.tables) - len(table_sync_excs), str(table_sync_excs), - pool_size, end_time - start_time) + """, + len(args.tables), + len(args.tables) - len(table_sync_excs), + str(table_sync_excs), + pool_size, + end_time - start_time, + ) if len(table_sync_excs) > 0: sys.exit(1) diff --git a/pipelinewise/fastsync/mysql_to_postgres.py b/pipelinewise/fastsync/mysql_to_postgres.py index 0821c98a6..20d8d2ecb 100644 --- a/pipelinewise/fastsync/mysql_to_postgres.py +++ b/pipelinewise/fastsync/mysql_to_postgres.py @@ -15,18 +15,8 @@ LOGGER = Logger().get_logger(__name__) REQUIRED_CONFIG_KEYS = { - 'tap': [ - 'host', - 'port', - 'user', - 'password' - ], - 'target': [ - 'host', - 'port', - 'user', - 'password' - ] + 'tap': ['host', 'port', 'user', 'password'], + 'target': ['host', 'port', 'user', 'password'], } LOCK = multiprocessing.Lock() @@ -57,7 +47,9 @@ def tap_type_to_target_type(mysql_type, mysql_column_type): 'longtext': 'CHARACTER VARYING', 'enum': 'CHARACTER VARYING', 'int': 'INTEGER NULL', - 'tinyint': 'BOOLEAN' if mysql_column_type and mysql_column_type.startswith('tinyint(1)') else 'SMALLINT NULL', + 'tinyint': 'BOOLEAN' + if mysql_column_type and mysql_column_type.startswith('tinyint(1)') + else 'SMALLINT NULL', 'smallint': 'SMALLINT NULL', 'mediumint': 'INTEGER NULL', 'bigint': 'BIGINT NULL', @@ -84,7 +76,9 @@ def sync_table(table: str, args: Namespace) -> Union[bool, str]: postgres = FastSyncTargetPostgres(args.target, args.transform) try: - filename = utils.gen_export_filename(tap_id=args.target.get('tap_id'), table=table) + filename = utils.gen_export_filename( + tap_id=args.target.get('tap_id'), table=table + ) filepath = os.path.join(args.temp_dir, filename) target_schema = utils.get_target_schema(args.target, table) @@ -104,10 +98,14 @@ def sync_table(table: str, args: Namespace) -> Union[bool, str]: # Creating temp table in Postgres postgres.drop_table(target_schema, table, is_temporary=True) - postgres.create_table(target_schema, table, postgres_columns, primary_key, is_temporary=True) + postgres.create_table( + target_schema, table, postgres_columns, primary_key, is_temporary=True + ) # Load into Postgres table - 
postgres.copy_to_table(filepath, target_schema, table, size_bytes, is_temporary=True) + postgres.copy_to_table( + filepath, target_schema, table, size_bytes, is_temporary=True + ) os.remove(filepath) # Obfuscate columns @@ -144,7 +142,8 @@ def main_impl(): table_sync_excs = [] # Log start info - LOGGER.info(""" + LOGGER.info( + """ ------------------------------------------------------- STARTING SYNC ------------------------------------------------------- @@ -152,7 +151,11 @@ def main_impl(): Total tables selected to sync : %s Pool size : %s ------------------------------------------------------- - """, args.tables, len(args.tables), pool_size) + """, + args.tables, + len(args.tables), + pool_size, + ) # Create target schemas sequentially, Postgres doesn't like it running in parallel postgres_target = FastSyncTargetPostgres(args.target, args.transform) @@ -161,11 +164,16 @@ def main_impl(): # Start loading tables in parallel in spawning processes with multiprocessing.Pool(pool_size) as proc: table_sync_excs = list( - filter(lambda x: not isinstance(x, bool), proc.map(partial(sync_table, args=args), args.tables))) + filter( + lambda x: not isinstance(x, bool), + proc.map(partial(sync_table, args=args), args.tables), + ) + ) # Log summary end_time = datetime.now() - LOGGER.info(""" + LOGGER.info( + """ ------------------------------------------------------- SYNC FINISHED - SUMMARY ------------------------------------------------------- @@ -176,8 +184,13 @@ def main_impl(): Pool size : %s Runtime : %s ------------------------------------------------------- - """, len(args.tables), len(args.tables) - len(table_sync_excs), str(table_sync_excs), - pool_size, end_time - start_time) + """, + len(args.tables), + len(args.tables) - len(table_sync_excs), + str(table_sync_excs), + pool_size, + end_time - start_time, + ) if len(table_sync_excs) > 0: sys.exit(1) diff --git a/pipelinewise/fastsync/mysql_to_redshift.py b/pipelinewise/fastsync/mysql_to_redshift.py index 8bb948f32..94ac6afbd 100644 --- a/pipelinewise/fastsync/mysql_to_redshift.py +++ b/pipelinewise/fastsync/mysql_to_redshift.py @@ -15,20 +15,8 @@ LOGGER = Logger().get_logger(__name__) REQUIRED_CONFIG_KEYS = { - 'tap': [ - 'host', - 'port', - 'user', - 'password' - ], - 'target': [ - 'host', - 'port', - 'user', - 'password', - 'dbname', - 's3_bucket' - ] + 'tap': ['host', 'port', 'user', 'password'], + 'target': ['host', 'port', 'user', 'password', 'dbname', 's3_bucket'], } DEFAULT_VARCHAR_LENGTH = 10000 @@ -63,7 +51,9 @@ def tap_type_to_target_type(mysql_type, mysql_column_type): 'longtext': 'CHARACTER VARYING({})'.format(LONG_VARCHAR_LENGTH), 'enum': 'CHARACTER VARYING({})'.format(DEFAULT_VARCHAR_LENGTH), 'int': 'NUMERIC NULL', - 'tinyint': 'BOOLEAN' if mysql_column_type and mysql_column_type.startswith('tinyint(1)') else 'NUMERIC NULL', + 'tinyint': 'BOOLEAN' + if mysql_column_type and mysql_column_type.startswith('tinyint(1)') + else 'NUMERIC NULL', 'smallint': 'NUMERIC NULL', 'mediumint': 'NUMERIC NULL', 'bigint': 'NUMERIC NULL', @@ -76,7 +66,7 @@ def tap_type_to_target_type(mysql_type, mysql_column_type): 'date': 'TIMESTAMP WITHOUT TIME ZONE', 'datetime': 'TIMESTAMP WITHOUT TIME ZONE', 'timestamp': 'TIMESTAMP WITHOUT TIME ZONE', - 'json': 'CHARACTER VARYING({})'.format(LONG_VARCHAR_LENGTH) + 'json': 'CHARACTER VARYING({})'.format(LONG_VARCHAR_LENGTH), }.get( mysql_type, 'CHARACTER VARYING({})'.format(DEFAULT_VARCHAR_LENGTH), @@ -89,7 +79,9 @@ def sync_table(table: str, args: Namespace) -> Union[bool, str]: redshift = 
FastSyncTargetRedshift(args.target, args.transform) try: - filename = utils.gen_export_filename(tap_id=args.target.get('tap_id'), table=table) + filename = utils.gen_export_filename( + tap_id=args.target.get('tap_id'), table=table + ) filepath = os.path.join(args.temp_dir, filename) target_schema = utils.get_target_schema(args.target, table) @@ -113,10 +105,14 @@ def sync_table(table: str, args: Namespace) -> Union[bool, str]: # Creating temp table in Redshift redshift.drop_table(target_schema, table, is_temporary=True) - redshift.create_table(target_schema, table, redshift_columns, primary_key, is_temporary=True) + redshift.create_table( + target_schema, table, redshift_columns, primary_key, is_temporary=True + ) # Load into Redshift table - redshift.copy_to_table(s3_key, target_schema, table, size_bytes, is_temporary=True) + redshift.copy_to_table( + s3_key, target_schema, table, size_bytes, is_temporary=True + ) # Obfuscate columns redshift.obfuscate_columns(target_schema, table) @@ -152,7 +148,8 @@ def main_impl(): table_sync_excs = [] # Log start info - LOGGER.info(""" + LOGGER.info( + """ ------------------------------------------------------- STARTING SYNC ------------------------------------------------------- @@ -160,7 +157,11 @@ def main_impl(): Total tables selected to sync : %s Pool size : %s ------------------------------------------------------- - """, args.tables, len(args.tables), pool_size) + """, + args.tables, + len(args.tables), + pool_size, + ) # Create target schemas sequentially, Redshift doesn't like it running in parallel redshift = FastSyncTargetRedshift(args.target, args.transform) @@ -169,11 +170,16 @@ def main_impl(): # Start loading tables in parallel in spawning processes with multiprocessing.Pool(pool_size) as proc: table_sync_excs = list( - filter(lambda x: not isinstance(x, bool), proc.map(partial(sync_table, args=args), args.tables))) + filter( + lambda x: not isinstance(x, bool), + proc.map(partial(sync_table, args=args), args.tables), + ) + ) # Log summary end_time = datetime.now() - LOGGER.info(""" + LOGGER.info( + """ ------------------------------------------------------- SYNC FINISHED - SUMMARY ------------------------------------------------------- @@ -184,8 +190,13 @@ def main_impl(): Pool size : %s Runtime : %s ------------------------------------------------------- - """, len(args.tables), len(args.tables) - len(table_sync_excs), str(table_sync_excs), - pool_size, end_time - start_time) + """, + len(args.tables), + len(args.tables) - len(table_sync_excs), + str(table_sync_excs), + pool_size, + end_time - start_time, + ) if len(table_sync_excs) > 0: sys.exit(1) diff --git a/pipelinewise/fastsync/mysql_to_snowflake.py b/pipelinewise/fastsync/mysql_to_snowflake.py index 2b37ffd4e..2eb722e17 100644 --- a/pipelinewise/fastsync/mysql_to_snowflake.py +++ b/pipelinewise/fastsync/mysql_to_snowflake.py @@ -17,12 +17,7 @@ LOGGER = Logger().get_logger(__name__) REQUIRED_CONFIG_KEYS = { - 'tap': [ - 'host', - 'port', - 'user', - 'password' - ], + 'tap': ['host', 'port', 'user', 'password'], 'target': [ 'account', 'dbname', @@ -31,8 +26,8 @@ 'warehouse', 's3_bucket', 'stage', - 'file_format' - ] + 'file_format', + ], } LOCK = multiprocessing.Lock() @@ -63,7 +58,9 @@ def tap_type_to_target_type(mysql_type, mysql_column_type): 'longtext': 'VARCHAR', 'enum': 'VARCHAR', 'int': 'NUMBER', - 'tinyint': 'BOOLEAN' if mysql_column_type and mysql_column_type.startswith('tinyint(1)') else 'NUMBER', + 'tinyint': 'BOOLEAN' + if mysql_column_type and 
mysql_column_type.startswith('tinyint(1)') + else 'NUMBER', 'smallint': 'NUMBER', 'mediumint': 'NUMBER', 'bigint': 'NUMBER', @@ -77,7 +74,7 @@ def tap_type_to_target_type(mysql_type, mysql_column_type): 'datetime': 'TIMESTAMP_NTZ', 'timestamp': 'TIMESTAMP_NTZ', 'time': 'TIME', - 'json': 'VARIANT' + 'json': 'VARIANT', }.get(mysql_type, 'VARCHAR') @@ -89,7 +86,6 @@ def sync_table(table: str, args: Namespace) -> Union[bool, str]: tap_id = args.target.get('tap_id') archive_load_files = args.target.get('archive_load_files', False) - try: filename = utils.gen_export_filename(tap_id=tap_id, table=table) filepath = os.path.join(args.temp_dir, filename) @@ -102,11 +98,13 @@ def sync_table(table: str, args: Namespace) -> Union[bool, str]: bookmark = utils.get_bookmark_for_table(table, args.properties, mysql) # Exporting table data, get table definitions and close connection to avoid timeouts - mysql.copy_table(table, - filepath, - split_large_files=args.target.get('split_large_files'), - split_file_chunk_size_mb=args.target.get('split_file_chunk_size_mb'), - split_file_max_chunks=args.target.get('split_file_max_chunks')) + mysql.copy_table( + table, + filepath, + split_large_files=args.target.get('split_large_files'), + split_file_chunk_size_mb=args.target.get('split_file_chunk_size_mb'), + split_file_max_chunks=args.target.get('split_file_max_chunks'), + ) file_parts = glob.glob(f'{filepath}*') size_bytes = sum([os.path.getsize(file_part) for file_part in file_parts]) snowflake_types = mysql.map_column_types_to_target(table) @@ -121,14 +119,22 @@ def sync_table(table: str, args: Namespace) -> Union[bool, str]: os.remove(file_part) # Create a pattern that match all file parts by removing multipart suffix - s3_key_pattern = re.sub(r'\.part\d*$', '', s3_keys[0]) if len(s3_keys) > 0 else 'NO_FILES_TO_LOAD' + s3_key_pattern = ( + re.sub(r'\.part\d*$', '', s3_keys[0]) + if len(s3_keys) > 0 + else 'NO_FILES_TO_LOAD' + ) # Creating temp table in Snowflake snowflake.create_schema(target_schema) - snowflake.create_table(target_schema, table, snowflake_columns, primary_key, is_temporary=True) + snowflake.create_table( + target_schema, table, snowflake_columns, primary_key, is_temporary=True + ) # Load into Snowflake table - snowflake.copy_to_table(s3_key_pattern, target_schema, table, size_bytes, is_temporary=True) + snowflake.copy_to_table( + s3_key_pattern, target_schema, table, size_bytes, is_temporary=True + ) for s3_key in s3_keys: if archive_load_files: @@ -173,7 +179,8 @@ def main_impl(): table_sync_excs = [] # Log start info - LOGGER.info(""" + LOGGER.info( + """ ------------------------------------------------------- STARTING SYNC ------------------------------------------------------- @@ -181,16 +188,25 @@ def main_impl(): Total tables selected to sync : %s Pool size : %s ------------------------------------------------------- - """, args.tables, len(args.tables), pool_size) + """, + args.tables, + len(args.tables), + pool_size, + ) # Start loading tables in parallel in spawning processes with multiprocessing.Pool(pool_size) as proc: table_sync_excs = list( - filter(lambda x: not isinstance(x, bool), proc.map(partial(sync_table, args=args), args.tables))) + filter( + lambda x: not isinstance(x, bool), + proc.map(partial(sync_table, args=args), args.tables), + ) + ) # Log summary end_time = datetime.now() - LOGGER.info(""" + LOGGER.info( + """ ------------------------------------------------------- SYNC FINISHED - SUMMARY ------------------------------------------------------- @@ -201,8 +217,13 @@ 
def main_impl(): Pool size : %s Runtime : %s ------------------------------------------------------- - """, len(args.tables), len(args.tables) - len(table_sync_excs), str(table_sync_excs), - pool_size, end_time - start_time) + """, + len(args.tables), + len(args.tables) - len(table_sync_excs), + str(table_sync_excs), + pool_size, + end_time - start_time, + ) if len(table_sync_excs) > 0: sys.exit(1) diff --git a/pipelinewise/fastsync/postgres_to_bigquery.py b/pipelinewise/fastsync/postgres_to_bigquery.py index 3e75ea5e5..0539c91b7 100644 --- a/pipelinewise/fastsync/postgres_to_bigquery.py +++ b/pipelinewise/fastsync/postgres_to_bigquery.py @@ -14,58 +14,52 @@ from .commons.tap_postgres import FastSyncTapPostgres from .commons.target_bigquery import FastSyncTargetBigquery -MAX_NUM='99999999999999999999999999999.999999999' +MAX_NUM = '99999999999999999999999999999.999999999' LOGGER = logging.getLogger(__name__) REQUIRED_CONFIG_KEYS = { - 'tap': [ - 'host', - 'port', - 'user', - 'password' - ], - 'target': [ - 'project_id' - ] + 'tap': ['host', 'port', 'user', 'password'], + 'target': ['project_id'], } LOCK = multiprocessing.Lock() -def tap_type_to_target_type(pg_type): + +def tap_type_to_target_type(pg_type, *_): """Data type mapping from Postgres to Bigquery""" return { - 'char':'STRING', - 'character':'STRING', - 'varchar':'STRING', - 'character varying':'STRING', - 'text':'STRING', + 'char': 'STRING', + 'character': 'STRING', + 'varchar': 'STRING', + 'character varying': 'STRING', + 'text': 'STRING', 'bit': ['BOOL', 'NUMERIC'], - 'varbit':'NUMERIC', - 'bit varying':'NUMERIC', - 'smallint':'INT64', - 'int':'INT64', - 'integer':'INT64', - 'bigint':'INT64', - 'smallserial':'INT64', - 'serial':'INT64', - 'bigserial':'INT64', - 'numeric':'NUMERIC', - 'double precision':'NUMERIC', - 'real':'NUMERIC', - 'bool':'BOOL', - 'boolean':'BOOL', - 'date':'TIMESTAMP', - 'timestamp':'TIMESTAMP', - 'timestamp without time zone':'TIMESTAMP', - 'timestamp with time zone':'TIMESTAMP', - 'time':'TIME', - 'time without time zone':'TIME', - 'time with time zone':'TIME', + 'varbit': 'NUMERIC', + 'bit varying': 'NUMERIC', + 'smallint': 'INT64', + 'int': 'INT64', + 'integer': 'INT64', + 'bigint': 'INT64', + 'smallserial': 'INT64', + 'serial': 'INT64', + 'bigserial': 'INT64', + 'numeric': 'NUMERIC', + 'double precision': 'NUMERIC', + 'real': 'NUMERIC', + 'bool': 'BOOL', + 'boolean': 'BOOL', + 'date': 'TIMESTAMP', + 'timestamp': 'TIMESTAMP', + 'timestamp without time zone': 'TIMESTAMP', + 'timestamp with time zone': 'TIMESTAMP', + 'time': 'TIME', + 'time without time zone': 'TIME', + 'time with time zone': 'TIME', # This is all uppercase, because postgres stores it in this format in information_schema.columns.data_type - 'ARRAY':'STRING', - 'json':'STRING', - 'jsonb':'STRING' + 'ARRAY': 'STRING', + 'json': 'STRING', + 'jsonb': 'STRING', }.get(pg_type, 'STRING') @@ -77,7 +71,9 @@ def sync_table(table: str, args: Namespace) -> Union[bool, str]: try: dbname = args.tap.get('dbname') - filename = 'pipelinewise_fastsync_{}_{}_{}.csv'.format(dbname, table, time.strftime('%Y%m%d-%H%M%S')) + filename = 'pipelinewise_fastsync_{}_{}_{}.csv'.format( + dbname, table, time.strftime('%Y%m%d-%H%M%S') + ) filepath = os.path.join(args.temp_dir, filename) target_schema = utils.get_target_schema(args.target, table) @@ -85,14 +81,14 @@ def sync_table(table: str, args: Namespace) -> Union[bool, str]: postgres.open_connection() # Get bookmark - LSN position or Incremental Key value - bookmark = utils.get_bookmark_for_table(table, 
args.properties, postgres, dbname=dbname) + bookmark = utils.get_bookmark_for_table( + table, args.properties, postgres, dbname=dbname + ) # Exporting table data, get table definitions and close connection to avoid timeouts - postgres.copy_table(table, - filepath, - compress=False, - max_num=MAX_NUM, - date_type='timestamp') + postgres.copy_table( + table, filepath, compress=False, max_num=MAX_NUM, date_type='timestamp' + ) file_parts = glob.glob(f'{filepath}*') size_bytes = sum([os.path.getsize(file_part) for file_part in file_parts]) @@ -113,7 +109,8 @@ def sync_table(table: str, args: Namespace) -> Union[bool, str]: table, size_bytes, is_temporary=True, - write_truncate=write_truncate) + write_truncate=write_truncate, + ) os.remove(file_part) # Obfuscate columns @@ -151,7 +148,8 @@ def main_impl(): table_sync_excs = [] # Log start info - LOGGER.info(""" + LOGGER.info( + """ ------------------------------------------------------- STARTING SYNC ------------------------------------------------------- @@ -159,7 +157,11 @@ def main_impl(): Total tables selected to sync : %s Pool size : %s ------------------------------------------------------- - """, args.tables, len(args.tables), pool_size) + """, + args.tables, + len(args.tables), + pool_size, + ) # if internal arg drop_pg_slot is set to True, then we drop the slot before starting resync if args.drop_pg_slot: @@ -168,11 +170,16 @@ def main_impl(): # Start loading tables in parallel in spawning processes with multiprocessing.Pool(pool_size) as proc: table_sync_excs = list( - filter(lambda x: not isinstance(x, bool), proc.map(partial(sync_table, args=args), args.tables))) + filter( + lambda x: not isinstance(x, bool), + proc.map(partial(sync_table, args=args), args.tables), + ) + ) # Log summary end_time = datetime.now() - LOGGER.info(""" + LOGGER.info( + """ ------------------------------------------------------- SYNC FINISHED - SUMMARY ------------------------------------------------------- @@ -183,8 +190,13 @@ def main_impl(): Pool size : %s Runtime : %s ------------------------------------------------------- - """, len(args.tables), len(args.tables) - len(table_sync_excs), str(table_sync_excs), - pool_size, end_time - start_time) + """, + len(args.tables), + len(args.tables) - len(table_sync_excs), + str(table_sync_excs), + pool_size, + end_time - start_time, + ) if len(table_sync_excs) > 0: sys.exit(1) diff --git a/pipelinewise/fastsync/postgres_to_postgres.py b/pipelinewise/fastsync/postgres_to_postgres.py index fd4e1e2a3..3e74d8a64 100644 --- a/pipelinewise/fastsync/postgres_to_postgres.py +++ b/pipelinewise/fastsync/postgres_to_postgres.py @@ -22,20 +22,15 @@ 'user', 'password', 'dbname', - 'tap_id' # tap_id is required to generate unique replication slot names + 'tap_id', # tap_id is required to generate unique replication slot names ], - 'target': [ - 'host', - 'port', - 'user', - 'password' - ] + 'target': ['host', 'port', 'user', 'password'], } LOCK = multiprocessing.Lock() -def tap_type_to_target_type(pg_type): +def tap_type_to_target_type(pg_type, *_): """Data type mapping from Postgres to Postgres""" return { 'char': 'CHARACTER VARYING', @@ -68,7 +63,7 @@ def tap_type_to_target_type(pg_type): # ARRAY is uppercase, because postgres stores it in this format in information_schema.columns.data_type 'ARRAY': 'JSONB', 'json': 'JSONB', - 'jsonb': 'JSONB' + 'jsonb': 'JSONB', }.get(pg_type, 'CHARACTER VARYING') @@ -79,7 +74,9 @@ def sync_table(table: str, args: Namespace) -> Union[bool, str]: try: dbname = args.tap.get('dbname') - 
filename = utils.gen_export_filename(tap_id=args.target.get('tap_id'), table=table) + filename = utils.gen_export_filename( + tap_id=args.target.get('tap_id'), table=table + ) filepath = os.path.join(args.temp_dir, filename) target_schema = utils.get_target_schema(args.target, table) @@ -87,7 +84,9 @@ def sync_table(table: str, args: Namespace) -> Union[bool, str]: postgres.open_connection() # Get bookmark - LSN position or Incremental Key value - bookmark = utils.get_bookmark_for_table(table, args.properties, postgres, dbname=dbname) + bookmark = utils.get_bookmark_for_table( + table, args.properties, postgres, dbname=dbname + ) # Exporting table data, get table definitions and close connection to avoid timeouts postgres.copy_table(table, filepath) @@ -99,10 +98,18 @@ def sync_table(table: str, args: Namespace) -> Union[bool, str]: # Creating temp table in Postgres postgres_target.drop_table(target_schema, table, is_temporary=True) - postgres_target.create_table(target_schema, table, postgres_target_columns, primary_key, is_temporary=True) + postgres_target.create_table( + target_schema, + table, + postgres_target_columns, + primary_key, + is_temporary=True, + ) # Load into Postgres table - postgres_target.copy_to_table(filepath, target_schema, table, size_bytes, is_temporary=True) + postgres_target.copy_to_table( + filepath, target_schema, table, size_bytes, is_temporary=True + ) os.remove(filepath) # Obfuscate columns @@ -121,8 +128,12 @@ def sync_table(table: str, args: Namespace) -> Union[bool, str]: # Table loaded, grant select on all tables in target schema grantees = utils.get_grantees(args.target, table) - utils.grant_privilege(target_schema, grantees, postgres_target.grant_usage_on_schema) - utils.grant_privilege(target_schema, grantees, postgres_target.grant_select_on_schema) + utils.grant_privilege( + target_schema, grantees, postgres_target.grant_usage_on_schema + ) + utils.grant_privilege( + target_schema, grantees, postgres_target.grant_select_on_schema + ) return True @@ -139,7 +150,8 @@ def main_impl(): table_sync_excs = [] # Log start info - LOGGER.info(""" + LOGGER.info( + """ ------------------------------------------------------- STARTING SYNC ------------------------------------------------------- @@ -147,7 +159,11 @@ def main_impl(): Total tables selected to sync : %s Pool size : %s ------------------------------------------------------- - """, args.tables, len(args.tables), pool_size) + """, + args.tables, + len(args.tables), + pool_size, + ) # if internal arg drop_pg_slot is set to True, then we drop the slot before starting resync if args.drop_pg_slot: @@ -160,11 +176,16 @@ def main_impl(): # Start loading tables in parallel in spawning processes with multiprocessing.Pool(pool_size) as proc: table_sync_excs = list( - filter(lambda x: not isinstance(x, bool), proc.map(partial(sync_table, args=args), args.tables))) + filter( + lambda x: not isinstance(x, bool), + proc.map(partial(sync_table, args=args), args.tables), + ) + ) # Log summary end_time = datetime.now() - LOGGER.info(""" + LOGGER.info( + """ ------------------------------------------------------- SYNC FINISHED - SUMMARY ------------------------------------------------------- @@ -175,8 +196,13 @@ def main_impl(): Pool size : %s Runtime : %s ------------------------------------------------------- - """, len(args.tables), len(args.tables) - len(table_sync_excs), str(table_sync_excs), - pool_size, end_time - start_time) + """, + len(args.tables), + len(args.tables) - len(table_sync_excs), + 
str(table_sync_excs), + pool_size, + end_time - start_time, + ) if len(table_sync_excs) > 0: sys.exit(1) diff --git a/pipelinewise/fastsync/postgres_to_redshift.py b/pipelinewise/fastsync/postgres_to_redshift.py index f47a1dcba..42234f3f4 100644 --- a/pipelinewise/fastsync/postgres_to_redshift.py +++ b/pipelinewise/fastsync/postgres_to_redshift.py @@ -16,20 +16,8 @@ LOGGER = Logger().get_logger(__name__) REQUIRED_CONFIG_KEYS = { - 'tap': [ - 'host', - 'port', - 'user', - 'password' - ], - 'target': [ - 'host', - 'port', - 'user', - 'password', - 'dbname', - 's3_bucket' - ] + 'tap': ['host', 'port', 'user', 'password'], + 'target': ['host', 'port', 'user', 'password', 'dbname', 's3_bucket'], } DEFAULT_VARCHAR_LENGTH = 10000 @@ -39,7 +27,7 @@ LOCK = multiprocessing.Lock() -def tap_type_to_target_type(pg_type): +def tap_type_to_target_type(pg_type, *_): """Data type mapping from MySQL to Redshift""" return { 'char': 'CHARACTER VARYING({})'.format(DEFAULT_VARCHAR_LENGTH), @@ -84,7 +72,9 @@ def sync_table(table: str, args: Namespace) -> Union[bool, str]: try: dbname = args.tap.get('dbname') - filename = utils.gen_export_filename(tap_id=args.target.get('tap_id'), table=table) + filename = utils.gen_export_filename( + tap_id=args.target.get('tap_id'), table=table + ) filepath = os.path.join(args.temp_dir, filename) target_schema = utils.get_target_schema(args.target, table) @@ -92,7 +82,9 @@ def sync_table(table: str, args: Namespace) -> Union[bool, str]: postgres.open_connection() # Get bookmark - LSN position or Incremental Key value - bookmark = utils.get_bookmark_for_table(table, args.properties, postgres, dbname=dbname) + bookmark = utils.get_bookmark_for_table( + table, args.properties, postgres, dbname=dbname + ) # Exporting table data, get table definitions and close connection to avoid timeouts postgres.copy_table(table, filepath) @@ -108,10 +100,14 @@ def sync_table(table: str, args: Namespace) -> Union[bool, str]: # Creating temp table in Redshift redshift.drop_table(target_schema, table, is_temporary=True) - redshift.create_table(target_schema, table, redshift_columns, primary_key, is_temporary=True) + redshift.create_table( + target_schema, table, redshift_columns, primary_key, is_temporary=True + ) # Load into Redshift table - redshift.copy_to_table(s3_key, target_schema, table, size_bytes, is_temporary=True) + redshift.copy_to_table( + s3_key, target_schema, table, size_bytes, is_temporary=True + ) # Obfuscate columns redshift.obfuscate_columns(target_schema, table) @@ -147,7 +143,8 @@ def main_impl(): table_sync_excs = [] # Log start info - LOGGER.info(""" + LOGGER.info( + """ ------------------------------------------------------- STARTING SYNC ------------------------------------------------------- @@ -155,7 +152,11 @@ def main_impl(): Total tables selected to sync : %s Pool size : %s ------------------------------------------------------- - """, args.tables, len(args.tables), pool_size) + """, + args.tables, + len(args.tables), + pool_size, + ) # if internal arg drop_pg_slot is set to True, then we drop the slot before starting resync if args.drop_pg_slot: @@ -168,11 +169,16 @@ def main_impl(): # Start loading tables in parallel in spawning processes with multiprocessing.Pool(pool_size) as proc: table_sync_excs = list( - filter(lambda x: not isinstance(x, bool), proc.map(partial(sync_table, args=args), args.tables))) + filter( + lambda x: not isinstance(x, bool), + proc.map(partial(sync_table, args=args), args.tables), + ) + ) # Log summary end_time = datetime.now() - 
LOGGER.info(""" + LOGGER.info( + """ ------------------------------------------------------- SYNC FINISHED - SUMMARY ------------------------------------------------------- @@ -183,8 +189,13 @@ def main_impl(): Pool size : %s Runtime : %s ------------------------------------------------------- - """, len(args.tables), len(args.tables) - len(table_sync_excs), str(table_sync_excs), - pool_size, end_time - start_time) + """, + len(args.tables), + len(args.tables) - len(table_sync_excs), + str(table_sync_excs), + pool_size, + end_time - start_time, + ) if len(table_sync_excs) > 0: sys.exit(1) diff --git a/pipelinewise/fastsync/postgres_to_snowflake.py b/pipelinewise/fastsync/postgres_to_snowflake.py index 3d9d4eca2..1ac904515 100644 --- a/pipelinewise/fastsync/postgres_to_snowflake.py +++ b/pipelinewise/fastsync/postgres_to_snowflake.py @@ -24,7 +24,7 @@ 'user', 'password', 'dbname', - 'tap_id' # tap_id is required to generate unique replication slot names + 'tap_id', # tap_id is required to generate unique replication slot names ], 'target': [ 'account', @@ -34,14 +34,14 @@ 'warehouse', 's3_bucket', 'stage', - 'file_format' - ] + 'file_format', + ], } LOCK = multiprocessing.Lock() -def tap_type_to_target_type(pg_type): +def tap_type_to_target_type(pg_type, *_): """Data type mapping from Postgres to Snowflake""" return { 'char': 'VARCHAR', @@ -74,7 +74,7 @@ def tap_type_to_target_type(pg_type): # ARRAY is uppercase, because postgres stores it in this format in information_schema.columns.data_type 'ARRAY': 'VARIANT', 'json': 'VARIANT', - 'jsonb': 'VARIANT' + 'jsonb': 'VARIANT', }.get(pg_type, 'VARCHAR') @@ -96,14 +96,18 @@ def sync_table(table: str, args: Namespace) -> Union[bool, str]: postgres.open_connection() # Get bookmark - LSN position or Incremental Key value - bookmark = utils.get_bookmark_for_table(table, args.properties, postgres, dbname=dbname) + bookmark = utils.get_bookmark_for_table( + table, args.properties, postgres, dbname=dbname + ) # Exporting table data, get table definitions and close connection to avoid timeouts - postgres.copy_table(table, - filepath, - split_large_files=args.target.get('split_large_files'), - split_file_chunk_size_mb=args.target.get('split_file_chunk_size_mb'), - split_file_max_chunks=args.target.get('split_file_max_chunks')) + postgres.copy_table( + table, + filepath, + split_large_files=args.target.get('split_large_files'), + split_file_chunk_size_mb=args.target.get('split_file_chunk_size_mb'), + split_file_max_chunks=args.target.get('split_file_max_chunks'), + ) file_parts = glob.glob(f'{filepath}*') size_bytes = sum([os.path.getsize(file_part) for file_part in file_parts]) snowflake_types = postgres.map_column_types_to_target(table) @@ -118,14 +122,22 @@ def sync_table(table: str, args: Namespace) -> Union[bool, str]: os.remove(file_part) # Create a pattern that match all file parts by removing multipart suffix - s3_key_pattern = re.sub(r'\.part\d*$', '', s3_keys[0]) if len(s3_keys) > 0 else 'NO_FILES_TO_LOAD' + s3_key_pattern = ( + re.sub(r'\.part\d*$', '', s3_keys[0]) + if len(s3_keys) > 0 + else 'NO_FILES_TO_LOAD' + ) # Creating temp table in Snowflake snowflake.create_schema(target_schema) - snowflake.create_table(target_schema, table, snowflake_columns, primary_key, is_temporary=True) + snowflake.create_table( + target_schema, table, snowflake_columns, primary_key, is_temporary=True + ) # Load into Snowflake table - snowflake.copy_to_table(s3_key_pattern, target_schema, table, size_bytes, is_temporary=True) + snowflake.copy_to_table( + 
s3_key_pattern, target_schema, table, size_bytes, is_temporary=True + ) for s3_key in s3_keys: if archive_load_files: @@ -170,7 +182,8 @@ def main_impl(): table_sync_excs = [] # Log start info - LOGGER.info(""" + LOGGER.info( + """ ------------------------------------------------------- STARTING SYNC ------------------------------------------------------- @@ -178,7 +191,11 @@ def main_impl(): Total tables selected to sync : %s Pool size : %s ------------------------------------------------------- - """, args.tables, len(args.tables), pool_size) + """, + args.tables, + len(args.tables), + pool_size, + ) # if internal arg drop_pg_slot is set to True, then we drop the slot before starting resync if args.drop_pg_slot: @@ -187,11 +204,16 @@ def main_impl(): # Start loading tables in parallel in spawning processes with multiprocessing.Pool(pool_size) as proc: table_sync_excs = list( - filter(lambda x: not isinstance(x, bool), proc.map(partial(sync_table, args=args), args.tables))) + filter( + lambda x: not isinstance(x, bool), + proc.map(partial(sync_table, args=args), args.tables), + ) + ) # Log summary end_time = datetime.now() - LOGGER.info(""" + LOGGER.info( + """ ------------------------------------------------------- SYNC FINISHED - SUMMARY ------------------------------------------------------- @@ -202,8 +224,13 @@ def main_impl(): Pool size : %s Runtime : %s ------------------------------------------------------- - """, len(args.tables), len(args.tables) - len(table_sync_excs), str(table_sync_excs), - pool_size, end_time - start_time) + """, + len(args.tables), + len(args.tables) - len(table_sync_excs), + str(table_sync_excs), + pool_size, + end_time - start_time, + ) if len(table_sync_excs) > 0: sys.exit(1) diff --git a/pipelinewise/fastsync/s3_csv_to_bigquery.py b/pipelinewise/fastsync/s3_csv_to_bigquery.py index 859f582d9..995c90017 100644 --- a/pipelinewise/fastsync/s3_csv_to_bigquery.py +++ b/pipelinewise/fastsync/s3_csv_to_bigquery.py @@ -16,20 +16,16 @@ REQUIRED_CONFIG_KEYS = { - 'tap': [ - 'bucket', - 'start_date' - ], + 'tap': ['bucket', 'start_date'], 'target': [ - 'project_id', - ] + ], } LOCK = multiprocessing.Lock() -def tap_type_to_target_type(csv_type): +def tap_type_to_target_type(csv_type, *_): """Data type mapping from S3 csv to Bigquery""" return { @@ -38,8 +34,7 @@ def tap_type_to_target_type(csv_type): 'string': 'STRING', 'boolean': 'STRING', # The guess sometimes can be wrong, we'll use string for now. 'date': 'STRING', # The guess sometimes can be wrong, we'll use string for now. 
- - 'date_override': 'TIMESTAMP' # Column type to use when date_override defined in YAML + 'date_override': 'TIMESTAMP', # Column type to use when date_override defined in YAML }.get(csv_type, 'STRING') @@ -49,7 +44,9 @@ def sync_table(table_name: str, args: Namespace) -> Union[bool, str]: bigquery = FastSyncTargetBigquery(args.target, args.transform) try: - filename = utils.gen_export_filename(tap_id=args.target.get('tap_id'), table=table_name) + filename = utils.gen_export_filename( + tap_id=args.target.get('tap_id'), table=table_name + ) filepath = os.path.join(args.temp_dir, filename) target_schema = utils.get_target_schema(args.target, table_name) @@ -62,14 +59,23 @@ def sync_table(table_name: str, args: Namespace) -> Union[bool, str]: # Creating temp table in Bigquery bigquery.create_schema(target_schema) - bigquery.create_table(target_schema, - table_name, - bigquery_columns, - is_temporary=True, - sort_columns=True) + bigquery.create_table( + target_schema, + table_name, + bigquery_columns, + is_temporary=True, + sort_columns=True, + ) # Load into Bigquery table - bigquery.copy_to_table(filepath, target_schema, table_name, size_bytes, is_temporary=True, skip_csv_header=True) + bigquery.copy_to_table( + filepath, + target_schema, + table_name, + size_bytes, + is_temporary=True, + skip_csv_header=True, + ) os.remove(filepath) # Obfuscate columns @@ -109,7 +115,8 @@ def main_impl(): start_time = datetime.now() # Log start info - LOGGER.info(""" + LOGGER.info( + """ ------------------------------------------------------- STARTING SYNC ------------------------------------------------------- @@ -117,17 +124,26 @@ def main_impl(): Total tables selected to sync : %s Pool size : %s ------------------------------------------------------- - """, args.tables, len(args.tables), pool_size) + """, + args.tables, + len(args.tables), + pool_size, + ) # Start loading tables in parallel in spawning processes by # utilising all available Pool size with multiprocessing.Pool(pool_size) as proc: table_sync_excs = list( - filter(lambda x: not isinstance(x, bool), proc.map(partial(sync_table, args=args), args.tables))) + filter( + lambda x: not isinstance(x, bool), + proc.map(partial(sync_table, args=args), args.tables), + ) + ) # Log summary end_time = datetime.now() - LOGGER.info(""" + LOGGER.info( + """ ------------------------------------------------------- SYNC FINISHED - SUMMARY ------------------------------------------------------- @@ -138,8 +154,13 @@ def main_impl(): Pool size : %s Runtime : %s ------------------------------------------------------- - """, len(args.tables), len(args.tables) - len(table_sync_excs), - str(table_sync_excs), pool_size, end_time - start_time) + """, + len(args.tables), + len(args.tables) - len(table_sync_excs), + str(table_sync_excs), + pool_size, + end_time - start_time, + ) if len(table_sync_excs) > 0: sys.exit(1) diff --git a/pipelinewise/fastsync/s3_csv_to_postgres.py b/pipelinewise/fastsync/s3_csv_to_postgres.py index 4b8a9ed67..3ca4a5dae 100644 --- a/pipelinewise/fastsync/s3_csv_to_postgres.py +++ b/pipelinewise/fastsync/s3_csv_to_postgres.py @@ -15,22 +15,14 @@ LOGGER = Logger().get_logger(__name__) REQUIRED_CONFIG_KEYS = { - 'tap': [ - 'bucket', - 'start_date' - ], - 'target': [ - 'host', - 'port', - 'user', - 'password' - ] + 'tap': ['bucket', 'start_date'], + 'target': ['host', 'port', 'user', 'password'], } LOCK = multiprocessing.Lock() -def tap_type_to_target_type(csv_type): +def tap_type_to_target_type(csv_type, *_): """Data type mapping from S3 csv to 
Snowflake""" return { @@ -39,8 +31,7 @@ def tap_type_to_target_type(csv_type): 'string': 'CHARACTER VARYING', 'boolean': 'CHARACTER VARYING', # The guess sometimes can be wrong, we'll use varchar for now. 'date': 'CHARACTER VARYING', # The guess sometimes can be wrong, we'll use varchar for now. - - 'date_override': 'TIMESTAMP WITHOUT TIME ZONE' # Column type to use when date_override defined in YAML + 'date_override': 'TIMESTAMP WITHOUT TIME ZONE', # Column type to use when date_override defined in YAML }.get(csv_type, 'CHARACTER VARYING') @@ -50,7 +41,9 @@ def sync_table(table_name: str, args: Namespace) -> Union[bool, str]: postgres = FastSyncTargetPostgres(args.target, args.transform) try: - filename = utils.gen_export_filename(tap_id=args.target.get('tap_id'), table=table_name) + filename = utils.gen_export_filename( + tap_id=args.target.get('tap_id'), table=table_name + ) filepath = os.path.join(args.temp_dir, filename) target_schema = utils.get_target_schema(args.target, table_name) @@ -64,15 +57,24 @@ def sync_table(table_name: str, args: Namespace) -> Union[bool, str]: # Creating temp table in Postgres postgres.drop_table(target_schema, table_name, is_temporary=True) - postgres.create_table(target_schema, - table_name, - postgres_columns, - primary_key, - is_temporary=True, - sort_columns=True) + postgres.create_table( + target_schema, + table_name, + postgres_columns, + primary_key, + is_temporary=True, + sort_columns=True, + ) # Load into Postgres table - postgres.copy_to_table(filepath, target_schema, table_name, size_bytes, is_temporary=True, skip_csv_header=True) + postgres.copy_to_table( + filepath, + target_schema, + table_name, + size_bytes, + is_temporary=True, + skip_csv_header=True, + ) os.remove(filepath) # Obfuscate columns @@ -111,7 +113,8 @@ def main_impl(): start_time = datetime.now() # Log start info - LOGGER.info(""" + LOGGER.info( + """ ------------------------------------------------------- STARTING SYNC ------------------------------------------------------- @@ -119,7 +122,11 @@ def main_impl(): Total tables selected to sync : %s Pool size : %s ------------------------------------------------------- - """, args.tables, len(args.tables), pool_size) + """, + args.tables, + len(args.tables), + pool_size, + ) # Create target schemas sequentially, Postgres doesn't like it running in parallel postgres_target = FastSyncTargetPostgres(args.target, args.transform) @@ -128,11 +135,16 @@ def main_impl(): # Start loading tables in parallel in spawning processes with multiprocessing.Pool(pool_size) as proc: table_sync_excs = list( - filter(lambda x: not isinstance(x, bool), proc.map(partial(sync_table, args=args), args.tables))) + filter( + lambda x: not isinstance(x, bool), + proc.map(partial(sync_table, args=args), args.tables), + ) + ) # Log summary end_time = datetime.now() - LOGGER.info(""" + LOGGER.info( + """ ------------------------------------------------------- SYNC FINISHED - SUMMARY ------------------------------------------------------- @@ -143,8 +155,13 @@ def main_impl(): Pool size : %s Runtime : %s ------------------------------------------------------- - """, len(args.tables), len(args.tables) - len(table_sync_excs), - str(table_sync_excs), pool_size, end_time - start_time) + """, + len(args.tables), + len(args.tables) - len(table_sync_excs), + str(table_sync_excs), + pool_size, + end_time - start_time, + ) if len(table_sync_excs) > 0: sys.exit(1) diff --git a/pipelinewise/fastsync/s3_csv_to_redshift.py b/pipelinewise/fastsync/s3_csv_to_redshift.py index 
0e41f7d93..e5be651e3 100644 --- a/pipelinewise/fastsync/s3_csv_to_redshift.py +++ b/pipelinewise/fastsync/s3_csv_to_redshift.py @@ -15,24 +15,14 @@ LOGGER = Logger().get_logger(__name__) REQUIRED_CONFIG_KEYS = { - 'tap': [ - 'bucket', - 'start_date' - ], - 'target': [ - 'host', - 'port', - 'user', - 'password', - 'dbname', - 's3_bucket' - ] + 'tap': ['bucket', 'start_date'], + 'target': ['host', 'port', 'user', 'password', 'dbname', 's3_bucket'], } LOCK = multiprocessing.Lock() -def tap_type_to_target_type(csv_type): +def tap_type_to_target_type(csv_type, *_): """Data type mapping from S3 csv to Snowflake""" return { @@ -41,8 +31,7 @@ def tap_type_to_target_type(csv_type): 'string': 'CHARACTER VARYING', 'boolean': 'CHARACTER VARYING', # The guess sometimes can be wrong, we'll use varchar for now. 'date': 'CHARACTER VARYING', # The guess sometimes can be wrong, we'll use varchar for now. - - 'date_override': 'TIMESTAMP WITHOUT TIME ZONE' # Column type to use when date_override defined in YAML + 'date_override': 'TIMESTAMP WITHOUT TIME ZONE', # Column type to use when date_override defined in YAML }.get(csv_type, 'CHARACTER VARYING') @@ -52,7 +41,9 @@ def sync_table(table_name: str, args: Namespace) -> Union[bool, str]: redshift = FastSyncTargetRedshift(args.target, args.transform) try: - filename = utils.gen_export_filename(tap_id=args.target.get('tap_id'), table=table_name) + filename = utils.gen_export_filename( + tap_id=args.target.get('tap_id'), table=table_name + ) filepath = os.path.join(args.temp_dir, filename) target_schema = utils.get_target_schema(args.target, table_name) @@ -70,15 +61,24 @@ def sync_table(table_name: str, args: Namespace) -> Union[bool, str]: # Creating temp table in Redshift redshift.create_schema(target_schema) - redshift.create_table(target_schema, - table_name, - redshift_columns, - primary_key, - is_temporary=True, - sort_columns=True) + redshift.create_table( + target_schema, + table_name, + redshift_columns, + primary_key, + is_temporary=True, + sort_columns=True, + ) # Load into Redshift table - redshift.copy_to_table(s3_key, target_schema, table_name, size_bytes, is_temporary=True, skip_csv_header=True) + redshift.copy_to_table( + s3_key, + target_schema, + table_name, + size_bytes, + is_temporary=True, + skip_csv_header=True, + ) # Obfuscate columns redshift.obfuscate_columns(target_schema, table_name) @@ -116,7 +116,8 @@ def main_impl(): start_time = datetime.now() # Log start info - LOGGER.info(""" + LOGGER.info( + """ ------------------------------------------------------- STARTING SYNC ------------------------------------------------------- @@ -124,7 +125,11 @@ def main_impl(): Total tables selected to sync : %s Pool size : %s ------------------------------------------------------- - """, args.tables, len(args.tables), pool_size) + """, + args.tables, + len(args.tables), + pool_size, + ) # Create target schemas sequentially, Redshift doesn't like it running in parallel redshift = FastSyncTargetRedshift(args.target, args.transform) @@ -133,11 +138,16 @@ def main_impl(): # Start loading tables in parallel in spawning processes with multiprocessing.Pool(pool_size) as proc: table_sync_excs = list( - filter(lambda x: not isinstance(x, bool), proc.map(partial(sync_table, args=args), args.tables))) + filter( + lambda x: not isinstance(x, bool), + proc.map(partial(sync_table, args=args), args.tables), + ) + ) # Log summary end_time = datetime.now() - LOGGER.info(""" + LOGGER.info( + """ ------------------------------------------------------- SYNC FINISHED - 
SUMMARY ------------------------------------------------------- @@ -148,8 +158,13 @@ def main_impl(): Pool size : %s Runtime : %s ------------------------------------------------------- - """, len(args.tables), len(args.tables) - len(table_sync_excs), - str(table_sync_excs), pool_size, end_time - start_time) + """, + len(args.tables), + len(args.tables) - len(table_sync_excs), + str(table_sync_excs), + pool_size, + end_time - start_time, + ) if len(table_sync_excs) > 0: sys.exit(1) diff --git a/pipelinewise/fastsync/s3_csv_to_snowflake.py b/pipelinewise/fastsync/s3_csv_to_snowflake.py index 04f8a2b95..f91abdf15 100644 --- a/pipelinewise/fastsync/s3_csv_to_snowflake.py +++ b/pipelinewise/fastsync/s3_csv_to_snowflake.py @@ -16,10 +16,7 @@ REQUIRED_CONFIG_KEYS = { - 'tap': [ - 'bucket', - 'start_date' - ], + 'tap': ['bucket', 'start_date'], 'target': [ 'account', 'dbname', @@ -28,14 +25,14 @@ 'warehouse', 's3_bucket', 'stage', - 'file_format' - ] + 'file_format', + ], } LOCK = multiprocessing.Lock() -def tap_type_to_target_type(csv_type): +def tap_type_to_target_type(csv_type, *_): """Data type mapping from S3 csv to Snowflake""" return { @@ -44,8 +41,7 @@ def tap_type_to_target_type(csv_type): 'string': 'VARCHAR', 'boolean': 'VARCHAR', # The guess sometimes can be wrong, we'll use varchar for now. 'date': 'VARCHAR', # The guess sometimes can be wrong, we'll use varchar for now. - - 'date_override': 'TIMESTAMP_NTZ' # Column type to use when date_override defined in YAML + 'date_override': 'TIMESTAMP_NTZ', # Column type to use when date_override defined in YAML }.get(csv_type, 'VARCHAR') @@ -55,7 +51,9 @@ def sync_table(table_name: str, args: Namespace) -> Union[bool, str]: snowflake = FastSyncTargetSnowflake(args.target, args.transform) try: - filename = utils.gen_export_filename(tap_id=args.target.get('tap_id'), table=table_name) + filename = utils.gen_export_filename( + tap_id=args.target.get('tap_id'), table=table_name + ) filepath = os.path.join(args.temp_dir, filename) target_schema = utils.get_target_schema(args.target, table_name) @@ -73,21 +71,32 @@ def sync_table(table_name: str, args: Namespace) -> Union[bool, str]: # Creating temp table in Snowflake snowflake.create_schema(target_schema) - snowflake.create_table(target_schema, - table_name, - snowflake_columns, - primary_key, - is_temporary=True, - sort_columns=True) + snowflake.create_table( + target_schema, + table_name, + snowflake_columns, + primary_key, + is_temporary=True, + sort_columns=True, + ) # Load into Snowflake table - snowflake.copy_to_table(s3_key, target_schema, table_name, size_bytes, is_temporary=True, skip_csv_header=True) + snowflake.copy_to_table( + s3_key, + target_schema, + table_name, + size_bytes, + is_temporary=True, + skip_csv_header=True, + ) # Obfuscate columns snowflake.obfuscate_columns(target_schema, table_name) # Create target table and swap with the temp table in Snowflake - snowflake.create_table(target_schema, table_name, snowflake_columns, primary_key, sort_columns=True) + snowflake.create_table( + target_schema, table_name, snowflake_columns, primary_key, sort_columns=True + ) snowflake.swap_tables(target_schema, table_name) # Get bookmark @@ -120,7 +129,8 @@ def main_impl(): start_time = datetime.now() # Log start info - LOGGER.info(""" + LOGGER.info( + """ ------------------------------------------------------- STARTING SYNC ------------------------------------------------------- @@ -128,17 +138,26 @@ def main_impl(): Total tables selected to sync : %s Pool size : %s 
------------------------------------------------------- - """, args.tables, len(args.tables), pool_size) + """, + args.tables, + len(args.tables), + pool_size, + ) # Start loading tables in parallel in spawning processes by # utilising all available Pool size with multiprocessing.Pool(pool_size) as proc: table_sync_excs = list( - filter(lambda x: not isinstance(x, bool), proc.map(partial(sync_table, args=args), args.tables))) + filter( + lambda x: not isinstance(x, bool), + proc.map(partial(sync_table, args=args), args.tables), + ) + ) # Log summary end_time = datetime.now() - LOGGER.info(""" + LOGGER.info( + """ ------------------------------------------------------- SYNC FINISHED - SUMMARY ------------------------------------------------------- @@ -149,8 +168,13 @@ def main_impl(): Pool size : %s Runtime : %s ------------------------------------------------------- - """, len(args.tables), len(args.tables) - len(table_sync_excs), - str(table_sync_excs), pool_size, end_time - start_time) + """, + len(args.tables), + len(args.tables) - len(table_sync_excs), + str(table_sync_excs), + pool_size, + end_time - start_time, + ) if len(table_sync_excs) > 0: sys.exit(1) diff --git a/pipelinewise/logger.py b/pipelinewise/logger.py index 2a1a95cb6..2dd32e64a 100644 --- a/pipelinewise/logger.py +++ b/pipelinewise/logger.py @@ -4,6 +4,7 @@ from logging.config import fileConfig from pathlib import Path + # pylint: disable=too-few-public-methods class Logger: """PipelineWise logger class""" diff --git a/pipelinewise/utils.py b/pipelinewise/utils.py index 3746d1d09..1f3a8d6e1 100644 --- a/pipelinewise/utils.py +++ b/pipelinewise/utils.py @@ -4,7 +4,9 @@ from typing import Optional -def safe_column_name(name: Optional[str], quote_character: Optional[str]=None) -> Optional[str]: +def safe_column_name( + name: Optional[str], quote_character: Optional[str] = None +) -> Optional[str]: """ Makes column name safe by capitalizing and wrapping it in double quotes Args: diff --git a/pylintrc b/pylintrc index 15e7bc7c8..a290fcb40 100644 --- a/pylintrc +++ b/pylintrc @@ -162,7 +162,6 @@ enable=import-error, nonzero-method, t-method, setslice-method, - old-division, logging-format-truncated, logging-too-few-args, logging-too-many-args, @@ -455,10 +454,10 @@ module-rgx=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$ module-name-hint=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$ # Regular expression matching correct method names -method-rgx=[a-z_][a-z0-9_]{2,30}$ +method-rgx=[a-z_][a-z0-9_]{2,80}$ # Naming hint for method names -method-name-hint=[a-z_][a-z0-9_]{2,30}$ +method-name-hint=[a-z_][a-z0-9_]{2,80}$ # Regular expression which should only match function or class names that do # not require a docstring. 
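The utils.py hunk above only re-wraps the signature of safe_column_name; per its docstring the helper makes a column name safe by capitalizing it and wrapping it in double quotes. A minimal sketch of that documented behaviour, assuming double quotes as the default quote character (illustrative only; the function body itself is untouched by this diff and may differ in detail):

from typing import Optional

def safe_column_name(
    name: Optional[str], quote_character: Optional[str] = None
) -> Optional[str]:
    # Illustrative sketch of the documented behaviour: upper-case the
    # name and wrap it in the quote character (double quotes by default).
    if name is None:
        return None
    quote = quote_character if quote_character is not None else '"'
    return f'{quote}{name.upper()}{quote}'

# e.g. safe_column_name('order_id')      -> '"ORDER_ID"'
#      safe_column_name('order_id', '`') -> '`ORDER_ID`'

Under that assumption, existing callers that pass only a name keep getting double-quoted identifiers, while targets that quote identifiers differently (e.g. backticks) can pass the quote character explicitly.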
diff --git a/pytest.ini b/pytest.ini index 231bc76da..c8e77ebbc 100644 --- a/pytest.ini +++ b/pytest.ini @@ -4,3 +4,7 @@ filterwarnings = ignore::DeprecationWarning:botocore ignore::DeprecationWarning:ansible ignore::DeprecationWarning:tabulate + ignore::DeprecationWarning:_yaml + ignore::DeprecationWarning:messytables + ignore::DeprecationWarning:setuptools + ignore::DeprecationWarning:pandas diff --git a/scripts/check_any_file_changed.py b/scripts/check_any_file_changed.py new file mode 100644 index 000000000..6362239b1 --- /dev/null +++ b/scripts/check_any_file_changed.py @@ -0,0 +1,27 @@ +import json +import os +import sys + +import requests + + +def main(): + with open(os.environ["GITHUB_EVENT_PATH"], mode="r", encoding="utf-8") as f: + gh_event_data = json.load(f) + PR_URL = gh_event_data["pull_request"]["url"] + + with requests.get( + f"{PR_URL}/files", + headers={"Authorization": f"Bearer {os.environ['GITHUB_TOKEN']}"}, + ) as resp: + files_changed = [f["filename"] for f in resp.json()] + + for f in sys.argv[1:]: + if f in files_changed: + sys.exit(0) + + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/scripts/ci_check_no_file_changes.sh b/scripts/ci_check_no_file_changes.sh index 673a73f20..a130cebc6 100755 --- a/scripts/ci_check_no_file_changes.sh +++ b/scripts/ci_check_no_file_changes.sh @@ -42,11 +42,13 @@ REGEXES=() for CHECK in "$@" do if [[ ${CHECK} == "python" ]]; then - REGEX="(^tests\/|^pipelinewise\/|^singer-connectors\/|^setup\.py)" + REGEX="(^tests\/|^pipelinewise\/|^singer-connectors\/|^setup\.py|^Makefile)" echo "Searching for changes in python files" + elif [[ ${CHECK} == "doc" ]]; then - REGEX="(^docs\/|.circleci/publish_docs.sh)" + REGEX="(^docs\/|^scripts/publish_docs.sh)" echo "Searching for changes in documentation files" + else echo "Invalid check: \"${CHECK}\". Falling back to exiting with FAILURE code" exit 1 diff --git a/.circleci/publish_docs.sh b/scripts/publish_docs.sh similarity index 89% rename from .circleci/publish_docs.sh rename to scripts/publish_docs.sh index 77644569f..aa58025bd 100755 --- a/.circleci/publish_docs.sh +++ b/scripts/publish_docs.sh @@ -11,7 +11,7 @@ set -e # from the top level of the package and not in the subdirectory... # Shouldn't ever be an issue the way we've got this setup, and you'll # want to change it a bit to make it work with your project structure. -if [[ ! -f .circleci/config.yml ]]; then +if [[ ! -d .github/workflows ]]; then echo "This must be run from the gh_doc_automation project directory" exit 1 fi @@ -37,6 +37,7 @@ python3 -m venv ~/venv-doc . ~/venv-doc/bin/activate pip install --upgrade pip pip install sphinx sphinx-rtd-theme +pip install -e . # CD into docs, make them. If you're not using Sphinx, you'll probably # have a different build script. @@ -51,11 +52,13 @@ mv docs/_build/html ./ git stash # Checkout our gh-pages branch, remove everything but .git +echo "Checking out gh-pages branch ..." git checkout gh-pages git pull origin gh-pages # Make sure to set the credentials! You'll need these environment vars -# set in the "Environment Variables" section in Circle CI +# set in the "Environment Variables" section in CI +echo "Configuring git creds ..." git config user.email "$GH_EMAIL" > /dev/null 2>&1 git config user.name "$GH_NAME" > /dev/null 2>&1 @@ -92,13 +95,14 @@ rm -r html/ # Add everything, get ready for commit. But only do it if we're on # master. If you want to deploy on different branches, you can change # this. 
-if [[ "$CIRCLE_BRANCH" =~ ^master$|^[0-9]+\.[0-9]+\.X$ ]]; then +echo "Current branch ref: $GITHUB_REF" +if [[ "$GITHUB_REF" =~ ^refs/heads/master$|^[0-9]+\.[0-9]+\.X$ ]]; then git add --all # Make sure "|| echo" is at the end to avoid error codes when no changes to commit - git commit -m "[ci skip] publishing updated documentation..." || echo + git commit -m "[ci skip] publishing updated documentation..." || echo # We have to re-add the origin with the GH_TOKEN credentials. You - # will need this SSH key in your environment variables on Circle. + # will need this SSH key in your environment variables on CI. # Make sure you change the .git pattern at the end! git remote rm origin git remote add origin https://"$GH_NAME":"$GH_TOKEN"@github.com/transferwise/pipelinewise.git @@ -106,5 +110,5 @@ if [[ "$CIRCLE_BRANCH" =~ ^master$|^[0-9]+\.[0-9]+\.X$ ]]; then # NOW we should be able to push it git push origin gh-pages else - echo "Not on master, so won't push doc" + echo "Not on master branch, so won't push doc" fi diff --git a/setup.py b/setup.py index 4803cd0ab..c8be13e51 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,8 @@ LONG_DESCRIPTION = f.read() setup(name='pipelinewise', - version='0.34.1', + python_requires='>=3.7<3.10', + version='0.41.0', description='PipelineWise', long_description=LONG_DESCRIPTION, long_description_content_type='text/markdown', @@ -19,32 +20,34 @@ install_requires=[ 'argparse==1.4.0', 'tabulate==0.8.9', - 'PyYAML==5.4.1', - 'ansible==4.2.0', - 'Jinja2==2.11.3', - 'joblib==1.0.0', + 'PyYAML==6.0', + 'ansible-core==2.11.8', + 'Jinja2==3.0.2', + 'joblib==1.1.0', 'PyMySQL==0.7.11', 'psycopg2-binary==2.8.6', 'snowflake-connector-python[pandas]==2.4.6', - 'google-cloud-bigquery==1.24.0', + 'google-cloud-bigquery==2.31.0', 'pipelinewise-singer-python==1.*', 'singer-encodings==0.0.*', 'messytables==0.15.*', 'python-pidfile==3.0.0', - 'pre-commit==2.13.0', + 'pre-commit==2.15.0', 'pymongo>=3.10,<3.13', - 'tzlocal>=2.0,<2.2', + 'tzlocal>=2.0,<4.1', 'slackclient>=2.7,<2.10', - 'psutil==5.8.0' + 'psutil==5.8.0', + 'ujson==5.1.0', + 'dnspython==2.1.*', ], extras_require={ 'test': [ - 'pytest==6.2.4', + 'flake8==4.0.1', + 'pytest==6.2.5', 'pytest-dependency==0.4.0', - 'pytest-cov==2.12.1', - 'python-dotenv==0.18.0', - 'mock==4.0.3', - 'pylint==2.8.3', + 'pytest-cov==3.0.0', + 'python-dotenv==0.19.1', + 'pylint==2.10.2', 'unify==0.5' ] }, diff --git a/singer-connectors/tap-adwords/pre_requirements.txt b/singer-connectors/tap-adwords/pre_requirements.txt new file mode 100644 index 000000000..3258b5277 --- /dev/null +++ b/singer-connectors/tap-adwords/pre_requirements.txt @@ -0,0 +1,3 @@ +# setuptools>58.0.0 is not compatible with googleads==17.0.0 +# remove this file whenever tap-adwards upgrades googleads version +setuptools<=57.0.5 \ No newline at end of file diff --git a/singer-connectors/tap-github/requirements.txt b/singer-connectors/tap-github/requirements.txt index ed1090806..05ec4f45f 100644 --- a/singer-connectors/tap-github/requirements.txt +++ b/singer-connectors/tap-github/requirements.txt @@ -1 +1 @@ -pipelinewise-tap-github==1.0.0 +pipelinewise-tap-github==1.0.3 diff --git a/singer-connectors/tap-jira/requirements.txt b/singer-connectors/tap-jira/requirements.txt index ec8ace747..38e8dda4a 100644 --- a/singer-connectors/tap-jira/requirements.txt +++ b/singer-connectors/tap-jira/requirements.txt @@ -1 +1 @@ -tap-jira==2.0.0 +tap-jira==2.0.1 diff --git a/singer-connectors/tap-kafka/requirements.txt b/singer-connectors/tap-kafka/requirements.txt index 
493da5f1d..ba2ab2c0e 100644 --- a/singer-connectors/tap-kafka/requirements.txt +++ b/singer-connectors/tap-kafka/requirements.txt @@ -1 +1 @@ -pipelinewise-tap-kafka==4.0.0 +pipelinewise-tap-kafka==5.1.0 diff --git a/singer-connectors/tap-mongodb/requirements.txt b/singer-connectors/tap-mongodb/requirements.txt index 90593b95c..162130640 100644 --- a/singer-connectors/tap-mongodb/requirements.txt +++ b/singer-connectors/tap-mongodb/requirements.txt @@ -1 +1 @@ -pipelinewise-tap-mongodb==1.2.0 +pipelinewise-tap-mongodb==1.3.0 \ No newline at end of file diff --git a/singer-connectors/tap-postgres/requirements.txt b/singer-connectors/tap-postgres/requirements.txt index 18712a76a..9d032ea9b 100644 --- a/singer-connectors/tap-postgres/requirements.txt +++ b/singer-connectors/tap-postgres/requirements.txt @@ -1 +1 @@ -pipelinewise-tap-postgres==1.8.0 +pipelinewise-tap-postgres==1.8.2 diff --git a/singer-connectors/tap-s3-csv/requirements.txt b/singer-connectors/tap-s3-csv/requirements.txt index cefd110c4..3e27e7d03 100644 --- a/singer-connectors/tap-s3-csv/requirements.txt +++ b/singer-connectors/tap-s3-csv/requirements.txt @@ -1 +1 @@ -pipelinewise-tap-s3-csv==1.2.2 +pipelinewise-tap-s3-csv==2.0.0 diff --git a/singer-connectors/target-bigquery/requirements.txt b/singer-connectors/target-bigquery/requirements.txt index 24832ec8b..541908aee 100644 --- a/singer-connectors/target-bigquery/requirements.txt +++ b/singer-connectors/target-bigquery/requirements.txt @@ -1 +1 @@ -pipelinewise-target-bigquery==1.1.1 +pipelinewise-target-bigquery==1.2.0 diff --git a/singer-connectors/target-postgres/requirements.txt b/singer-connectors/target-postgres/requirements.txt index 6178a16aa..0c0edb51e 100644 --- a/singer-connectors/target-postgres/requirements.txt +++ b/singer-connectors/target-postgres/requirements.txt @@ -1 +1 @@ -pipelinewise-target-postgres==2.1.0 +pipelinewise-target-postgres==2.1.1 diff --git a/singer-connectors/target-s3-csv/requirements.txt b/singer-connectors/target-s3-csv/requirements.txt index 8812bfac6..72ee8686f 100644 --- a/singer-connectors/target-s3-csv/requirements.txt +++ b/singer-connectors/target-s3-csv/requirements.txt @@ -1 +1 @@ -pipelinewise-target-s3-csv==1.4.0 +pipelinewise-target-s3-csv==1.5.0 diff --git a/singer-connectors/target-snowflake/requirements.txt b/singer-connectors/target-snowflake/requirements.txt index 3db380e2a..bd53e5232 100644 --- a/singer-connectors/target-snowflake/requirements.txt +++ b/singer-connectors/target-snowflake/requirements.txt @@ -1 +1 @@ -pipelinewise-target-snowflake==1.13.1 +pipelinewise-target-snowflake==1.15.0 diff --git a/singer-connectors/transform-field/requirements.txt b/singer-connectors/transform-field/requirements.txt index 428e947f9..4bb4c2de4 100644 --- a/singer-connectors/transform-field/requirements.txt +++ b/singer-connectors/transform-field/requirements.txt @@ -1 +1 @@ -pipelinewise-transform-field==2.1.0 +pipelinewise-transform-field==2.3.0 diff --git a/tests/db/mongodb_data/all_datatypes.bson.gz b/tests/db/mongodb_data/all_datatypes.bson.gz new file mode 100644 index 000000000..a59b54a3e Binary files /dev/null and b/tests/db/mongodb_data/all_datatypes.bson.gz differ diff --git a/tests/db/tap_mongodb.sh b/tests/db/tap_mongodb.sh index 17952b3c2..3bd2410ca 100755 --- a/tests/db/tap_mongodb.sh +++ b/tests/db/tap_mongodb.sh @@ -7,6 +7,7 @@ PWD="$(dirname "$0")" TEST_DB_DATA_1=${PWD}/mongodb_data/listings.csv TEST_DB_DATA_2=${PWD}/mongodb_data/my_collection.bson.gz +TEST_DB_DATA_3=${PWD}/mongodb_data/all_datatypes.bson.gz 
echo "Building test Mongodb database..." # To run this script some environment variables must be set. @@ -35,3 +36,10 @@ mongorestore --uri ${URL} \ --drop \ --gzip \ ${TEST_DB_DATA_2} + +mongorestore --uri ${URL} \ + --db ${TAP_MONGODB_DB} \ + --collection all_datatypes \ + --drop \ + --gzip \ + ${TEST_DB_DATA_3} diff --git a/tests/db/tap_mysql_data.sql b/tests/db/tap_mysql_data.sql index 033d72aaf..deec5c03a 100644 --- a/tests/db/tap_mysql_data.sql +++ b/tests/db/tap_mysql_data.sql @@ -1733,6 +1733,44 @@ VALUES ('x', UNLOCK TABLES; +-- +-- Table structure for table `customers` +-- + +DROP TABLE IF EXISTS `customers`; +CREATE TABLE `customers` ( + id INT AUTO_INCREMENT PRIMARY KEY, + name VARCHAR(100), + phone VARCHAR(100), + email TEXT +) +ENGINE=MyISAM AUTO_INCREMENT=0 DEFAULT CHARSET=utf8; + +LOCK TABLES `customers` WRITE; +/*!40000 ALTER TABLE `customers` DISABLE KEYS */; +INSERT INTO customers (name, phone, email) VALUES ('Martelle Cristoforetti', '2071506307', 'mcristoforetti0@accuweather.com'), + ('Joelynn Cawthorne', '9764917766', 'jcawthorne1@bizjournals.com'), + ('Gerrie Gillie', '7907324372', 'ggillie2@acquirethisname.com'), + ('Sheilakathryn Muge', '7195638381', 'smuge3@umn.edu'), + ('Fedora Mellanby', '4842466494', 'fmellanby4@fema.gov'), + ('Dall Goodbourn', '4977729268', 'dgoodbourn5@dailymail.co.uk'), + ('Helaina Van Halen', '5148995067', 'hvan6@hp.com'), + ('Noelle Knight', NULL, 'nknight7@google.co.uk'), + ('Anetta Doodney', '4669478802', 'adoodney8@1688.com'), + ('Almira Clinch', '2235449869', 'aclinch9@netvibes.com'), + ('Aidan Bachnic', '8479230171', 'abachnica@msu.edu'), + ('Biddy Shirt', '8109841957', 'bshirtb@nps.gov'), + ('Robinson Matuszinski', '7587994157', 'rmatuszinskic@ed.gov'), + ('Pall Porcher', '2719539572', 'pporcherd@thetimes.co.uk'), + ('Briggs Fyall', '5431890133', 'bfyalle@a8.net'), + ('Shara Eversfield', '1796055914', 'seversfieldf@prlog.org'), + ('Abbot Scowcraft', '1658501516', 'ascowcraftg@shutterfly.com'), + ('Ransell Fardo', '9584912534', 'rfardoh@de.vu'), + ('Leonard Buche', '9391842560', 'lbuchei@netscape.com'), + ('Xylia Adnet', '6784481146', 'xadnetj@auda.org.au'); +/*!40000 ALTER TABLE `customers` ENABLE KEYS */; +UNLOCK TABLES; + /*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */; /*!40101 SET SQL_MODE=@OLD_SQL_MODE */; /*!40014 SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS */; diff --git a/tests/db/tap_mysql_db.sh b/tests/db/tap_mysql_db.sh index 60d07d3c7..d00d5a5e3 100755 --- a/tests/db/tap_mysql_db.sh +++ b/tests/db/tap_mysql_db.sh @@ -22,12 +22,56 @@ fi # Grant Replication client and replication slave privileges that # requires for LOG_BASED CDC replication -export MYSQL_PWD=${TAP_MYSQL_ROOT_PASSWORD} -mysql --protocol TCP --host ${TAP_MYSQL_HOST} --port ${TAP_MYSQL_PORT} --user root -e "GRANT REPLICATION CLIENT, REPLICATION SLAVE ON *.* TO ${TAP_MYSQL_USER}" -# Grant insert privileges for testing -mysql --protocol TCP --host ${TAP_MYSQL_HOST} --port ${TAP_MYSQL_PORT} --user root -e "GRANT INSERT ON *.* TO ${TAP_MYSQL_USER}" +echo "SETTING UP MYSQL PRIMARY SERVER FOR REPLICATION" + +mysql --protocol TCP \ +--host ${TAP_MYSQL_HOST} \ +--port ${TAP_MYSQL_PORT} \ +--user root \ +--password=${TAP_MYSQL_ROOT_PASSWORD} \ +-e "CREATE DATABASE IF NOT EXISTS ${TAP_MYSQL_REPLICA_DB}; GRANT ALL PRIVILEGES ON ${TAP_MYSQL_REPLICA_DB}.* TO ${TAP_MYSQL_USER}; GRANT REPLICATION CLIENT, REPLICATION SLAVE ON *.* TO ${TAP_MYSQL_USER}; FLUSH PRIVILEGES;" + +echo "SET UP MYSQL REPLICA SERVER FOR REPLICATION" + +mysql --protocol TCP \ +--host 
${TAP_MYSQL_REPLICA_HOST} \ +--port ${TAP_MYSQL_REPLICA_PORT} \ +--user root \ +--password=${TAP_MYSQL_REPLICA_ROOT_PASSWORD} \ +-e "GRANT REPLICATION CLIENT, REPLICATION SLAVE ON *.* TO ${TAP_MYSQL_REPLICA_USER}; GRANT SUPER ON *.* TO ${TAP_MYSQL_REPLICA_USER}; FLUSH PRIVILEGES;" + +echo "GETTING MYSQL PRIMARY SERVER LOG INFO" + +MASTER_LOG_STATUS=`mysql --protocol TCP --host ${TAP_MYSQL_HOST} --port ${TAP_MYSQL_PORT} --user root --password=${TAP_MYSQL_ROOT_PASSWORD} -e "SHOW MASTER STATUS;"` +CURRENT_LOG=`echo $MASTER_LOG_STATUS | awk '{print $5}'` +CURRENT_POS=`echo $MASTER_LOG_STATUS | awk '{print $6}'` + +echo "STARTING MYSQL REPLICATION" + +mysql --protocol TCP \ +--host=${TAP_MYSQL_REPLICA_HOST} \ +--port ${TAP_MYSQL_REPLICA_PORT} \ +--user ${TAP_MYSQL_REPLICA_USER} \ +--password=${TAP_MYSQL_REPLICA_PASSWORD} \ +-e "STOP SLAVE; CHANGE MASTER TO MASTER_HOST='${TAP_MYSQL_HOST}',MASTER_USER='${TAP_MYSQL_USER}',MASTER_PASSWORD='${TAP_MYSQL_PASSWORD}',MASTER_LOG_FILE='${CURRENT_LOG}',MASTER_LOG_POS=${CURRENT_POS}; START SLAVE;" # Download the sample database and build it -export MYSQL_PWD=${TAP_MYSQL_PASSWORD} -mysql --protocol TCP --host ${TAP_MYSQL_HOST} --port ${TAP_MYSQL_PORT} --user ${TAP_MYSQL_USER} ${TAP_MYSQL_DB} < ${TEST_DB_SQL} \ No newline at end of file + +echo "DUMPING DATA INTO PRIMARY MYSQL DATABASE" + +mysql --protocol TCP \ +--host ${TAP_MYSQL_HOST} \ +--port ${TAP_MYSQL_PORT} \ +--user ${TAP_MYSQL_USER} \ +--password=${TAP_MYSQL_PASSWORD} \ +${TAP_MYSQL_DB} < ${TEST_DB_SQL} + +echo "DUMPING DATA INTO PRIMARY MYSQL DATABASE2" + +mysql --protocol TCP \ +--host ${TAP_MYSQL_HOST} \ +--port ${TAP_MYSQL_PORT} \ +--user ${TAP_MYSQL_USER} \ +--password=${TAP_MYSQL_PASSWORD} \ +${TAP_MYSQL_REPLICA_DB} < ${TEST_DB_SQL} diff --git a/tests/db/tap_postgres_data.sql b/tests/db/tap_postgres_data.sql index 44a2bdc60..1c075e32b 100644 --- a/tests/db/tap_postgres_data.sql +++ b/tests/db/tap_postgres_data.sql @@ -20,39 +20,40 @@ CREATE TABLE edgydata( cjson json, cjsonb jsonb, cvarchar varchar, + "date" date, PRIMARY KEY (cid) ); -insert into edgydata (ctimentz, ctimetz, cjson, cjsonb, cvarchar) values - (null, null, null, null, null), - ('23:00:15', '23:00:15+00', null, null, null), - ('12:00:15', '12:00:15+00:00', null, null, null), - ('12:00:15', '12:00:15+0300', null, null, null), - ('12:00:15', '12:00:15-0300', null, null, null), - ('24:00:00', '24:00:00', null, null, null), - ('24:00:00', '24:00:00+0000', null, null, null), - ('24:00:00', '24:00:00-0100', null, null, null), - ('00:00:00', '00:00:00', null, null, null), - (null, null, null, null,'Lorem ipsum dolor sit amet'), - (null, null, null, null,'Chinese: 和毛泽东 <<重上井冈山>>. 
严永欣, 一九八八年.'), - (null, null, null, null,'Thai: แผ่นดินฮั่นเสื่อมโทรมแสนสังเวช'), - (null, null, null, null,E'Special Characters: ["/\\,!@£$%^&*()]'), - (null, null, '[]', '[]', '[]' ), - (null, null, '{}', '{}', '{}'), - (null, null, '[{}, {}]', '[{}, {}]', '[{}, {}]'), +insert into edgydata (ctimentz, ctimetz, cjson, cjsonb, cvarchar, "date") values + (null, null, null, null, null, DATE '20107-05-28'), + ('23:00:15', '23:00:15+00', null, null, null, DATE '2011-09-10'), + ('12:00:15', '12:00:15+00:00', null, null, null, DATE '2019-12-10'), + ('12:00:15', '12:00:15+0300', null, null, null, DATE '0001-09-10'), + ('12:00:15', '12:00:15-0300', null, null, null, DATE '1990-09-30'), + ('24:00:00', '24:00:00', null, null, null, null), + ('24:00:00', '24:00:00+0000', null, null, null, DATE '333333-09-30'), + ('24:00:00', '24:00:00-0100', null, null, null, DATE '1990-09-30'), + ('00:00:00', '00:00:00', null, null, null, DATE '2021-01-30'), + (null, null, null, null,'Lorem ipsum dolor sit amet', DATE '2021-01-30'), + (null, null, null, null,'Chinese: 和毛泽东 <<重上井冈山>>. 严永欣, 一九八八年.', DATE '2021-01-30'), + (null, null, null, null,'Thai: แผ่นดินฮั่นเสื่อมโทรมแสนสังเวช', DATE '2021-01-30'), + (null, null, null, null,E'Special Characters: ["/\\,!@£$%^&*()]', DATE '2021-01-30'), + (null, null, '[]', '[]', '[]' , DATE '2021-01-30'), + (null, null, '{}', '{}', '{}', DATE '2021-01-30'), + (null, null, '[{}, {}]', '[{}, {}]', '[{}, {}]', DATE '2021-01-30'), (null, null, '[{"key": "ValueOne", "actions": []}, {"key": "ValueTwo", "actions": []}]', '[{"key": "ValueOne", "actions": []}, {"key": "ValueTwo", "actions": []}]', - '[{"key": "ValueOne", "actions": []}, {"key": "ValueTwo", "actions": []}]'), - (null, null, E'{"key": "Value\'s One"}', E'{"key": "Value\'s One"}', E'{"key": "Value\'s One"}'), + '[{"key": "ValueOne", "actions": []}, {"key": "ValueTwo", "actions": []}]', DATE '2021-01-30'), + (null, null, E'{"key": "Value\'s One"}', E'{"key": "Value\'s One"}', E'{"key": "Value\'s One"}', DATE '2021-01-30'), (null, null, E'[{"key": "Value\'s One", "actions": []},{"key": "Value\U00000027s Two", "actions": []}]', E'[{"key": "Value\'s One", "actions": []},{"key": "Value\U00000027s Two", "actions": []}]', - E'[{"key": "Value\'s One", "actions": []},{"key": "Value\U00000027s Two", "actions": []}]'), - (null, null, null, null,' '), + E'[{"key": "Value\'s One", "actions": []},{"key": "Value\U00000027s Two", "actions": []}]', DATE '2021-01-30'), + (null, null, null, null,' ', DATE '2021-01-30'), (null, null, null, null,'Enter The -Ninja'), +Ninja', DATE '2021-01-30'), (null, null, null, null,'Liewe -Maatjies'), - (null, null, null, null,'Liewe Maatjies') +Maatjies', DATE '2021-01-30'), + (null, null, null, null,'Liewe Maatjies', DATE '2021-01-30') ; COMMIT; @@ -4528,3 +4529,33 @@ values ('A', '2020-01-01 08:53:56.112248-07'), ('H', '1000-03-03 10:30:00 BC'), ('I', '50000-03-03 10:30:00') ; + +DROP TABLE IF EXISTS public.customers CASCADE; + +CREATE TABLE customers ( + id serial primary key, + name varchar(100) NOT NULL, + phone varchar(10), + email text +); + +INSERT INTO customers (name, phone, email) VALUES ('Martelle Cristoforetti', '2071506307', 'mcristoforetti0@accuweather.com'), + ('Joelynn Cawthorne', '9764917766', 'jcawthorne1@bizjournals.com'), + ('Gerrie Gillie', '7907324372', 'ggillie2@acquirethisname.com'), + ('Sheilakathryn Muge', '7195638381', 'smuge3@umn.edu'), + ('Fedora Mellanby', '4842466494', 'fmellanby4@fema.gov'), + ('Dall Goodbourn', '4977729268', 'dgoodbourn5@dailymail.co.uk'), + ('Helaina Van 
Halen', '5148995067', 'hvan6@hp.com'), + ('Noelle Knight', NULL, 'nknight7@google.co.uk'), + ('Anetta Doodney', '4669478802', 'adoodney8@1688.com'), + ('Almira Clinch', '2235449869', 'aclinch9@netvibes.com'), + ('Aidan Bachnic', '8479230171', 'abachnica@msu.edu'), + ('Biddy Shirt', '8109841957', 'bshirtb@nps.gov'), + ('Robinson Matuszinski', '7587994157', 'rmatuszinskic@ed.gov'), + ('Pall Porcher', '2719539572', 'pporcherd@thetimes.co.uk'), + ('Briggs Fyall', '5431890133', 'bfyalle@a8.net'), + ('Shara Eversfield', '1796055914', 'seversfieldf@prlog.org'), + ('Abbot Scowcraft', '1658501516', 'ascowcraftg@shutterfly.com'), + ('Ransell Fardo', '9584912534', 'rfardoh@de.vu'), + ('Leonard Buche', '9391842560', 'lbuchei@netscape.com'), + ('Xylia Adnet', '6784481146', 'xadnetj@auda.org.au'); diff --git a/tests/end_to_end/helpers/assertions.py b/tests/end_to_end/helpers/assertions.py index 8cf316b8b..ce989b613 100644 --- a/tests/end_to_end/helpers/assertions.py +++ b/tests/end_to_end/helpers/assertions.py @@ -26,7 +26,9 @@ def assert_run_tap_success(tap, target, sync_engines, profiling=False): assert_state_file_valid(target, tap, log_file) if profiling: - assert_profiling_stats_files_created(stdout, 'run_tap', sync_engines, tap, target) + assert_profiling_stats_files_created( + stdout, 'run_tap', sync_engines, tap, target + ) def assert_resync_tables_success(tap, target, profiling=False): @@ -45,7 +47,9 @@ def assert_resync_tables_success(tap, target, profiling=False): assert_state_file_valid(target, tap, log_file) if profiling: - assert_profiling_stats_files_created(stdout, 'sync_tables', ['fastsync'], tap, target) + assert_profiling_stats_files_created( + stdout, 'sync_tables', ['fastsync'], tap, target + ) def assert_command_success(return_code, stdout, stderr, log_path=None): @@ -57,7 +61,7 @@ def assert_command_success(return_code, stdout, stderr, log_path=None): failed_log_path = f'{log_path}.failed' # Load failed log file if exists if os.path.isfile(failed_log_path): - with open(failed_log_path, 'r') as file: + with open(failed_log_path, 'r', encoding='utf-8') as file: failed_log = file.read() print(f'STDOUT: {stdout}\nSTDERR: {stderr}\nFAILED LOG: {failed_log}') @@ -74,25 +78,32 @@ def assert_command_success(return_code, stdout, stderr, log_path=None): def assert_state_file_valid(target_name, tap_name, log_path=None): """Assert helper function to check if state file exists for a certain tap for a certain target""" - state_file = Path(f'{Path.home()}/.pipelinewise/{target_name}/{tap_name}/state.json').resolve() + state_file = Path( + f'{Path.home()}/.pipelinewise/{target_name}/{tap_name}/state.json' + ).resolve() assert os.path.isfile(state_file) # Check if state file content equals to last emitted state in log if log_path: success_log_path = f'{log_path}.success' state_in_log = None - with open(success_log_path, 'r') as log_f: - state_log_pattern = re.search(r'\nINFO STATE emitted from target: (.+\n)', '\n'.join(log_f.readlines())) + with open(success_log_path, 'r', encoding='utf-8') as log_f: + state_log_pattern = re.search( + r'\nINFO STATE emitted from target: (.+\n)', + '\n'.join(log_f.readlines()), + ) if state_log_pattern: state_in_log = state_log_pattern.groups()[-1] # If the emitted state message exists in the log then compare it to the actual state file if state_in_log: - with open(state_file, 'r') as state_f: + with open(state_file, 'r', encoding='utf-8') as state_f: assert state_in_log == ''.join(state_f.readlines()) -def assert_cols_in_table(query_runner_fn: callable, 
table_schema: str, table_name: str, columns: List[str]): +def assert_cols_in_table( + query_runner_fn: callable, table_schema: str, table_name: str, columns: List[str], schema_postfix: str = '' +): """Fetches the given table's columns from information_schema and tests if every given column is in the result @@ -100,16 +111,23 @@ def assert_cols_in_table(query_runner_fn: callable, table_schema: str, table_nam :param table_schema: search table in this schema :param table_name: table with the columns :param columns: list of columns to check if there are in the table's columns + :param schema_postfix: schema postfix for snowflake target """ - funcs = _map_tap_to_target_functions(None, query_runner_fn) - sql_get_columns_for_table_fn = funcs.get('target_sql_get_table_cols_fn', db.sql_get_columns_for_table) + funcs = _map_tap_to_target_functions(None, query_runner_fn, schema_postfix) + sql_get_columns_for_table_fn = funcs.get( + 'target_sql_get_table_cols_fn', db.sql_get_columns_for_table + ) sql = sql_get_columns_for_table_fn(table_schema, table_name) result = query_runner_fn(sql) cols = [res[0] for res in result] try: assert all(col in cols for col in columns) except AssertionError as ex: - ex.args += ('Error', columns, f'One ore more columns not found in target table {table_name}') + ex.args += ( + 'Error', + columns, + f'One or more columns not found in target table {table_name}', + ) raise @@ -118,7 +136,9 @@ def _run_sql(query_runner_fn: callable, sql_query: str) -> List: return list(query_runner_fn(sql_query)) -def _map_tap_to_target_functions(tap_query_runner_fn: callable, target_query_runner_fn: callable) -> dict: +def _map_tap_to_target_functions( + tap_query_runner_fn: callable, target_query_runner_fn: callable, schema_postfix: str = '' +) -> dict: """Takes two query runner methods and creates a map with the compatible database specific functions that required to run assertions. 
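For readers skimming the diff: a minimal, self-contained sketch of the dispatch pattern that _map_tap_to_target_functions relies on, using hypothetical stand-in runner and helper names (the real map also carries per-database SQL builders). The map is keyed by the query runner's __name__, and the new schema_postfix argument is appended to the target schema names so parallel test runs do not collide.

from typing import Callable, Dict, List


def _example_function_map(
    tap_query_runner_fn: Callable, target_query_runner_fn: Callable, schema_postfix: str = ''
) -> Dict:
    """Illustrative only: merge tap- and target-specific settings keyed by runner __name__."""
    f_map = {
        'run_query_tap_mysql': {
            'source_schemas': ['mysql_source_db'],
            # the postfix keeps concurrent Snowflake E2E runs from clashing on schema names
            'target_schemas': [f'ppw_e2e_tap_mysql{schema_postfix}'],
        },
        'run_query_target_snowflake': {
            'target_sql_get_cols_fn': lambda schemas: f'-- columns SQL for {schemas}',
        },
    }
    return {**f_map[tap_query_runner_fn.__name__], **f_map[target_query_runner_fn.__name__]}


def run_query_tap_mysql(sql: str) -> List:  # hypothetical stand-in runner
    return []


def run_query_target_snowflake(sql: str) -> List:  # hypothetical stand-in runner
    return []


funcs = _example_function_map(run_query_tap_mysql, run_query_target_snowflake, '_ab12cd34')
print(funcs['target_schemas'])  # ['ppw_e2e_tap_mysql_ab12cd34']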
@@ -130,21 +150,29 @@ def _map_tap_to_target_functions(tap_query_runner_fn: callable, target_query_run # tap-mysql specific attributes and functions 'run_query_tap_mysql': { 'source_schemas': ['mysql_source_db'], - 'target_schemas': ['ppw_e2e_tap_mysql'], + 'target_schemas': [f'ppw_e2e_tap_mysql{schema_postfix}'], 'source_sql_get_cols_fn': db.sql_get_columns_mysql, - 'source_sql_dynamic_row_count_fn': db.sql_dynamic_row_count_mysql + 'source_sql_dynamic_row_count_fn': db.sql_dynamic_row_count_mysql, + }, + # tap-mysql specific attributes and functions + 'run_query_tap_mysql_2': { + 'source_schemas': ['mysql_source_db_2'], + 'target_schemas': [f'ppw_e2e_tap_mysql_2{schema_postfix}'], + 'source_sql_get_cols_fn': db.sql_get_columns_mysql, + 'source_sql_dynamic_row_count_fn': db.sql_dynamic_row_count_mysql, }, # tap-postgres specific attributes and functions 'run_query_tap_postgres': { 'source_schemas': ['public', 'public2'], - 'target_schemas': ['ppw_e2e_tap_postgres', 'ppw_e2e_tap_postgres_public2'], + 'target_schemas': [f'ppw_e2e_tap_postgres{schema_postfix}', + f'ppw_e2e_tap_postgres_public2{schema_postfix}'], 'source_sql_get_cols_fn': db.sql_get_columns_postgres, - 'source_sql_dynamic_row_count_fn': db.sql_dynamic_row_count_postgres + 'source_sql_dynamic_row_count_fn': db.sql_dynamic_row_count_postgres, }, # target-postgres specific attributes and functions 'run_query_target_postgres': { 'target_sql_get_cols_fn': db.sql_get_columns_postgres, - 'target_sql_dynamic_row_count_fn': db.sql_dynamic_row_count_postgres + 'target_sql_dynamic_row_count_fn': db.sql_dynamic_row_count_postgres, }, # target-snowflake specific attributes and functions 'run_query_target_snowflake': { @@ -162,35 +190,49 @@ def _map_tap_to_target_functions(tap_query_runner_fn: callable, target_query_run 'run_query_target_redshift': { 'target_sql_get_cols_fn': db.sql_get_columns_redshift, 'target_sql_dynamic_row_count_fn': db.sql_dynamic_row_count_redshift, - } + }, } # Merge the keys into one dict by tap and target query runner names if tap_query_runner_fn: - return {**f_map[tap_query_runner_fn.__name__], **f_map[target_query_runner_fn.__name__]} + return { + **f_map[tap_query_runner_fn.__name__], + **f_map[target_query_runner_fn.__name__], + } return {**f_map[target_query_runner_fn.__name__]} -def assert_row_counts_equal(tap_query_runner_fn: callable, target_query_runner_fn: callable) -> None: +def assert_row_counts_equal( + tap_query_runner_fn: callable, target_query_runner_fn: callable, schema_postfix: str = '' +) -> None: """Takes two query runner methods, counts the row numbers in every table in both the source and target databases and tests if the row counts are matching. 
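assert_row_counts_equal now threads the same schema_postfix through to the function map. Stripped of the SQL plumbing, the check reduces to comparing per-table (name, count) pairs from the two runners, roughly as in this hedged sketch; the lambda runners and table data below are made up, and the real helper generates the counting SQL dynamically per database engine.

from typing import Callable, List, Tuple


def _rows_match_example(
    tap_query_runner_fn: Callable[[str], List[Tuple[str, int]]],
    target_query_runner_fn: Callable[[str], List[Tuple[str, int]]],
    source_sql: str,
    target_sql: str,
) -> None:
    """Illustrative only: both runners return (table_name, row_count) pairs."""
    source_counts = sorted(tap_query_runner_fn(source_sql))
    target_counts = sorted(target_query_runner_fn(target_sql))
    assert source_counts == target_counts, f'{source_counts} != {target_counts}'


# hypothetical in-memory runners standing in for real database connections
_rows_match_example(
    lambda sql: [('customers', 20), ('edgydata', 23)],
    lambda sql: [('edgydata', 23), ('customers', 20)],
    'SELECT ... FROM source',
    'SELECT ... FROM target',
)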
:param tap_query_runner_fn: method to run queries in the first connection - :param target_query_runner_fn: method to run queries in the second connection""" + :param target_query_runner_fn: method to run queries in the second connection + :param schema_postfix: schema postfix for snowflake target""" # Generate a map of source and target specific functions - funcs = _map_tap_to_target_functions(tap_query_runner_fn, target_query_runner_fn) + funcs = _map_tap_to_target_functions(tap_query_runner_fn, target_query_runner_fn, schema_postfix) # Get source and target schemas source_schemas = funcs['source_schemas'] target_schemas = funcs['target_schemas'] # Generate a dynamic SQLs to count rows in source and target databases - source_dynamic_sql_row_count = funcs['source_sql_dynamic_row_count_fn'](source_schemas) - target_dynamic_sql_row_count = funcs['target_sql_dynamic_row_count_fn'](target_schemas) + source_dynamic_sql_row_count = funcs['source_sql_dynamic_row_count_fn']( + source_schemas + ) + target_dynamic_sql_row_count = funcs['target_sql_dynamic_row_count_fn']( + target_schemas + ) # Count rows - source_sql_row_count = _run_sql(tap_query_runner_fn, source_dynamic_sql_row_count)[0][0] - target_sql_row_count = _run_sql(target_query_runner_fn, target_dynamic_sql_row_count)[0][0] + source_sql_row_count = _run_sql(tap_query_runner_fn, source_dynamic_sql_row_count)[ + 0 + ][0] + target_sql_row_count = _run_sql( + target_query_runner_fn, target_dynamic_sql_row_count + )[0][0] # Run the generated SQLs row_counts_in_source = _run_sql(tap_query_runner_fn, source_sql_row_count) @@ -200,11 +242,8 @@ def assert_row_counts_equal(tap_query_runner_fn: callable, target_query_runner_f # we fix that by renaming the source tables to names that the target would accept if 'target_sql_safe_name_fn' in funcs: row_counts_in_source = [ - ( - funcs['target_sql_safe_name_fn'](table), - row_count - ) - for (table,row_count) in row_counts_in_source + (funcs['target_sql_safe_name_fn'](table), row_count) + for (table, row_count) in row_counts_in_source ] # Compare the two dataset @@ -212,10 +251,13 @@ def assert_row_counts_equal(tap_query_runner_fn: callable, target_query_runner_f # pylint: disable=too-many-locals -def assert_all_columns_exist(tap_query_runner_fn: callable, - target_query_runner_fn: callable, - column_type_mapper_fn: callable = None, - ignore_cols: Union[Set, List] = None) -> None: +def assert_all_columns_exist( + tap_query_runner_fn: callable, + target_query_runner_fn: callable, + column_type_mapper_fn: callable = None, + ignore_cols: Union[Set, List] = None, + schema_postfix: str = '', +) -> None: """Takes two query runner methods, gets the columns list for every table in both the source and target database and tests if every column in source exists in the target database. 
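assert_all_columns_exist likewise gains the schema postfix. Conceptually the column check is a subset-plus-type comparison, sketched below with hypothetical column dictionaries and type mapper; the real helper first lowercases and parses the name:type:type_extra strings it gets back from information_schema.

from typing import Callable, Dict, Optional, Set


def _columns_exist_example(
    source_cols: Dict[str, str],
    target_cols: Dict[str, str],
    ignore_cols: Optional[Set[str]] = None,
    column_type_mapper_fn: Optional[Callable[[str], str]] = None,
) -> None:
    """Illustrative only: every non-ignored source column must exist (and optionally match type) in the target."""
    for name, source_type in source_cols.items():
        if ignore_cols and name in ignore_cols:
            continue
        assert name in target_cols, f'{name} column not found in target'
        if column_type_mapper_fn:
            expected = column_type_mapper_fn(source_type).lower()
            actual = target_cols[name].lower()
            assert actual == expected, f'{name}: expected {expected}, got {actual}'


# hypothetical source/target metadata; keys assumed already lowercased
_columns_exist_example(
    source_cols={'id': 'int', 'cjson': 'json'},
    target_cols={'id': 'number', 'cjson': 'variant'},
    ignore_cols=None,
    column_type_mapper_fn={'int': 'number', 'json': 'variant'}.get,
)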
Some taps have unsupported column types and these are not part of the schemas published to the target thus @@ -224,9 +266,10 @@ def assert_all_columns_exist(tap_query_runner_fn: callable, :param tap_query_runner_fn: method to run queries in the first connection :param target_query_runner_fn: method to run queries in the second connection :param column_type_mapper_fn: method to convert source to target column types - :param ignore_cols: List or set of columns to ignore if we know target table won't have them""" + :param ignore_cols: List or set of columns to ignore if we know target table won't have them + :param schema_postfix: Schema postfix for Snowflake targets""" # Generate a map of source and target specific functions - funcs = _map_tap_to_target_functions(tap_query_runner_fn, target_query_runner_fn) + funcs = _map_tap_to_target_functions(tap_query_runner_fn, target_query_runner_fn, schema_postfix) # Get source and target schemas source_schemas = funcs['source_schemas'] @@ -251,10 +294,7 @@ def _cols_list_to_dict(cols: List) -> dict: cols_dict = {} for col in cols: col_props = col.split(':') - cols_dict[col_props[0]] = { - 'type': col_props[1], - 'type_extra': col_props[2] - } + cols_dict[col_props[0]] = {'type': col_props[1], 'type_extra': col_props[2]} return cols_dict @@ -270,7 +310,11 @@ def _cols_list_to_dict(cols: List) -> dict: source_cols = table_cols[1].lower().split(';') try: - target_cols = next(t[1] for t in target_table_cols if t[0].lower() == table_to_check).lower().split(';') + target_cols = ( + next(t[1] for t in target_table_cols if t[0].lower() == table_to_check) + .lower() + .split(';') + ) except StopIteration as ex: ex.args += ('Error', f'{table_to_check} table not found in target') raise @@ -287,25 +331,38 @@ def _cols_list_to_dict(cols: List) -> dict: try: assert col_name in target_cols_dict except AssertionError as ex: - ex.args += ('Error', f'{col_name} column not found in target table {table_to_check}') + ex.args += ( + 'Error', + f'{col_name} column not found in target table {table_to_check}', + ) raise # Check if column type is expected in the target table, if mapper function provided if column_type_mapper_fn: try: target_col = target_cols_dict[col_name] - exp_col_type = column_type_mapper_fn(col_props['type'], col_props['type_extra']) \ - .replace(' NULL', '').lower() + exp_col_type = ( + column_type_mapper_fn( + col_props['type'], col_props['type_extra'] + ) + .replace(' NULL', '') + .lower() + ) act_col_type = target_col['type'].lower() assert act_col_type == exp_col_type except AssertionError as ex: - ex.args += ('Error', f'{col_name} column type is not as expected. ' - f'Expected: {exp_col_type} ' - f'Actual: {act_col_type}') + ex.args += ( + 'Error', + f'{col_name} column type is not as expected. 
' + f'Expected: {exp_col_type} ' + f'Actual: {act_col_type}', + ) raise -def assert_date_column_naive_in_target(target_query_runner_fn, column_name, full_table_name): +def assert_date_column_naive_in_target( + target_query_runner_fn, column_name, full_table_name +): """ Checks if all dates in the given column are naive,i.e no timezone Args: @@ -313,19 +370,20 @@ def assert_date_column_naive_in_target(target_query_runner_fn, column_name, full column_name: column of timestamp type full_table_name: fully qualified table name """ - dates = target_query_runner_fn( - f'SELECT {column_name} FROM {full_table_name};') + dates = target_query_runner_fn(f'SELECT {column_name} FROM {full_table_name};') for date in dates: if date[0] is not None: assert date[0].tzinfo is None -def assert_profiling_stats_files_created(stdout: str, - command: str, - sync_engines: List = None, - tap: Union[str, List[str]] = None, - target: str = None): +def assert_profiling_stats_files_created( + stdout: str, + command: str, + sync_engines: List = None, + tap: Union[str, List[str]] = None, + target: str = None, +): """ Asserts that profiling pstat files were created by checking their existence Args: @@ -339,7 +397,10 @@ def assert_profiling_stats_files_created(stdout: str, profiler_dir = tasks.find_profiling_folder(stdout) # crawl the folder looking for pstat files and strip the folder name from the file name - pstat_files = {file[len(f'{profiler_dir}/'):] for file in glob.iglob(f'{profiler_dir}/*.pstat')} + pstat_files = { + file[len(f'{profiler_dir}/'):] + for file in glob.iglob(f'{profiler_dir}/*.pstat') + } assert f'pipelinewise_{command}.pstat' in pstat_files diff --git a/tests/end_to_end/helpers/db.py b/tests/end_to_end/helpers/db.py index be363efe1..700b58305 100644 --- a/tests/end_to_end/helpers/db.py +++ b/tests/end_to_end/helpers/db.py @@ -11,15 +11,14 @@ from pipelinewise.fastsync.commons.target_bigquery import safe_name + # pylint: disable=too-many-arguments def run_query_postgres(query, host, port, user, password, database): """Run and SQL query in a postgres database""" result_rows = [] - with psycopg2.connect(host=host, - port=port, - user=user, - password=password, - database=database) as conn: + with psycopg2.connect( + host=host, port=port, user=user, password=password, database=database + ) as conn: conn.set_session(autocommit=True) with conn.cursor() as cur: cur.execute(query) @@ -31,13 +30,15 @@ def run_query_postgres(query, host, port, user, password, database): def run_query_mysql(query, host, port, user, password, database): """Run and SQL query in a mysql database""" result_rows = [] - with pymysql.connect(host=host, - port=port, - user=user, - password=password, - database=database, - charset='utf8mb4', - cursorclass=pymysql.cursors.Cursor) as cur: + with pymysql.connect( + host=host, + port=port, + user=user, + password=password, + database=database, + charset='utf8mb4', + cursorclass=pymysql.cursors.Cursor, + ) as cur: cur.execute(query) if cur.rowcount > 0: result_rows = cur.fetchall() @@ -47,12 +48,14 @@ def run_query_mysql(query, host, port, user, password, database): def run_query_snowflake(query, account, database, warehouse, user, password): """Run and SQL query in a snowflake database""" result_rows = [] - with snowflake.connector.connect(account=account, - database=database, - warehouse=warehouse, - user=user, - password=password, - autocommit=True) as conn: + with snowflake.connector.connect( + account=account, + database=database, + warehouse=warehouse, + user=user, + password=password, 
+ autocommit=True, + ) as conn: with conn.cursor() as cur: cur.execute(query) if cur.rowcount > 0: @@ -70,12 +73,13 @@ def delete_dataset_bigquery(dataset, project): client = bigquery.Client(project=project) client.delete_dataset(dataset, delete_contents=True, not_found_ok=True) + def run_query_bigquery(query, project): """Run and SQL query in a BigQuery database""" client = bigquery.Client(project=project) query_job = client.query(query) - query_job.result() - return [r.values() for r in query_job] + return [r.values() for r in query_job.result()] + def run_query_redshift(query, host, port, user, password, database): """Redshift is compatible with postgres""" @@ -152,9 +156,12 @@ def sql_get_columns_snowflake(schemas: list) -> str: def sql_get_columns_bigquery(schemas: list) -> str: """Generates an SQL command that gives the list of columns of every table in a specific schema from a snowflake database""" - table_queries = ' UNION ALL '.join(f""" + table_queries = ' UNION ALL '.join( + f""" SELECT table_name, column_name, data_type - FROM `{schema}`.INFORMATION_SCHEMA.COLUMNS""" for schema in schemas) + FROM `{schema}`.INFORMATION_SCHEMA.COLUMNS""" + for schema in schemas + ) return f""" SELECT table_name, STRING_AGG(CONCAT(column_name, ':', data_type, ':'), ';' ORDER BY column_name) @@ -254,10 +261,13 @@ def sql_dynamic_row_count_snowflake(schemas: list) -> str: def sql_dynamic_row_count_bigquery(schemas: list) -> str: """Generates an SQL statement that counts the number of rows in every table in a specific schema(s) in a Snowflake database""" - table_queries = ' UNION DISTINCT '.join(f""" + table_queries = ' UNION DISTINCT '.join( + f""" SELECT table_schema, table_name FROM `{schema}`.INFORMATION_SCHEMA.TABLES - WHERE table_type = 'BASE TABLE'""" for schema in schemas) + WHERE table_type = 'BASE TABLE'""" + for schema in schemas + ) return f""" WITH table_list AS ({table_queries}) @@ -286,22 +296,26 @@ def sql_dynamic_row_count_redshift(schemas: list) -> str: ' UNION ') WITHIN GROUP ( ORDER BY tablename ) || 'ORDER BY tbl' FROM table_list - """ + """ # noqa: E501 -def get_mongodb_connection(host: str, - port: Union[str, int], - user: str, - password: str, - database: str, - auth_database: str)->Database: +def get_mongodb_connection( + host: str, + port: Union[str, int], + user: str, + password: str, + database: str, + auth_database: str, +) -> Database: """ Creates a mongoDB connection to the db to sync from Returns: Database instance with established connection """ - return pymongo.MongoClient(host=host, - port=int(port), - username=user, - password=password, - authSource=auth_database)[database] + return pymongo.MongoClient( + host=host, + port=int(port), + username=user, + password=password, + authSource=auth_database, + )[database] diff --git a/tests/end_to_end/helpers/env.py b/tests/end_to_end/helpers/env.py index fddd9c26a..2841454bd 100644 --- a/tests/end_to_end/helpers/env.py +++ b/tests/end_to_end/helpers/env.py @@ -4,6 +4,8 @@ import boto3 import shutil import subprocess +import uuid +from pathlib import Path from dotenv import load_dotenv from . 
import db @@ -21,6 +23,7 @@ class E2EEnv: on the supported databases""" def __init__(self, project_dir): + self.sf_schema_postfix = f'_{str(uuid.uuid4())[:8]}' self._load_env() # Generate test project YAMLs from templates @@ -43,7 +46,9 @@ def _load_env(self): If optional connector properties are not defined in ../../../dev/project/.env then the related test cases will be skipped.""" - load_dotenv(dotenv_path=os.path.join(DIR, '..', '..', '..', 'dev-project', '.env')) + load_dotenv( + dotenv_path=os.path.join(DIR, '..', '..', '..', 'dev-project', '.env') + ) self.env = { # ------------------------------------------------------------------ # Tap Postgres is a REQUIRED test connector and test database with test data available @@ -52,12 +57,27 @@ def _load_env(self): 'TAP_POSTGRES': { 'template_patterns': ['tap_postgres'], 'vars': { - 'HOST' : {'value': os.environ.get('TAP_POSTGRES_HOST'), 'required': True}, - 'PORT' : {'value': os.environ.get('TAP_POSTGRES_PORT'), 'required': True}, - 'USER' : {'value': os.environ.get('TAP_POSTGRES_USER'), 'required': True}, - 'PASSWORD' : {'value': os.environ.get('TAP_POSTGRES_PASSWORD'), 'required': True}, - 'DB' : {'value': os.environ.get('TAP_POSTGRES_DB'), 'required': True} - } + 'HOST': { + 'value': os.environ.get('TAP_POSTGRES_HOST'), + 'required': True, + }, + 'PORT': { + 'value': os.environ.get('TAP_POSTGRES_PORT'), + 'required': True, + }, + 'USER': { + 'value': os.environ.get('TAP_POSTGRES_USER'), + 'required': True, + }, + 'PASSWORD': { + 'value': os.environ.get('TAP_POSTGRES_PASSWORD'), + 'required': True, + }, + 'DB': { + 'value': os.environ.get('TAP_POSTGRES_DB'), + 'required': True, + }, + }, }, # ------------------------------------------------------------------ # Tap MySQL is a REQUIRED test connector and test database with test data available @@ -66,12 +86,18 @@ def _load_env(self): 'TAP_MYSQL': { 'template_patterns': ['tap_mysql'], 'vars': { - 'HOST' : {'value': os.environ.get('TAP_MYSQL_HOST')}, - 'PORT' : {'value': os.environ.get('TAP_MYSQL_PORT')}, - 'USER' : {'value': os.environ.get('TAP_MYSQL_USER')}, - 'PASSWORD' : {'value': os.environ.get('TAP_MYSQL_PASSWORD')}, - 'DB' : {'value': os.environ.get('TAP_MYSQL_DB')}, - } + 'HOST': {'value': os.environ.get('TAP_MYSQL_HOST')}, + 'PORT': {'value': os.environ.get('TAP_MYSQL_PORT')}, + 'USER': {'value': os.environ.get('TAP_MYSQL_USER')}, + 'PASSWORD': {'value': os.environ.get('TAP_MYSQL_PASSWORD')}, + 'DB': {'value': os.environ.get('TAP_MYSQL_DB')}, + 'DB_2': {'value': os.environ.get('TAP_MYSQL_REPLICA_DB')}, + 'REPLICA_HOST': {'value': os.environ.get('TAP_MYSQL_REPLICA_HOST')}, + 'REPLICA_PORT': {'value': os.environ.get('TAP_MYSQL_REPLICA_PORT')}, + 'REPLICA_USER': {'value': os.environ.get('TAP_MYSQL_REPLICA_USER')}, + 'REPLICA_PASSWORD': {'value': os.environ.get('TAP_MYSQL_REPLICA_PASSWORD')}, + 'REPLICA_DB': {'value': os.environ.get('TAP_MYSQL_REPLICA_DB')}, + }, }, # ------------------------------------------------------------------ # Tap MongoDB is a REQUIRED test connector and test database with test data available @@ -80,13 +106,25 @@ def _load_env(self): 'TAP_MONGODB': { 'template_patterns': ['tap_postgres'], 'vars': { - 'HOST': {'value': os.environ.get('TAP_MONGODB_HOST'), 'required': True}, - 'PORT': {'value': os.environ.get('TAP_MONGODB_PORT'), 'required': True}, - 'USER': {'value': os.environ.get('TAP_MONGODB_USER'), 'required': True}, - 'PASSWORD': {'value': os.environ.get('TAP_MONGODB_PASSWORD'), 'required': True}, + 'HOST': { + 'value': os.environ.get('TAP_MONGODB_HOST'), + 
'required': True, + }, + 'PORT': { + 'value': os.environ.get('TAP_MONGODB_PORT'), + 'required': True, + }, + 'USER': { + 'value': os.environ.get('TAP_MONGODB_USER'), + 'required': True, + }, + 'PASSWORD': { + 'value': os.environ.get('TAP_MONGODB_PASSWORD'), + 'required': True, + }, 'DB': {'value': os.environ.get('TAP_MONGODB_DB'), 'required': True}, - 'AUTH_DB': {'value': 'admin', 'required': True} - } + 'AUTH_DB': {'value': 'admin', 'required': True}, + }, }, # ------------------------------------------------------------------ # Tap S3 CSV is an OPTIONAL test connector and it requires credentials to a real S3 bucket. @@ -96,10 +134,12 @@ def _load_env(self): 'optional': True, 'template_patterns': ['tap_s3_csv'], 'vars': { - 'AWS_KEY' : {'value': os.environ.get('TAP_S3_CSV_AWS_KEY')}, - 'AWS_SECRET_ACCESS_KEY' : {'value': os.environ.get('TAP_S3_CSV_AWS_SECRET_ACCESS_KEY')}, - 'BUCKET' : {'value': os.environ.get('TAP_S3_CSV_BUCKET')}, - } + 'AWS_KEY': {'value': os.environ.get('TAP_S3_CSV_AWS_KEY')}, + 'AWS_SECRET_ACCESS_KEY': { + 'value': os.environ.get('TAP_S3_CSV_AWS_SECRET_ACCESS_KEY') + }, + 'BUCKET': {'value': os.environ.get('TAP_S3_CSV_BUCKET')}, + }, }, # ------------------------------------------------------------------ # Target Postgres is a REQUIRED test connector and test database available in the docker environment @@ -107,12 +147,12 @@ def _load_env(self): 'TARGET_POSTGRES': { 'template_patterns': ['target_postgres', 'to_pg'], 'vars': { - 'HOST' : {'value': os.environ.get('TARGET_POSTGRES_HOST')}, - 'PORT' : {'value': os.environ.get('TARGET_POSTGRES_PORT')}, - 'USER' : {'value': os.environ.get('TARGET_POSTGRES_USER')}, - 'PASSWORD' : {'value': os.environ.get('TARGET_POSTGRES_PASSWORD')}, - 'DB' : {'value': os.environ.get('TARGET_POSTGRES_DB')}, - } + 'HOST': {'value': os.environ.get('TARGET_POSTGRES_HOST')}, + 'PORT': {'value': os.environ.get('TARGET_POSTGRES_PORT')}, + 'USER': {'value': os.environ.get('TARGET_POSTGRES_USER')}, + 'PASSWORD': {'value': os.environ.get('TARGET_POSTGRES_PASSWORD')}, + 'DB': {'value': os.environ.get('TARGET_POSTGRES_DB')}, + }, }, # ------------------------------------------------------------------ # Target Snowflake is an OPTIONAL test connector because it's not open sourced and not part of @@ -123,26 +163,52 @@ def _load_env(self): 'optional': True, 'template_patterns': ['target_snowflake', 'to_sf'], 'vars': { - 'ACCOUNT' : {'value': os.environ.get('TARGET_SNOWFLAKE_ACCOUNT')}, - 'DBNAME' : {'value': os.environ.get('TARGET_SNOWFLAKE_DBNAME')}, - 'USER' : {'value': os.environ.get('TARGET_SNOWFLAKE_USER')}, - 'PASSWORD' : {'value': os.environ.get('TARGET_SNOWFLAKE_PASSWORD')}, - 'WAREHOUSE' : {'value': os.environ.get('TARGET_SNOWFLAKE_WAREHOUSE')}, - 'AWS_ACCESS_KEY' : {'value': os.environ.get('TARGET_SNOWFLAKE_AWS_ACCESS_KEY'), - 'optional': True}, - 'AWS_SECRET_ACCESS_KEY' : {'value': os.environ.get('TARGET_SNOWFLAKE_AWS_SECRET_ACCESS_KEY'), - 'optional': True}, - 'SESSION_TOKEN' : {'value': os.environ.get('TARGET_SNOWFLAKE_SESSION_TOKEN'), - 'optional': True}, - 'S3_BUCKET' : {'value': os.environ.get('TARGET_SNOWFLAKE_S3_BUCKET')}, - 'S3_KEY_PREFIX' : {'value': os.environ.get('TARGET_SNOWFLAKE_S3_KEY_PREFIX')}, - 'S3_ACL' : {'value': os.environ.get('TARGET_SNOWFLAKE_S3_ACL'), 'optional': True}, - 'STAGE' : {'value': os.environ.get('TARGET_SNOWFLAKE_STAGE')}, - 'FILE_FORMAT' : {'value': os.environ.get('TARGET_SNOWFLAKE_FILE_FORMAT')}, - 'CLIENT_SIDE_ENCRYPTION_MASTER_KEY': - {'value': 
os.environ.get('TARGET_SNOWFLAKE_CLIENT_SIDE_ENCRYPTION_MASTER_KEY'), - 'optional': True}, - } + 'ACCOUNT': {'value': os.environ.get('TARGET_SNOWFLAKE_ACCOUNT')}, + 'DBNAME': {'value': os.environ.get('TARGET_SNOWFLAKE_DBNAME')}, + 'USER': {'value': os.environ.get('TARGET_SNOWFLAKE_USER')}, + 'PASSWORD': {'value': os.environ.get('TARGET_SNOWFLAKE_PASSWORD')}, + 'WAREHOUSE': { + 'value': os.environ.get('TARGET_SNOWFLAKE_WAREHOUSE') + }, + 'AWS_ACCESS_KEY': { + 'value': os.environ.get('TARGET_SNOWFLAKE_AWS_ACCESS_KEY'), + 'optional': True, + }, + 'AWS_SECRET_ACCESS_KEY': { + 'value': os.environ.get( + 'TARGET_SNOWFLAKE_AWS_SECRET_ACCESS_KEY' + ), + 'optional': True, + }, + 'SESSION_TOKEN': { + 'value': os.environ.get('TARGET_SNOWFLAKE_SESSION_TOKEN'), + 'optional': True, + }, + 'S3_BUCKET': { + 'value': os.environ.get('TARGET_SNOWFLAKE_S3_BUCKET') + }, + 'S3_KEY_PREFIX': { + 'value': os.environ.get('TARGET_SNOWFLAKE_S3_KEY_PREFIX') + }, + 'S3_ACL': { + 'value': os.environ.get('TARGET_SNOWFLAKE_S3_ACL'), + 'optional': True, + }, + 'STAGE': {'value': os.environ.get('TARGET_SNOWFLAKE_STAGE')}, + 'FILE_FORMAT': { + 'value': os.environ.get('TARGET_SNOWFLAKE_FILE_FORMAT') + }, + 'CLIENT_SIDE_ENCRYPTION_MASTER_KEY': { + 'value': os.environ.get( + 'TARGET_SNOWFLAKE_CLIENT_SIDE_ENCRYPTION_MASTER_KEY' + ), + 'optional': True, + }, + 'SCHEMA_POSTFIX': { + 'value': os.environ.get('TARGET_SNOWFLAKE_SCHEMA_POSTFIX', self.sf_schema_postfix), + 'optional': True, + } + }, }, # ------------------------------------------------------------------ # Target BigQuery is an OPTIONAL test connector because it's not open sourced and not part of @@ -153,8 +219,8 @@ def _load_env(self): 'optional': True, 'template_patterns': ['target_bigquery', 'to_bq'], 'vars': { - 'PROJECT' : {'value': os.environ.get('TARGET_BIGQUERY_PROJECT')}, - } + 'PROJECT': {'value': os.environ.get('TARGET_BIGQUERY_PROJECT')}, + }, }, # ------------------------------------------------------------------ # Target Redshift is an OPTIONAL test connector because it's not open sourced and not part of @@ -165,36 +231,65 @@ def _load_env(self): 'optional': True, 'template_patterns': ['target_redshift', 'to_rs'], 'vars': { - 'HOST' : {'value': os.environ.get('TARGET_REDSHIFT_HOST')}, - 'PORT' : {'value': os.environ.get('TARGET_REDSHIFT_PORT')}, - 'USER' : {'value': os.environ.get('TARGET_REDSHIFT_USER')}, - 'PASSWORD' : {'value': os.environ.get('TARGET_REDSHIFT_PASSWORD')}, - 'DBNAME' : {'value': os.environ.get('TARGET_REDSHIFT_DBNAME')}, - 'AWS_ACCESS_KEY' : {'value': os.environ.get('TARGET_REDSHIFT_AWS_ACCESS_KEY'), - 'optional': True}, - 'AWS_SECRET_ACCESS_KEY' : {'value': os.environ.get('TARGET_REDSHIFT_AWS_SECRET_ACCESS_KEY'), - 'optional': True}, - 'SESSION_TOKEN' : {'value': os.environ.get('TARGET_REDSHIFT_SESSION_TOKEN'), - 'optional': True}, - 'COPY_ROLE_ARN' : {'value': os.environ.get('TARGET_REDSHIFT_COPY_ROLE_ARN'), - 'optional': True}, - 'S3_BUCKET' : {'value': os.environ.get('TARGET_REDSHIFT_S3_BUCKET')}, - 'S3_KEY_PREFIX' : {'value': os.environ.get('TARGET_REDSHIFT_S3_KEY_PREFIX')}, - 'S3_ACL' : {'value': os.environ.get('TARGET_REDSHIFT_S3_ACL'), 'optional': True} - } - } + 'HOST': {'value': os.environ.get('TARGET_REDSHIFT_HOST')}, + 'PORT': {'value': os.environ.get('TARGET_REDSHIFT_PORT')}, + 'USER': {'value': os.environ.get('TARGET_REDSHIFT_USER')}, + 'PASSWORD': {'value': os.environ.get('TARGET_REDSHIFT_PASSWORD')}, + 'DBNAME': {'value': os.environ.get('TARGET_REDSHIFT_DBNAME')}, + 'AWS_ACCESS_KEY': { + 'value': 
os.environ.get('TARGET_REDSHIFT_AWS_ACCESS_KEY'), + 'optional': True, + }, + 'AWS_SECRET_ACCESS_KEY': { + 'value': os.environ.get( + 'TARGET_REDSHIFT_AWS_SECRET_ACCESS_KEY' + ), + 'optional': True, + }, + 'SESSION_TOKEN': { + 'value': os.environ.get('TARGET_REDSHIFT_SESSION_TOKEN'), + 'optional': True, + }, + 'COPY_ROLE_ARN': { + 'value': os.environ.get('TARGET_REDSHIFT_COPY_ROLE_ARN'), + 'optional': True, + }, + 'S3_BUCKET': {'value': os.environ.get('TARGET_REDSHIFT_S3_BUCKET')}, + 'S3_KEY_PREFIX': { + 'value': os.environ.get('TARGET_REDSHIFT_S3_KEY_PREFIX') + }, + 'S3_ACL': { + 'value': os.environ.get('TARGET_REDSHIFT_S3_ACL'), + 'optional': True, + }, + }, + }, } # Add is_configured keys for every connector # Useful to skip certain test cases dynamically when specific tap # or target database is not configured - self.env['TAP_POSTGRES']['is_configured'] = self._is_env_connector_configured('TAP_POSTGRES') - self.env['TAP_MYSQL']['is_configured'] = self._is_env_connector_configured('TAP_MYSQL') - self.env['TAP_S3_CSV']['is_configured'] = self._is_env_connector_configured('TAP_S3_CSV') - self.env['TARGET_POSTGRES']['is_configured'] = self._is_env_connector_configured('TARGET_POSTGRES') - self.env['TARGET_REDSHIFT']['is_configured'] = self._is_env_connector_configured('TARGET_REDSHIFT') - self.env['TARGET_SNOWFLAKE']['is_configured'] = self._is_env_connector_configured('TARGET_SNOWFLAKE') - self.env['TARGET_BIGQUERY']['is_configured'] = self._is_env_connector_configured('TARGET_BIGQUERY') + self.env['TAP_POSTGRES']['is_configured'] = self._is_env_connector_configured( + 'TAP_POSTGRES' + ) + self.env['TAP_MYSQL']['is_configured'] = self._is_env_connector_configured( + 'TAP_MYSQL' + ) + self.env['TAP_S3_CSV']['is_configured'] = self._is_env_connector_configured( + 'TAP_S3_CSV' + ) + self.env['TARGET_POSTGRES'][ + 'is_configured' + ] = self._is_env_connector_configured('TARGET_POSTGRES') + self.env['TARGET_REDSHIFT'][ + 'is_configured' + ] = self._is_env_connector_configured('TARGET_REDSHIFT') + self.env['TARGET_SNOWFLAKE'][ + 'is_configured' + ] = self._is_env_connector_configured('TARGET_SNOWFLAKE') + self.env['TARGET_BIGQUERY'][ + 'is_configured' + ] = self._is_env_connector_configured('TARGET_BIGQUERY') def _get_conn_env_var(self, connector, key): """Get the value of a specific variable in the self.env dict""" @@ -209,14 +304,14 @@ def get_aws_session(self): aws_secret_access_key = os.environ.get('TARGET_SNOWFLAKE_AWS_SECRET_ACCESS_KEY') if aws_access_key_id is None or aws_secret_access_key is None: raise Exception( - 'Env vars TARGET_SNOWFLAKE_AWS_ACCESS_KEY and TARGET_SNOWFLAKE_AWS_SECRET_ACCESS_KEY are required') + 'Env vars TARGET_SNOWFLAKE_AWS_ACCESS_KEY and TARGET_SNOWFLAKE_AWS_SECRET_ACCESS_KEY are required' + ) return boto3.session.Session( aws_access_key_id=aws_access_key_id, - aws_secret_access_key=aws_secret_access_key + aws_secret_access_key=aws_secret_access_key, ) - def _is_env_connector_configured(self, env_connector): """Detect if certain component(s) of env vars group is configured properly""" env_conns = [] @@ -235,7 +330,9 @@ def _is_env_connector_configured(self, env_connector): if self.env[env_conn].get('optional'): return False # Value not defined but it's a required property - raise Exception(f'{env_conn}_{key} env var is required but not defined.') + raise Exception( + f'{env_conn}_{key} env var is required but not defined.' 
+ ) return True def _find_env_conn_by_template_name(self, template_name): @@ -277,7 +374,7 @@ def _init_test_project_dir(self, project_dir): templates = glob.glob(f'{project_dir}/*.yml.template') for template_path in templates: # Replace env vars in template - with open(template_path, 'r') as f_template: + with open(template_path, 'r', encoding='utf-8') as f_template: yaml = f_template.read() # Detect if every env var configured for the template @@ -290,10 +387,12 @@ def _init_test_project_dir(self, project_dir): if is_configured: template_vars = set(re.findall(r'\$\{(.+?)\}', yaml)) for var in template_vars: - yaml = yaml.replace(f'${{{var}}}', self._all_env_vars_to_dict().get(var)) + yaml = yaml.replace( + f'${{{var}}}', self._all_env_vars_to_dict().get(var) + ) # Write the template replaced YAML file - with open(yaml_path, 'w+') as f_render: + with open(yaml_path, 'w+', encoding='utf-8') as f_render: f_render.write(yaml) # Delete if exists but not configured @@ -314,40 +413,47 @@ def _run_command(args): def run_query_tap_postgres(self, query): """Run and SQL query in tap postgres database""" - return db.run_query_postgres(query, - host=self._get_conn_env_var('TAP_POSTGRES', 'HOST'), - port=self._get_conn_env_var('TAP_POSTGRES', 'PORT'), - user=self._get_conn_env_var('TAP_POSTGRES', 'USER'), - password=self._get_conn_env_var('TAP_POSTGRES', 'PASSWORD'), - database=self._get_conn_env_var('TAP_POSTGRES', 'DB')) + return db.run_query_postgres( + query, + host=self._get_conn_env_var('TAP_POSTGRES', 'HOST'), + port=self._get_conn_env_var('TAP_POSTGRES', 'PORT'), + user=self._get_conn_env_var('TAP_POSTGRES', 'USER'), + password=self._get_conn_env_var('TAP_POSTGRES', 'PASSWORD'), + database=self._get_conn_env_var('TAP_POSTGRES', 'DB'), + ) def get_tap_mongodb_connection(self): """Create and returns tap mongodb database instance to run queries on""" - return db.get_mongodb_connection(host=self._get_conn_env_var('TAP_MONGODB', 'HOST'), - port=self._get_conn_env_var('TAP_MONGODB', 'PORT'), - user=self._get_conn_env_var('TAP_MONGODB', 'USER'), - password=self._get_conn_env_var('TAP_MONGODB', 'PASSWORD'), - database=self._get_conn_env_var('TAP_MONGODB', 'DB'), - auth_database=self._get_conn_env_var('TAP_MONGODB', 'AUTH_DB'), - ) + return db.get_mongodb_connection( + host=self._get_conn_env_var('TAP_MONGODB', 'HOST'), + port=self._get_conn_env_var('TAP_MONGODB', 'PORT'), + user=self._get_conn_env_var('TAP_MONGODB', 'USER'), + password=self._get_conn_env_var('TAP_MONGODB', 'PASSWORD'), + database=self._get_conn_env_var('TAP_MONGODB', 'DB'), + auth_database=self._get_conn_env_var('TAP_MONGODB', 'AUTH_DB'), + ) def run_query_target_postgres(self, query: object) -> object: """Run and SQL query in target postgres database""" - return db.run_query_postgres(query, - host=self._get_conn_env_var('TARGET_POSTGRES', 'HOST'), - port=self._get_conn_env_var('TARGET_POSTGRES', 'PORT'), - user=self._get_conn_env_var('TARGET_POSTGRES', 'USER'), - password=self._get_conn_env_var('TARGET_POSTGRES', 'PASSWORD'), - database=self._get_conn_env_var('TARGET_POSTGRES', 'DB')) + return db.run_query_postgres( + query, + host=self._get_conn_env_var('TARGET_POSTGRES', 'HOST'), + port=self._get_conn_env_var('TARGET_POSTGRES', 'PORT'), + user=self._get_conn_env_var('TARGET_POSTGRES', 'USER'), + password=self._get_conn_env_var('TARGET_POSTGRES', 'PASSWORD'), + database=self._get_conn_env_var('TARGET_POSTGRES', 'DB'), + ) def run_query_target_redshift(self, query): """Run an SQL query in target redshift database""" - return 
db.run_query_redshift(query, - host=self._get_conn_env_var('TARGET_REDSHIFT', 'HOST'), - port=self._get_conn_env_var('TARGET_REDSHIFT', 'PORT'), - user=self._get_conn_env_var('TARGET_REDSHIFT', 'USER'), - password=self._get_conn_env_var('TARGET_REDSHIFT', 'PASSWORD'), - database=self._get_conn_env_var('TARGET_REDSHIFT', 'DBNAME')) + return db.run_query_redshift( + query, + host=self._get_conn_env_var('TARGET_REDSHIFT', 'HOST'), + port=self._get_conn_env_var('TARGET_REDSHIFT', 'PORT'), + user=self._get_conn_env_var('TARGET_REDSHIFT', 'USER'), + password=self._get_conn_env_var('TARGET_REDSHIFT', 'PASSWORD'), + database=self._get_conn_env_var('TARGET_REDSHIFT', 'DBNAME'), + ) # pylint: disable=unnecessary-pass def run_query_tap_s3_csv(self, file): @@ -357,31 +463,48 @@ def run_query_tap_s3_csv(self, file): def run_query_tap_mysql(self, query): """Run and SQL query in tap mysql database""" - return db.run_query_mysql(query, - host=self._get_conn_env_var('TAP_MYSQL', 'HOST'), - port=int(self._get_conn_env_var('TAP_MYSQL', 'PORT')), - user=self._get_conn_env_var('TAP_MYSQL', 'USER'), - password=self._get_conn_env_var('TAP_MYSQL', 'PASSWORD'), - database=self._get_conn_env_var('TAP_MYSQL', 'DB')) + return db.run_query_mysql( + query, + host=self._get_conn_env_var('TAP_MYSQL', 'HOST'), + port=int(self._get_conn_env_var('TAP_MYSQL', 'PORT')), + user=self._get_conn_env_var('TAP_MYSQL', 'USER'), + password=self._get_conn_env_var('TAP_MYSQL', 'PASSWORD'), + database=self._get_conn_env_var('TAP_MYSQL', 'DB'), + ) + + def run_query_tap_mysql_2(self, query): + """Run and SQL query in tap mysql database""" + return db.run_query_mysql( + query, + host=self._get_conn_env_var('TAP_MYSQL', 'HOST'), + port=int(self._get_conn_env_var('TAP_MYSQL', 'PORT')), + user=self._get_conn_env_var('TAP_MYSQL', 'USER'), + password=self._get_conn_env_var('TAP_MYSQL', 'PASSWORD'), + database=self._get_conn_env_var('TAP_MYSQL', 'DB_2'), + ) def run_query_target_snowflake(self, query): """Run and SQL query in target snowflake database""" - return db.run_query_snowflake(query, - account=self._get_conn_env_var('TARGET_SNOWFLAKE', 'ACCOUNT'), - database=self._get_conn_env_var('TARGET_SNOWFLAKE', 'DBNAME'), - warehouse=self._get_conn_env_var('TARGET_SNOWFLAKE', 'WAREHOUSE'), - user=self._get_conn_env_var('TARGET_SNOWFLAKE', 'USER'), - password=self._get_conn_env_var('TARGET_SNOWFLAKE', 'PASSWORD')) + return db.run_query_snowflake( + query, + account=self._get_conn_env_var('TARGET_SNOWFLAKE', 'ACCOUNT'), + database=self._get_conn_env_var('TARGET_SNOWFLAKE', 'DBNAME'), + warehouse=self._get_conn_env_var('TARGET_SNOWFLAKE', 'WAREHOUSE'), + user=self._get_conn_env_var('TARGET_SNOWFLAKE', 'USER'), + password=self._get_conn_env_var('TARGET_SNOWFLAKE', 'PASSWORD'), + ) def delete_dataset_target_bigquery(self, dataset): """Run and SQL query in target bigquery database""" - return db.delete_dataset_bigquery(dataset, - project=self._get_conn_env_var('TARGET_BIGQUERY', 'PROJECT')) + return db.delete_dataset_bigquery( + dataset, project=self._get_conn_env_var('TARGET_BIGQUERY', 'PROJECT') + ) def run_query_target_bigquery(self, query): """Run and SQL query in target bigquery database""" - return db.run_query_bigquery(query, - project=self._get_conn_env_var('TARGET_BIGQUERY', 'PROJECT')) + return db.run_query_bigquery( + query, project=self._get_conn_env_var('TARGET_BIGQUERY', 'PROJECT') + ) # ------------------------------------------------------------------------- # Setup methods to initialise source and target databases and to make them @@ 
-407,16 +530,23 @@ def setup_tap_mongodb(self): db_script = os.path.join(DIR, '..', '..', 'db', 'tap_mongodb.sh') self._run_command(db_script) - def setup_tap_s3_csv(self): """Upload test input files to S3 to be prapared for test run""" - mock_data_1 = os.path.join(DIR, '..', 'test-project', 's3_mock_data', 'mock_data_1.csv') - mock_data_2 = os.path.join(DIR, '..', 'test-project', 's3_mock_data', 'mock_data_2.csv') + mock_data_1 = os.path.join( + DIR, '..', 'test-project', 's3_mock_data', 'mock_data_1.csv' + ) + mock_data_2 = os.path.join( + DIR, '..', 'test-project', 's3_mock_data', 'mock_data_2.csv' + ) bucket = self._get_conn_env_var('TAP_S3_CSV', 'BUCKET') - s3 = boto3.client('s3', - aws_access_key_id=self._get_conn_env_var('TAP_S3_CSV', 'AWS_KEY'), - aws_secret_access_key=self._get_conn_env_var('TAP_S3_CSV', 'AWS_SECRET_ACCESS_KEY')) + s3 = boto3.client( + 's3', + aws_access_key_id=self._get_conn_env_var('TAP_S3_CSV', 'AWS_KEY'), + aws_secret_access_key=self._get_conn_env_var( + 'TAP_S3_CSV', 'AWS_SECRET_ACCESS_KEY' + ), + ) s3.upload_file(mock_data_1, bucket, 'ppw_e2e_tap_s3_csv/mock_data_1.csv') s3.upload_file(mock_data_2, bucket, 'ppw_e2e_tap_s3_csv/mock_data_2.csv') @@ -424,28 +554,59 @@ def setup_tap_s3_csv(self): def setup_target_postgres(self): """Clean postgres target database and prepare for test run""" self.run_query_target_postgres('CREATE EXTENSION IF NOT EXISTS pgcrypto') - self.run_query_target_postgres('DROP SCHEMA IF EXISTS ppw_e2e_tap_postgres CASCADE') - self.run_query_target_postgres('DROP SCHEMA IF EXISTS ppw_e2e_tap_postgres_public2 CASCADE') - self.run_query_target_postgres('DROP SCHEMA IF EXISTS ppw_e2e_tap_postgres_logical1 CASCADE') - self.run_query_target_postgres('DROP SCHEMA IF EXISTS ppw_e2e_tap_postgres_logical2 CASCADE') - self.run_query_target_postgres('DROP SCHEMA IF EXISTS ppw_e2e_tap_mysql CASCADE') - self.run_query_target_postgres('DROP SCHEMA IF EXISTS ppw_e2e_tap_s3_csv CASCADE') - self.run_query_target_postgres('DROP SCHEMA IF EXISTS ppw_e2e_tap_mongodb CASCADE') + self.run_query_target_postgres( + 'DROP SCHEMA IF EXISTS ppw_e2e_tap_postgres CASCADE' + ) + self.run_query_target_postgres( + 'DROP SCHEMA IF EXISTS ppw_e2e_tap_postgres_public2 CASCADE' + ) + self.run_query_target_postgres( + 'DROP SCHEMA IF EXISTS ppw_e2e_tap_postgres_logical1 CASCADE' + ) + self.run_query_target_postgres( + 'DROP SCHEMA IF EXISTS ppw_e2e_tap_postgres_logical2 CASCADE' + ) + self.run_query_target_postgres( + 'DROP SCHEMA IF EXISTS ppw_e2e_tap_mysql CASCADE' + ) + self.run_query_target_postgres( + 'DROP SCHEMA IF EXISTS ppw_e2e_tap_mysql_2 CASCADE' + ) + self.run_query_target_postgres( + 'DROP SCHEMA IF EXISTS ppw_e2e_tap_s3_csv CASCADE' + ) + self.run_query_target_postgres( + 'DROP SCHEMA IF EXISTS ppw_e2e_tap_mongodb CASCADE' + ) # Clean config directory shutil.rmtree(os.path.join(CONFIG_DIR, 'postgres_dwh'), ignore_errors=True) def setup_target_redshift(self): """Clean redshift target database and prepare for test run""" - self.run_query_target_redshift('DROP SCHEMA IF EXISTS ppw_e2e_tap_postgres CASCADE') - self.run_query_target_redshift('DROP SCHEMA IF EXISTS ppw_e2e_tap_postgres_public2 CASCADE') - self.run_query_target_redshift('DROP SCHEMA IF EXISTS ppw_e2e_tap_postgres_logical1 CASCADE') - self.run_query_target_redshift('DROP SCHEMA IF EXISTS ppw_e2e_tap_postgres_logical2 CASCADE') - self.run_query_target_redshift('DROP SCHEMA IF EXISTS ppw_e2e_tap_mysql CASCADE') - self.run_query_target_redshift('DROP SCHEMA IF EXISTS ppw_e2e_tap_s3_csv CASCADE') + 
self.run_query_target_redshift( + 'DROP SCHEMA IF EXISTS ppw_e2e_tap_postgres CASCADE' + ) + self.run_query_target_redshift( + 'DROP SCHEMA IF EXISTS ppw_e2e_tap_postgres_public2 CASCADE' + ) + self.run_query_target_redshift( + 'DROP SCHEMA IF EXISTS ppw_e2e_tap_postgres_logical1 CASCADE' + ) + self.run_query_target_redshift( + 'DROP SCHEMA IF EXISTS ppw_e2e_tap_postgres_logical2 CASCADE' + ) + self.run_query_target_redshift( + 'DROP SCHEMA IF EXISTS ppw_e2e_tap_mysql CASCADE' + ) + self.run_query_target_redshift( + 'DROP SCHEMA IF EXISTS ppw_e2e_tap_s3_csv CASCADE' + ) self.run_query_target_redshift('DROP SCHEMA IF EXISTS ppw_e2e_helper CASCADE') self.run_query_target_redshift('CREATE SCHEMA ppw_e2e_helper') - self.run_query_target_redshift('CREATE TABLE ppw_e2e_helper.dual (dummy VARCHAR)') + self.run_query_target_redshift( + 'CREATE TABLE ppw_e2e_helper.dual (dummy VARCHAR)' + ) self.run_query_target_redshift('INSERT INTO ppw_e2e_helper.dual VALUES (\'X\')') # Clean config directory @@ -453,13 +614,32 @@ def setup_target_redshift(self): def setup_target_snowflake(self): """Clean snowflake target database and prepare for test run""" - self.run_query_target_snowflake('DROP SCHEMA IF EXISTS ppw_e2e_tap_postgres CASCADE') - self.run_query_target_snowflake('DROP SCHEMA IF EXISTS ppw_e2e_tap_postgres_public2 CASCADE') - self.run_query_target_snowflake('DROP SCHEMA IF EXISTS ppw_e2e_tap_postgres_logical1 CASCADE') - self.run_query_target_snowflake('DROP SCHEMA IF EXISTS ppw_e2e_tap_postgres_logical2 CASCADE') - self.run_query_target_snowflake('DROP SCHEMA IF EXISTS ppw_e2e_tap_mysql CASCADE') - self.run_query_target_snowflake('DROP SCHEMA IF EXISTS ppw_e2e_tap_s3_csv CASCADE') - self.run_query_target_snowflake('DROP SCHEMA IF EXISTS ppw_e2e_tap_mongodb CASCADE') + + if self.env['TARGET_SNOWFLAKE']['is_configured']: + self.run_query_target_snowflake( + f'DROP SCHEMA IF EXISTS ppw_e2e_tap_postgres{self.sf_schema_postfix} CASCADE' + ) + self.run_query_target_snowflake( + f'DROP SCHEMA IF EXISTS ppw_e2e_tap_postgres_public2{self.sf_schema_postfix} CASCADE' + ) + self.run_query_target_snowflake( + f'DROP SCHEMA IF EXISTS ppw_e2e_tap_postgres_logical1{self.sf_schema_postfix} CASCADE' + ) + self.run_query_target_snowflake( + f'DROP SCHEMA IF EXISTS ppw_e2e_tap_postgres_logical2{self.sf_schema_postfix} CASCADE' + ) + self.run_query_target_snowflake( + f'DROP SCHEMA IF EXISTS ppw_e2e_tap_mysql{self.sf_schema_postfix} CASCADE' + ) + self.run_query_target_snowflake( + f'DROP SCHEMA IF EXISTS ppw_e2e_tap_mysql_2{self.sf_schema_postfix} CASCADE' + ) + self.run_query_target_snowflake( + f'DROP SCHEMA IF EXISTS ppw_e2e_tap_s3_csv{self.sf_schema_postfix} CASCADE' + ) + self.run_query_target_snowflake( + f'DROP SCHEMA IF EXISTS ppw_e2e_tap_mongodb{self.sf_schema_postfix} CASCADE' + ) # Clean config directory shutil.rmtree(os.path.join(CONFIG_DIR, 'snowflake'), ignore_errors=True) @@ -473,3 +653,9 @@ def setup_target_bigquery(self): self.delete_dataset_target_bigquery('ppw_e2e_tap_mysql') self.delete_dataset_target_bigquery('ppw_e2e_tap_s3_csv') self.delete_dataset_target_bigquery('ppw_e2e_tap_mongodb') + + @staticmethod + def remove_all_state_files(): + """Clean up state files to ensure tests behave the same every time""" + for state_file in Path(CONFIG_DIR).glob('**/state.json'): + state_file.unlink() diff --git a/tests/end_to_end/helpers/tasks.py b/tests/end_to_end/helpers/tasks.py index 8717a92dd..9d8c968b3 100644 --- a/tests/end_to_end/helpers/tasks.py +++ b/tests/end_to_end/helpers/tasks.py @@ -5,7 +5,9 @@ 
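The new sf_schema_postfix and remove_all_state_files() are the isolation knobs for repeated Snowflake E2E runs: each run gets its own postfixed schemas and starts from a clean state. A small sketch of the same idea, assuming a hypothetical config_dir; the uuid slicing and the state.json glob mirror the code above, everything else is illustrative.

import uuid
from pathlib import Path
from typing import List


def make_schema_postfix() -> str:
    """Illustrative: short per-run postfix, like the one built in E2EEnv.__init__."""
    return f'_{str(uuid.uuid4())[:8]}'


def drop_statements(schemas: List[str], postfix: str) -> List[str]:
    """Illustrative: per-run DROP statements for the postfixed target schemas."""
    return [f'DROP SCHEMA IF EXISTS {schema}{postfix} CASCADE' for schema in schemas]


def remove_state_files(config_dir: str) -> None:
    """Illustrative: same pathlib pattern as remove_all_state_files()."""
    for state_file in Path(config_dir).glob('**/state.json'):
        state_file.unlink()


print(drop_statements(['ppw_e2e_tap_mysql', 'ppw_e2e_tap_mongodb'], make_schema_postfix()))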
def run_command(command): """Run shell command and return returncode, stdout and stderr""" - with subprocess.Popen(shlex.split(command), stdout=subprocess.PIPE, stderr=subprocess.PIPE) as proc: + with subprocess.Popen( + shlex.split(command), stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) as proc: proc_result = proc.communicate() return_code = proc.returncode stdout = proc_result[0].decode('utf-8') @@ -16,12 +18,12 @@ def run_command(command): def find_run_tap_log_file(stdout, sync_engine=None): """Pipelinewise creates log file per running tap instances in a dynamically created directory: - ~/.pipelinewise///log + ~/.pipelinewise///log - Every log file matches the pattern: - --_