From e72746821dd09ea8ad3b63e584b2f59a05c1019c Mon Sep 17 00:00:00 2001
From: Joachim Praetorius
Date: Wed, 29 May 2024 12:25:20 +0200
Subject: [PATCH] Issues/122 support specifying tables for glue import (#230)

* Add a glue-table parameter to filter tables imported from AWS

  - works the same way as it does for `bigquery`
  - the user needs to supply the names and can give the parameter multiple times
  - there is no validation of the names, so typos will lead to runtime errors

* Adapt tests
---
 CHANGELOG.md                            |  1 +
 README.md                               | 86 +++++++++++++------
 datacontract/cli.py                     |  8 +-
 datacontract/data_contract.py           |  3 +-
 datacontract/imports/glue_importer.py   |  7 +-
 .../glue/datacontract-empty-model.yaml  | 15 ++++
 tests/test_import_glue.py               | 32 +++++++
 7 files changed, 120 insertions(+), 32 deletions(-)
 create mode 100644 tests/fixtures/glue/datacontract-empty-model.yaml

diff --git a/CHANGELOG.md b/CHANGELOG.md
index f55cec9a..3aba5c79 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -21,6 +21,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - `datacontract export --format avro`: Now supports config map on field level for logicalTypes and default values [Custom Avro Properties](./README.md#custom-avro-properties)
 - `datacontract import --format avro`: Now supports importing logicalType and default definition on avro files [Custom Avro Properties](./README.md#custom-avro-properties)
 - Support `config.bigqueryType` for testing BigQuery types
+- Added support for selecting specific tables in an AWS Glue `import` through the `glue-table` parameter (#122)
 
 ### Fixed
 
diff --git a/README.md b/README.md
index f8fb145f..03cd82a2 100644
--- a/README.md
+++ b/README.md
@@ -745,41 +745,30 @@ models:
 ```
 Usage: datacontract import [OPTIONS]
- Create a data contract from the given source location. Prints to stdout.
-
-╭─ Options ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
-│ * --format           [sql|avro|glue|bigquery|jsonschema]  The format of the source file. [default: None] [required]                                                 │
-│   --source           TEXT                                 The path to the file or Glue Database that should be imported. [default: None]                            │
-│   --bigquery-project TEXT                                 The bigquery project id. [default: None]                                                                  │
-│   --bigquery-dataset TEXT                                 The bigquery dataset id. [default: None]                                                                  │
-│   --bigquery-table   TEXT                                 List of table ids to import from the bigquery API (repeat for multiple table ids, leave empty for all    │
-│                                                           tables in the dataset).                                                                                   │
-│                                                           [default: None]                                                                                           │
-│   --help                                                  Show this message and exit.                                                                               │
-╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
+ Create a data contract from the given source location. Prints to stdout.
+
+╭─ Options ─────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
+│ * --format           [sql|avro|glue|bigquery|jsonschema]  The format of the source file. [default: None] [required]   │
+│   --source           TEXT                                 The path to the file or Glue Database that should be        │
+│                                                           imported. [default: None]                                   │
+│   --glue-table       TEXT                                 List of table ids to import from the Glue Database (repeat  │
+│                                                           for multiple table ids, leave empty for all tables in the   │
+│                                                           database). [default: None]                                  │
+│   --bigquery-project TEXT                                 The bigquery project id. [default: None]                    │
+│   --bigquery-dataset TEXT                                 The bigquery dataset id. [default: None]                    │
+│   --bigquery-table   TEXT                                 List of table ids to import from the bigquery API (repeat   │
+│                                                           for multiple table ids, leave empty for all tables in the   │
+│                                                           dataset). [default: None]                                   │
+│   --help                                                  Show this message and exit.                                 │
+╰────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
 ```
 
-As shown, some options are only relevant in certain conditions: For `format` Bigtable we support to directly read off the Bigtable APIs.
-In this case there's no need to specify `source` but instead `bt-project-id`, `bt-dataset-id` and `table` must be specified.
-
-For providing authentication to the Client, please see [the google documentation](https://cloud.google.com/docs/authentication/provide-credentials-adc#how-to) or the one [about authorizing client libraries](https://cloud.google.com/bigquery/docs/authentication#client-libs).
-
 Example:
 
 ```bash
 # Example import from SQL DDL
 datacontract import --format sql --source my_ddl.sql
 ```
 
-```bash
-# Example import from Bigquery JSON
-datacontract import --format bigquery --source my_bigquery_table.json
-```
-
-```bash
-# Example import from Bigquery API
-datacontract import --format bigquery --btProjectId <project_id> --btDatasetId <dataset_id> --table <table_id_1> --table <table_id_2> --table <table_id_3>
-```
-
 Available import options:
 
 | Type | Description | Status |
@@ -795,6 +784,49 @@ Available import options:
 | Missing something? | Please create an issue on GitHub | TBD |
 
 
+#### BigQuery
+
+BigQuery data can be imported either from JSON files generated from the table descriptions or directly from the BigQuery API. To use JSON files, set the `source` parameter to the path of such a JSON file.
+
+To import from the BigQuery API, _omit_ `source` and instead provide `bigquery-project` and `bigquery-dataset`. Additionally, you may specify `bigquery-table` to enumerate the tables that should be imported. If no tables are given, _all_ available tables of the dataset will be imported.
+
+For providing authentication to the client, please see [the Google documentation](https://cloud.google.com/docs/authentication/provide-credentials-adc#how-to) or the one [about authorizing client libraries](https://cloud.google.com/bigquery/docs/authentication#client-libs).
+
+Examples:
+
+```bash
+# Example import from BigQuery JSON
+datacontract import --format bigquery --source my_bigquery_table.json
+```
+
+```bash
+# Example import from the BigQuery API, specifying the tables to import
+datacontract import --format bigquery --bigquery-project <project_id> --bigquery-dataset <dataset_id> --bigquery-table <table_id_1> --bigquery-table <table_id_2> --bigquery-table <table_id_3>
+```
+
+```bash
+# Example import from the BigQuery API, importing all tables in the dataset
+datacontract import --format bigquery --bigquery-project <project_id> --bigquery-dataset <dataset_id>
+```
+
+#### Glue
+
+Importing from Glue reads the necessary data directly from the AWS API.
+You may pass the `glue-table` parameter to enumerate the tables that should be imported. If no tables are given, _all_ available tables of the database will be imported.
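+
+The Glue importer talks to the AWS API, so working AWS credentials are required. Assuming the standard boto3 credential resolution applies (an assumption, not something this patch changes), a typical setup might look like this:
+
+```bash
+# Hypothetical profile and region names; any standard AWS credential source should work
+export AWS_PROFILE=my-profile
+export AWS_DEFAULT_REGION=eu-central-1
+```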
+
+Examples:
+
+```bash
+# Example import from AWS Glue, specifying the tables to import
+datacontract import --format glue --source <database_name> --glue-table <table_name_1> --glue-table <table_name_2> --glue-table <table_name_3>
+```
+
+```bash
+# Example import from AWS Glue, importing all tables in the database
+datacontract import --format glue --source <database_name>
+```
+
+
 ### breaking
 
 ```
diff --git a/datacontract/cli.py b/datacontract/cli.py
index 9f9db1f2..45524cdc 100644
--- a/datacontract/cli.py
+++ b/datacontract/cli.py
@@ -231,6 +231,12 @@ def import_(
     source: Annotated[
         Optional[str], typer.Option(help="The path to the file or Glue Database that should be imported.")
     ] = None,
+    glue_table: Annotated[
+        Optional[List[str]],
+        typer.Option(
+            help="List of table ids to import from the Glue Database (repeat for multiple table ids, leave empty for all tables in the database)."
+        ),
+    ] = None,
     bigquery_project: Annotated[Optional[str], typer.Option(help="The bigquery project id.")] = None,
     bigquery_dataset: Annotated[Optional[str], typer.Option(help="The bigquery dataset id.")] = None,
     bigquery_table: Annotated[
@@ -243,7 +249,7 @@ def import_(
     """
     Create a data contract from the given source location. Prints to stdout.
     """
-    result = DataContract().import_from_source(format, source, bigquery_table, bigquery_project, bigquery_dataset)
+    result = DataContract().import_from_source(format, source, glue_table, bigquery_table, bigquery_project, bigquery_dataset)
     console.print(result.to_yaml())
 
 
diff --git a/datacontract/data_contract.py b/datacontract/data_contract.py
index 46997bf4..3e0c33a7 100644
--- a/datacontract/data_contract.py
+++ b/datacontract/data_contract.py
@@ -422,6 +422,7 @@ def import_from_source(
         self,
         format: str,
         source: typing.Optional[str] = None,
+        glue_tables: typing.Optional[typing.List[str]] = None,
         bigquery_tables: typing.Optional[typing.List[str]] = None,
         bigquery_project: typing.Optional[str] = None,
         bigquery_dataset: typing.Optional[str] = None,
@@ -433,7 +434,7 @@ def import_from_source(
         elif format == "avro":
             data_contract_specification = import_avro(data_contract_specification, source)
         elif format == "glue":
-            data_contract_specification = import_glue(data_contract_specification, source)
+            data_contract_specification = import_glue(data_contract_specification, source, glue_tables)
         elif format == "jsonschema":
             data_contract_specification = import_jsonschema(data_contract_specification, source)
         elif format == "bigquery":
diff --git a/datacontract/imports/glue_importer.py b/datacontract/imports/glue_importer.py
index df2438d7..c17d6795 100644
--- a/datacontract/imports/glue_importer.py
+++ b/datacontract/imports/glue_importer.py
@@ -107,7 +107,7 @@ def get_glue_table_schema(database_name: str, table_name: str):
     return table_schema
 
 
-def import_glue(data_contract_specification: DataContractSpecification, source: str):
+def import_glue(data_contract_specification: DataContractSpecification, source: str, table_names: List[str]):
     """Import the schema of a Glue database."""
     catalogid, location_uri = get_glue_database(source)
 
@@ -116,13 +116,14 @@ def import_glue(data_contract_specification: DataContractSpecification, source:
     if catalogid is None:
         return data_contract_specification
 
-    tables = get_glue_tables(source)
+    if table_names is None:
+        table_names = get_glue_tables(source)
 
     data_contract_specification.servers = {
         "production": Server(type="glue", account=catalogid, database=source, location=location_uri),
     }
 
-    for table_name in tables:
+    for table_name in table_names:
         if data_contract_specification.models is None:
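+            # first imported table: initialise the models map before adding to it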
             data_contract_specification.models = {}
diff --git a/tests/fixtures/glue/datacontract-empty-model.yaml b/tests/fixtures/glue/datacontract-empty-model.yaml
new file mode 100644
index 00000000..3fb53446
--- /dev/null
+++ b/tests/fixtures/glue/datacontract-empty-model.yaml
@@ -0,0 +1,15 @@
+dataContractSpecification: 0.9.3
+id: my-data-contract-id
+info:
+  title: My Data Contract
+  version: 0.0.1
+servers:
+  production:
+    account: '123456789012'
+    database: test_database
+    location: s3://test_bucket/testdb
+    type: glue
+models:
+  table_1:
+    type: table
+  
\ No newline at end of file
diff --git a/tests/test_import_glue.py b/tests/test_import_glue.py
index 0420a904..855ed3bb 100644
--- a/tests/test_import_glue.py
+++ b/tests/test_import_glue.py
@@ -84,6 +84,25 @@ def test_cli(setup_mock_glue):
     )
     assert result.exit_code == 0
 
+@mock_aws
+def test_cli_with_table_filters(setup_mock_glue):
+    runner = CliRunner()
+    result = runner.invoke(
+        app,
+        [
+            "import",
+            "--format",
+            "glue",
+            "--source",
+            "test_database",
+            "--glue-table",
+            "table_1",
+            "--glue-table",
+            "table_2",
+        ],
+    )
+    assert result.exit_code == 0
+
 
 @mock_aws
 def test_import_glue_schema(setup_mock_glue):
@@ -96,3 +115,16 @@ def test_import_glue_schema(setup_mock_glue):
     assert yaml.safe_load(result.to_yaml()) == yaml.safe_load(expected)
     # Disable linters so we don't get "missing description" warnings
     assert DataContract(data_contract_str=expected).lint(enabled_linters=set()).has_passed()
+
+@mock_aws
+def test_import_glue_schema_with_table_filters(setup_mock_glue):
+    result = DataContract().import_from_source("glue", "test_database", ["table_1"])
+
+    # we specify a table that the mock doesn't have and thus expect an empty model
+    with open("fixtures/glue/datacontract-empty-model.yaml") as file:
+        expected = file.read()
+
+    print("Result", result.to_yaml())
+    assert yaml.safe_load(result.to_yaml()) == yaml.safe_load(expected)
+    # Disable linters so we don't get "missing description" warnings
+    assert DataContract(data_contract_str=expected).lint(enabled_linters=set()).has_passed()
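
-- 
A minimal usage sketch for reviewers (not part of the patch): this drives the new parameter programmatically, mirroring `test_import_glue_schema_with_table_filters` above. The database and table names are examples, and AWS credentials are assumed to be configured.

```python
from datacontract.data_contract import DataContract

# Import only the named tables from an example Glue database and
# print the resulting data contract as YAML, as the CLI does.
spec = DataContract().import_from_source(
    "glue",                              # import format
    "test_database",                     # Glue database to read from
    glue_tables=["table_1", "table_2"],  # restrict the import to these tables
)
print(spec.to_yaml())
```

Passing `glue_tables=None` (the default) imports every table in the database, matching the CLI behaviour when no `--glue-table` option is given.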