From 69ff9c3af3da11deb6f915f11820b2489caac6e0 Mon Sep 17 00:00:00 2001 From: John Joyce Date: Thu, 25 Jan 2024 16:51:40 -0800 Subject: [PATCH 01/14] infra(ui): Add a react context provider allowing sub-components to update theme conf (#9674) Co-authored-by: John Joyce --- datahub-web-react/src/App.tsx | 34 +++++-------------- datahub-web-react/src/CustomThemeProvider.tsx | 32 +++++++++++++++++ datahub-web-react/src/customThemeContext.tsx | 10 ++++++ 3 files changed, 50 insertions(+), 26 deletions(-) create mode 100644 datahub-web-react/src/CustomThemeProvider.tsx create mode 100644 datahub-web-react/src/customThemeContext.tsx diff --git a/datahub-web-react/src/App.tsx b/datahub-web-react/src/App.tsx index 79c9ee91ceaa12..e8910e7dc2ea8e 100644 --- a/datahub-web-react/src/App.tsx +++ b/datahub-web-react/src/App.tsx @@ -1,20 +1,19 @@ -import React, { useEffect, useState } from 'react'; +import React from 'react'; import Cookies from 'js-cookie'; import { message } from 'antd'; import { BrowserRouter as Router } from 'react-router-dom'; import { ApolloClient, ApolloProvider, createHttpLink, InMemoryCache, ServerError } from '@apollo/client'; import { onError } from '@apollo/client/link/error'; -import { ThemeProvider } from 'styled-components'; import { Helmet, HelmetProvider } from 'react-helmet-async'; import './App.less'; import { Routes } from './app/Routes'; -import { Theme } from './conf/theme/types'; -import defaultThemeConfig from './conf/theme/theme_light.config.json'; import { PageRoutes } from './conf/Global'; import { isLoggedInVar } from './app/auth/checkAuthStatus'; import { GlobalCfg } from './conf'; import possibleTypesResult from './possibleTypes.generated'; import { ErrorCodes } from './app/shared/constants'; +import CustomThemeProvider from './CustomThemeProvider'; +import { useCustomTheme } from './customThemeContext'; /* Construct Apollo Client @@ -71,33 +70,16 @@ const client = new ApolloClient({ }); export const InnerApp: React.VFC = () => { - const [dynamicThemeConfig, setDynamicThemeConfig] = useState(defaultThemeConfig); - - useEffect(() => { - if (import.meta.env.DEV) { - import(/* @vite-ignore */ `./conf/theme/${import.meta.env.REACT_APP_THEME_CONFIG}`).then((theme) => { - setDynamicThemeConfig(theme); - }); - } else { - // Send a request to the server to get the theme config. - fetch(`/assets/conf/theme/${import.meta.env.REACT_APP_THEME_CONFIG}`) - .then((response) => response.json()) - .then((theme) => { - setDynamicThemeConfig(theme); - }); - } - }, []); - return ( - - {dynamicThemeConfig.content.title} - - + + + {useCustomTheme().theme?.content.title} + - + ); }; diff --git a/datahub-web-react/src/CustomThemeProvider.tsx b/datahub-web-react/src/CustomThemeProvider.tsx new file mode 100644 index 00000000000000..f2e2678a90d8c4 --- /dev/null +++ b/datahub-web-react/src/CustomThemeProvider.tsx @@ -0,0 +1,32 @@ +import React, { useEffect, useState } from 'react'; +import { ThemeProvider } from 'styled-components'; +import { Theme } from './conf/theme/types'; +import defaultThemeConfig from './conf/theme/theme_light.config.json'; +import { CustomThemeContext } from './customThemeContext'; + +const CustomThemeProvider = ({ children }: { children: React.ReactNode }) => { + const [currentTheme, setTheme] = useState(defaultThemeConfig); + + useEffect(() => { + if (import.meta.env.DEV) { + import(/* @vite-ignore */ `./conf/theme/${import.meta.env.REACT_APP_THEME_CONFIG}`).then((theme) => { + setTheme(theme); + }); + } else { + // Send a request to the server to get the theme config. + fetch(`/assets/conf/theme/${import.meta.env.REACT_APP_THEME_CONFIG}`) + .then((response) => response.json()) + .then((theme) => { + setTheme(theme); + }); + } + }, []); + + return ( + + {children} + + ); +}; + +export default CustomThemeProvider; diff --git a/datahub-web-react/src/customThemeContext.tsx b/datahub-web-react/src/customThemeContext.tsx new file mode 100644 index 00000000000000..0b273d00248853 --- /dev/null +++ b/datahub-web-react/src/customThemeContext.tsx @@ -0,0 +1,10 @@ +import React, { useContext } from 'react'; + +export const CustomThemeContext = React.createContext<{ + theme: any; + updateTheme: (theme: any) => void; +}>({ theme: undefined, updateTheme: (_) => null }); + +export function useCustomTheme() { + return useContext(CustomThemeContext); +} From f7f0b14f376cad8aa3951efd305fcd15a1f01966 Mon Sep 17 00:00:00 2001 From: tom Date: Fri, 26 Jan 2024 02:51:41 +0100 Subject: [PATCH 02/14] fix(ingestion/metabase): Fetch Dashboards through Collections (#9631) Co-authored-by: Harshal Sheth --- metadata-ingestion/developing.md | 2 +- .../docs/sources/metabase/metabase.md | 2 +- .../src/datahub/ingestion/source/metabase.py | 47 +- .../metabase/metabase_mces_golden.json | 61 +- .../metabase/setup/collection_dashboards.json | 1 + .../metabase/setup/collections.json | 1 + .../integration/metabase/setup/dashboard.json | 40 - .../metabase/setup/dashboard_1.json | 1084 ++++++++++++----- .../integration/metabase/test_metabase.py | 8 +- 9 files changed, 901 insertions(+), 345 deletions(-) create mode 100644 metadata-ingestion/tests/integration/metabase/setup/collection_dashboards.json create mode 100644 metadata-ingestion/tests/integration/metabase/setup/collections.json delete mode 100644 metadata-ingestion/tests/integration/metabase/setup/dashboard.json diff --git a/metadata-ingestion/developing.md b/metadata-ingestion/developing.md index d1eef21974f1df..fc3a689124b2c1 100644 --- a/metadata-ingestion/developing.md +++ b/metadata-ingestion/developing.md @@ -10,7 +10,7 @@ Also take a look at the guide to [adding a source](./adding-source.md). ### Requirements 1. Python 3.7+ must be installed in your host environment. -2. Java8 (gradle won't work with newer versions) +2. Java 17 (gradle won't work with newer or older versions) 4. On Debian/Ubuntu: `sudo apt install python3-dev python3-venv` 5. On Fedora (if using LDAP source integration): `sudo yum install openldap-devel` diff --git a/metadata-ingestion/docs/sources/metabase/metabase.md b/metadata-ingestion/docs/sources/metabase/metabase.md index a76786f7e5853a..68422b8decce95 100644 --- a/metadata-ingestion/docs/sources/metabase/metabase.md +++ b/metadata-ingestion/docs/sources/metabase/metabase.md @@ -19,4 +19,4 @@ The key in this map must be string, not integer although Metabase API provides If `database_id_to_instance_map` is not specified, `platform_instance_map` is used for platform instance mapping. If none of the above are specified, platform instance is not used when constructing `urn` when searching for dataset relations. ## Compatibility -Metabase version [v0.41.2](https://www.metabase.com/start/oss/) +Metabase version [v0.48.3](https://www.metabase.com/start/oss/) diff --git a/metadata-ingestion/src/datahub/ingestion/source/metabase.py b/metadata-ingestion/src/datahub/ingestion/source/metabase.py index af41a74f311f64..d22bfb2b8b52ff 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/metabase.py +++ b/metadata-ingestion/src/datahub/ingestion/source/metabase.py @@ -90,10 +90,17 @@ class MetabaseSource(Source): """ This plugin extracts Charts, dashboards, and associated metadata. This plugin is in beta and has only been tested on PostgreSQL and H2 database. - ### Dashboard - [/api/dashboard](https://www.metabase.com/docs/latest/api-documentation.html#dashboard) endpoint is used to - retrieve the following dashboard information. + ### Collection + + [/api/collection](https://www.metabase.com/docs/latest/api/collection) endpoint is used to + retrieve the available collections. + + [/api/collection//items?models=dashboard](https://www.metabase.com/docs/latest/api/collection#get-apicollectioniditems) endpoint is used to retrieve a given collection and list their dashboards. + + ### Dashboard + + [/api/dashboard/](https://www.metabase.com/docs/latest/api/dashboard) endpoint is used to retrieve a given Dashboard and grab its information. - Title and description - Last edited by @@ -187,19 +194,29 @@ def close(self) -> None: def emit_dashboard_mces(self) -> Iterable[MetadataWorkUnit]: try: - dashboard_response = self.session.get( - f"{self.config.connect_uri}/api/dashboard" + collections_response = self.session.get( + f"{self.config.connect_uri}/api/collection/" ) - dashboard_response.raise_for_status() - dashboards = dashboard_response.json() + collections_response.raise_for_status() + collections = collections_response.json() - for dashboard_info in dashboards: - dashboard_snapshot = self.construct_dashboard_from_api_data( - dashboard_info + for collection in collections: + collection_dashboards_response = self.session.get( + f"{self.config.connect_uri}/api/collection/{collection['id']}/items?models=dashboard" ) - if dashboard_snapshot is not None: - mce = MetadataChangeEvent(proposedSnapshot=dashboard_snapshot) - yield MetadataWorkUnit(id=dashboard_snapshot.urn, mce=mce) + collection_dashboards_response.raise_for_status() + collection_dashboards = collection_dashboards_response.json() + + if not collection_dashboards.get("data"): + continue + + for dashboard_info in collection_dashboards.get("data"): + dashboard_snapshot = self.construct_dashboard_from_api_data( + dashboard_info + ) + if dashboard_snapshot is not None: + mce = MetadataChangeEvent(proposedSnapshot=dashboard_snapshot) + yield MetadataWorkUnit(id=dashboard_snapshot.urn, mce=mce) except HTTPError as http_error: self.report.report_failure( @@ -254,10 +271,10 @@ def construct_dashboard_from_api_data( ) chart_urns = [] - cards_data = dashboard_details.get("ordered_cards", "{}") + cards_data = dashboard_details.get("dashcards", {}) for card_info in cards_data: chart_urn = builder.make_chart_urn( - self.platform, card_info.get("card_id", "") + self.platform, card_info.get("card").get("id", "") ) chart_urns.append(chart_urn) diff --git a/metadata-ingestion/tests/integration/metabase/metabase_mces_golden.json b/metadata-ingestion/tests/integration/metabase/metabase_mces_golden.json index 9b143348fdf603..10c1c312a4d1c7 100644 --- a/metadata-ingestion/tests/integration/metabase/metabase_mces_golden.json +++ b/metadata-ingestion/tests/integration/metabase/metabase_mces_golden.json @@ -191,20 +191,73 @@ "description": "", "charts": [ "urn:li:chart:(metabase,1)", - "urn:li:chart:(metabase,2)" + "urn:li:chart:(metabase,2)", + "urn:li:chart:(metabase,3)" ], "datasets": [], "lastModified": { "created": { - "time": 1639417721742, + "time": 1705398694904, "actor": "urn:li:corpuser:admin@metabase.com" }, "lastModified": { - "time": 1639417721742, + "time": 1705398694904, "actor": "urn:li:corpuser:admin@metabase.com" } }, - "dashboardUrl": "http://localhost:3000/dashboard/1" + "dashboardUrl": "http://localhost:3000/dashboard/10" + } + }, + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpuser:admin@metabase.com", + "type": "DATAOWNER" + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1636614000000, + "runId": "metabase-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DashboardSnapshot": { + "urn": "urn:li:dashboard:(metabase,1)", + "aspects": [ + { + "com.linkedin.pegasus2avro.dashboard.DashboardInfo": { + "customProperties": {}, + "title": "Dashboard 1", + "description": "", + "charts": [ + "urn:li:chart:(metabase,1)", + "urn:li:chart:(metabase,2)", + "urn:li:chart:(metabase,3)" + ], + "datasets": [], + "lastModified": { + "created": { + "time": 1705398694904, + "actor": "urn:li:corpuser:admin@metabase.com" + }, + "lastModified": { + "time": 1705398694904, + "actor": "urn:li:corpuser:admin@metabase.com" + } + }, + "dashboardUrl": "http://localhost:3000/dashboard/10" } }, { diff --git a/metadata-ingestion/tests/integration/metabase/setup/collection_dashboards.json b/metadata-ingestion/tests/integration/metabase/setup/collection_dashboards.json new file mode 100644 index 00000000000000..b602d2dfb7dcda --- /dev/null +++ b/metadata-ingestion/tests/integration/metabase/setup/collection_dashboards.json @@ -0,0 +1 @@ +{"total": 1, "data": [{"description": null, "collection_position": null, "database_id": null, "name": "This is a test", "id": 10, "entity_id": "Q4gEaOmoBkfQX3_gXiH9g", "last-edit-info": {"id": 14, "last_name": "Doe", "first_name": "John", "email": "john.doe@somewhere.com", "timestamp": "2024-01-12T14:55:38.43304Z"}, "model": "dashboard"}], "models": ["dashboard"], "limit": null, "offset": null} diff --git a/metadata-ingestion/tests/integration/metabase/setup/collections.json b/metadata-ingestion/tests/integration/metabase/setup/collections.json new file mode 100644 index 00000000000000..a8a98c4e6d62ee --- /dev/null +++ b/metadata-ingestion/tests/integration/metabase/setup/collections.json @@ -0,0 +1 @@ +[{"authority_level": null, "can_write": true, "name": "Our analytics", "effective_ancestors": [], "effective_location": null, "parent_id": null, "id": "root", "is_personal": false}, {"authority_level": null, "description": null, "archived": false, "slug": "john_doe_personal_collection", "can_write": true, "name": "John Doe", "personal_owner_id": 14, "type": null, "id": 150, "entity_id": "kdLA_-CQy4F5lL15k8-TU", "location": "/", "namespace": null, "is_personal": true, "created_at": "2024-01-12T11:51:24.394309Z"}] diff --git a/metadata-ingestion/tests/integration/metabase/setup/dashboard.json b/metadata-ingestion/tests/integration/metabase/setup/dashboard.json deleted file mode 100644 index 095abf1bbdc6d5..00000000000000 --- a/metadata-ingestion/tests/integration/metabase/setup/dashboard.json +++ /dev/null @@ -1,40 +0,0 @@ -[{ - "description": null, - "archived": false, - "collection_position": null, - "creator": { - "email": "admin@metabase.com", - "first_name": "FirstName", - "last_login": "2021-12-13T18:51:32.999", - "is_qbnewb": true, - "is_superuser": true, - "id": 1, - "last_name": "LastName", - "date_joined": "2021-12-13T07:34:21.806", - "common_name": "FirstName LastName" - }, - "enable_embedding": false, - "collection_id": null, - "show_in_getting_started": false, - "name": "Dashboard 1", - "caveats": null, - "creator_id": 1, - "updated_at": "2021-12-13T17:48:41.735", - "made_public_by_id": null, - "embedding_params": null, - "cache_ttl": null, - "id": 1, - "position": null, - "last-edit-info": { - "id": 1, - "email": "admin@metabase.com", - "first_name": "FirstName", - "last_name": "LastName", - "timestamp": "2021-12-13T17:48:41.742" - }, - "parameters": [], - "favorite": false, - "created_at": "2021-12-13T17:46:48.185", - "public_uuid": null, - "points_of_interest": null -}] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/metabase/setup/dashboard_1.json b/metadata-ingestion/tests/integration/metabase/setup/dashboard_1.json index 288087a67da6dd..e968093c438508 100644 --- a/metadata-ingestion/tests/integration/metabase/setup/dashboard_1.json +++ b/metadata-ingestion/tests/integration/metabase/setup/dashboard_1.json @@ -2,332 +2,854 @@ "description": null, "archived": false, "collection_position": null, - "ordered_cards": [{ - "sizeX": 4, - "series": [], - "collection_authority_level": null, - "card": { - "description": null, - "archived": false, - "collection_position": null, - "table_id": null, - "result_metadata": [{ - "name": "customer_id", - "display_name": "customer_id", - "base_type": "type/Integer", - "effective_type": "type/Integer", - "field_ref": ["field", "customer_id", { - "base-type": "type/Integer" - }], - "semantic_type": null, - "fingerprint": { - "global": { - "distinct-count": 517, - "nil%": 0.0 + "dashcards": [ + { + "size_x": 12, + "dashboard_tab_id": null, + "series": [], + "action_id": null, + "collection_authority_level": null, + "card": { + "description": null, + "archived": false, + "collection_position": null, + "table_id": null, + "result_metadata": [ + { + "display_name": "EVENT_DATE", + "field_ref": [ + "field", + "EVENT_DATE", + { + "base-type": "type/Date" + } + ], + "name": "EVENT_DATE", + "base_type": "type/Date", + "effective_type": "type/Date", + "semantic_type": null, + "fingerprint": { + "global": { + "distinct-count": 28, + "nil%": 0 + }, + "type": { + "type/DateTime": { + "earliest": "2023-12-04T00:00:00Z", + "latest": "2024-01-15T00:00:00Z" + } + } + } }, - "type": { - "type/Number": { - "min": 1.0, - "q1": 127.95550051624855, - "q3": 457.48181481488376, - "max": 599.0, - "sd": 183.35453319901166, - "avg": 293.316 + { + "display_name": "AND_VIEWERS", + "field_ref": [ + "field", + "AND_VIEWERS", + { + "base-type": "type/Number" + } + ], + "name": "AND_VIEWERS", + "base_type": "type/Number", + "effective_type": "type/Number", + "semantic_type": null, + "fingerprint": { + "global": { + "distinct-count": 28, + "nil%": 0 + }, + "type": { + "type/Number": { + "min": 4720, + "q1": 5083.5, + "q3": 9003, + "max": 10560, + "sd": 2090.2420089751945, + "avg": 6688.214285714285 + } + } } - } - } - }, { - "name": "first_name", - "display_name": "first_name", - "base_type": "type/Text", - "effective_type": "type/Text", - "field_ref": ["field", "first_name", { - "base-type": "type/Text" - }], - "semantic_type": "type/Name", - "fingerprint": { - "global": { - "distinct-count": 509, - "nil%": 0.0 }, - "type": { - "type/Text": { - "percent-json": 0.0, - "percent-url": 0.0, - "percent-email": 0.0, - "percent-state": 0.0035, - "average-length": 5.629 + { + "display_name": "AND_REDACTED", + "field_ref": [ + "field", + "AND_REDACTED", + { + "base-type": "type/Number" + } + ], + "name": "AND_REDACTED", + "base_type": "type/Number", + "effective_type": "type/Number", + "semantic_type": null, + "fingerprint": { + "global": { + "distinct-count": 28, + "nil%": 0 + }, + "type": { + "type/Number": { + "min": 948, + "q1": 2019.5, + "q3": 2500.5, + "max": 3180, + "sd": 460.56365857271413, + "avg": 2251.0714285714284 + } + } } - } - } - }, { - "name": "last_name", - "display_name": "last_name", - "base_type": "type/Text", - "effective_type": "type/Text", - "field_ref": ["field", "last_name", { - "base-type": "type/Text" - }], - "semantic_type": "type/Name", - "fingerprint": { - "global": { - "distinct-count": 517, - "nil%": 0.0 }, - "type": { - "type/Text": { - "percent-json": 0.0, - "percent-url": 0.0, - "percent-email": 0.0, - "percent-state": 0.0015, - "average-length": 6.126 + { + "display_name": "AND_REDACTED", + "field_ref": [ + "field", + "AND_REDACTED", + { + "base-type": "type/Number" + } + ], + "name": "AND_REDACTED", + "base_type": "type/Number", + "effective_type": "type/Number", + "semantic_type": null, + "fingerprint": { + "global": { + "distinct-count": 28, + "nil%": 0 + }, + "type": { + "type/Number": { + "min": 3545, + "q1": 10909, + "q3": 13916, + "max": 18861, + "sd": 3132.780684756446, + "avg": 12122.32142857143 + } + } } - } - } - }, { - "name": "amount", - "display_name": "amount", - "base_type": "type/Decimal", - "effective_type": "type/Decimal", - "field_ref": ["field", "amount", { - "base-type": "type/Decimal" - }], - "semantic_type": null, - "fingerprint": { - "global": { - "distinct-count": 11, - "nil%": 0.0 }, - "type": { - "type/Number": { - "min": 0.99, - "q1": 2.399411317392306, - "q3": 5.52734176879965, - "max": 10.99, - "sd": 2.352151368009511, - "avg": 4.1405 + { + "display_name": "IOS_VIEWERS", + "field_ref": [ + "field", + "IOS_VIEWERS", + { + "base-type": "type/Number" + } + ], + "name": "IOS_VIEWERS", + "base_type": "type/Number", + "effective_type": "type/Number", + "semantic_type": null, + "fingerprint": { + "global": { + "distinct-count": 28, + "nil%": 0 + }, + "type": { + "type/Number": { + "min": 6477, + "q1": 7481.5, + "q3": 10428.5, + "max": 13182, + "sd": 1948.047456520796, + "avg": 9075.17857142857 + } + } } - } - } - }, { - "name": "payment_date", - "display_name": "payment_date", - "base_type": "type/DateTime", - "effective_type": "type/DateTime", - "field_ref": ["field", "payment_date", { - "base-type": "type/DateTime" - }], - "semantic_type": null, - "fingerprint": { - "global": { - "distinct-count": 1998, - "nil%": 0.0 }, - "type": { - "type/DateTime": { - "earliest": "2007-02-14T21:21:59.996577Z", - "latest": "2007-02-21T19:27:46.996577Z" + { + "display_name": "IOS_REDACTED", + "field_ref": [ + "field", + "IOS_REDACTED", + { + "base-type": "type/Number" + } + ], + "name": "IOS_REDACTED", + "base_type": "type/Number", + "effective_type": "type/Number", + "semantic_type": null, + "fingerprint": { + "global": { + "distinct-count": 28, + "nil%": 0 + }, + "type": { + "type/Number": { + "min": 1470, + "q1": 3020, + "q3": 3806, + "max": 4670, + "sd": 665.7415088559197, + "avg": 3415.8571428571427 + } + } } - } - } - }, { - "name": "rental_id", - "display_name": "rental_id", - "base_type": "type/Integer", - "effective_type": "type/Integer", - "field_ref": ["field", "rental_id", { - "base-type": "type/Integer" - }], - "semantic_type": null, - "fingerprint": { - "global": { - "distinct-count": 2000, - "nil%": 0.0 }, - "type": { - "type/Number": { - "min": 1158.0, - "q1": 1731.7967120913397, - "q3": 2871.359273326854, - "max": 4591.0, - "sd": 660.7468728104022, - "avg": 2303.4565 + { + "display_name": "IOS_REDACTED", + "field_ref": [ + "field", + "IOS_REDACTED", + { + "base-type": "type/Number" + } + ], + "name": "IOS_REDACTED", + "base_type": "type/Number", + "effective_type": "type/Number", + "semantic_type": null, + "fingerprint": { + "global": { + "distinct-count": 28, + "nil%": 0 + }, + "type": { + "type/Number": { + "min": 4872, + "q1": 15019.5, + "q3": 20457, + "max": 27466, + "sd": 4688.492913816769, + "avg": 17683.89285714286 + } + } + } + }, + { + "display_name": "IOS_REDACTED/IOS_VIEWERS", + "field_ref": [ + "field", + "IOS_REDACTED/IOS_VIEWERS", + { + "base-type": "type/Number" + } + ], + "name": "IOS_REDACTED/IOS_VIEWERS", + "base_type": "type/Number", + "effective_type": "type/Number", + "semantic_type": null, + "fingerprint": { + "global": { + "distinct-count": 28, + "nil%": 0 + }, + "type": { + "type/Number": { + "min": 0.662587, + "q1": 1.8403745, + "q3": 2.241517, + "max": 2.576166, + "sd": 0.4488826998266724, + "avg": 1.974007857142857 + } + } + } + }, + { + "display_name": "AND_REDACTED/AND_VIEWERS", + "field_ref": [ + "field", + "AND_REDACTED/AND_VIEWERS", + { + "base-type": "type/Number" + } + ], + "name": "AND_REDACTED/AND_VIEWERS", + "base_type": "type/Number", + "effective_type": "type/Number", + "semantic_type": null, + "fingerprint": { + "global": { + "distinct-count": 28, + "nil%": 0 + }, + "type": { + "type/Number": { + "min": 0.671656, + "q1": 1.3536655, + "q3": 2.5325145, + "max": 3.097553, + "sd": 0.6816847359625038, + "avg": 1.93937275 + } + } + } + }, + { + "display_name": "IOS_REDACTED/IOS_VIEWERS", + "field_ref": [ + "field", + "IOS_REDACTED/IOS_VIEWERS", + { + "base-type": "type/Number" + } + ], + "name": "IOS_REDACTED/IOS_VIEWERS", + "base_type": "type/Number", + "effective_type": "type/Number", + "semantic_type": null, + "fingerprint": { + "global": { + "distinct-count": 28, + "nil%": 0 + }, + "type": { + "type/Number": { + "min": 0.199918, + "q1": 0.34496099999999996, + "q3": 0.4352085, + "max": 0.47286, + "sd": 0.06928869477079941, + "avg": 0.3833206785714286 + } + } + } + }, + { + "display_name": "AND_REDACTED/AND_VIEWERS", + "field_ref": [ + "field", + "AND_REDACTED/AND_VIEWERS", + { + "base-type": "type/Number" + } + ], + "name": "AND_REDACTED/AND_VIEWERS", + "base_type": "type/Number", + "effective_type": "type/Number", + "semantic_type": null, + "fingerprint": { + "global": { + "distinct-count": 28, + "nil%": 0 + }, + "type": { + "type/Number": { + "min": 0.179613, + "q1": 0.245343, + "q3": 0.475772, + "max": 0.522253, + "sd": 0.11732033433182058, + "avg": 0.3620892142857142 + } + } } } - } - }], - "database_id": 2, - "enable_embedding": false, - "collection_id": null, - "query_type": "native", - "name": "Customer Payment", - "query_average_duration": 820, - "creator_id": 1, - "moderation_reviews": [], - "updated_at": "2021-12-13T17:48:40.478", - "made_public_by_id": null, - "embedding_params": null, - "cache_ttl": null, - "dataset_query": { - "type": "native", - "native": { - "query": "SELECT\n\tcustomer.customer_id,\n\tfirst_name,\n\tlast_name,\n\tamount,\n\tpayment_date,\n\trental_id\nFROM\n\tcustomer\nINNER JOIN payment \n ON payment.customer_id = customer.customer_id\nORDER BY payment_date", - "template-tags": {} + ], + "can_write": true, + "database_id": 3, + "enable_embedding": false, + "collection_id": 112, + "query_type": "native", + "name": "REDACTED iOS vs. Android", + "query_average_duration": 50982, + "creator_id": 42, + "moderation_reviews": [], + "updated_at": "2024-01-16T13:34:29.916717Z", + "made_public_by_id": null, + "embedding_params": null, + "cache_ttl": null, + "dataset_query": { + "type": "native", + "native": { + "query": "-- 1. Table with redacted search users Android\n-- 2. Table with redacted search users iOS \n-- 3. Redacted from Android redacted\n-- 4. redacted from iOS\n-- 5. Compare the numbers iOS vs. Android\n\n\n-- 1. Table with redacted search users Android (to include date, platform, auth_account_id)\n-- 2. Table with redacted search users iOS (to include date, platform, auth_account_id)\n-- 3. Redacted from Android redacted (to include date, platform, count of redacted)\n-- 4. Redacted from iOS redacted (to include date, plaform, count of redacted)\n-- 5. Compare the numbers iOS vs. Android\n\nwith AND_viewers as \n(\nselect event_date, platform, auth_account_id \nfrom TEAMS_PRD.REDACTED.MRT_CURR__MPARTICLE_SCREEN_VIEWS\nwhere screen_name='redacted_search'\nand event_date>'2023-12-01'\nand platform='Android'\nand dayofweekiso(event_date) NOT IN (6,7)\ngroup by event_date, platform, auth_account_id\norder by event_date desc\n), \niOS_viewers as \n(\nselect event_date, platform, auth_account_id \nfrom TEAMS_PRD.REDACTED.MRT_CURR__MPARTICLE_SCREEN_VIEWS\nwhere screen_name='redacted_search'\nand event_date>'2023-12-01'\nand platform='iOS'\nand dayofweekiso(event_date) NOT IN (6,7)\ngroup by event_date, platform, auth_account_id\norder by event_date desc\n), \nAND_redacted as\n(\nselect redacted_ts::date as redacted_date, platform, count(distinct at.auth_account_id) as AND_redacted, count(group_redacted_id) as AND_redacted\nfrom TEAMS_PRD.REDACTED.MRT_CURR__REDACTED_CUSTOMER at\njoin AND_viewers av on av.event_date=at.redacted_ts::date and av.auth_account_id=at.auth_account_id\nwhere instrument_type='REDACTED'\ngroup by 1,2\norder by 1 desc\n), \niOS_redacted as\n(\nselect redacted_ts::date as redacted_date, platform, count(distinct it.auth_account_id) as iOS_redacted, count(group_redacted_id) as iOS_redacted\nfrom TEAMS_PRD.REDACTED.MRT_CURR__REDACTED_CUSTOMER it\njoin iOS_viewers iv on iv.event_date=it.redacted_ts::date and iv.auth_account_id=it.auth_account_id\nwhere instrument_type='REDACTED'\ngroup by 1,2\norder by 1 desc\n)\nselect a.event_date, count(distinct a.auth_account_id) as AND_viewers, AND_redacted, AND_redacted, count(distinct i.auth_account_id) as iOS_viewers, iOS_redacted, iOS_redacted, iOS_redacted/iOS_viewers, AND_redacted/AND_viewers, iOS_redacted/iOS_viewers, AND_redacted/AND_viewers\nfrom AND_VIEWERS a\njoin AND_redacted at\non a.event_date=at.redacted_date\njoin ios_viewers i\non a.event_date=i.event_date\njoin ios_redacted it\non i.event_date=it.redacted_date\ngroup by 1, 3, 4, 6, 7\norder by 1 desc\n\n\n", + "template-tags": {} + }, + "database": 3 }, - "database": 2 - }, - "id": 1, - "display": "table", - "visualization_settings": { - "table.pivot_column": "amount", - "table.cell_column": "customer_id" + "id": 1, + "parameter_mappings": [], + "display": "line", + "entity_id": "DhQgvvtTEarZH8yQBlqES", + "collection_preview": true, + "visualization_settings": { + "graph.dimensions": [ + "EVENT_DATE" + ], + "series_settings": { + "IOS_REDACTED/IOS_VIEWERS": { + "axis": "right" + }, + "AND_REDACTED/AND_VIEWERS": { + "axis": "right" + } + }, + "graph.metrics": [ + "IOS_REDACTED/IOS_VIEWERS", + "AND_REDACTED/AND_VIEWERS", + "AND_VIEWERS", + "IOS_VIEWERS" + ] + }, + "metabase_version": "v0.48.3 (80d8323)", + "parameters": [], + "dataset": false, + "created_at": "2024-01-16T09:44:49.407327Z", + "public_uuid": null }, - "created_at": "2021-12-13T17:46:32.77", - "public_uuid": null + "updated_at": "2024-01-16T09:45:45.410379Z", + "col": 0, + "id": 12, + "parameter_mappings": [], + "card_id": 1, + "entity_id": "tA9M9vJlTHG0KxQnvknKW", + "visualization_settings": {}, + "size_y": 6, + "dashboard_id": 1, + "created_at": "2024-01-16T09:45:45.410379Z", + "row": 0 }, - "updated_at": "2021-12-13T17:48:41.68", - "col": 0, - "id": 1, - "parameter_mappings": [], - "card_id": 1, - "visualization_settings": {}, - "dashboard_id": 1, - "created_at": "2021-12-13T17:46:52.278", - "sizeY": 4, - "row": 0 - }, { - "sizeX": 4, - "series": [], - "collection_authority_level": null, - "card": { - "description": null, - "archived": false, - "collection_position": null, - "table_id": 21, - "result_metadata": [{ - "semantic_type": "type/Category", - "coercion_strategy": null, - "name": "rating", - "field_ref": ["field", 131, null], - "effective_type": "type/*", - "id": 131, - "display_name": "Rating", - "fingerprint": { - "global": { - "distinct-count": 5, - "nil%": 0.0 + { + "size_x": 12, + "dashboard_tab_id": null, + "series": [], + "action_id": null, + "collection_authority_level": null, + "card": { + "description": null, + "archived": false, + "collection_position": null, + "table_id": null, + "result_metadata": [ + { + "display_name": "CALENDAR_DATE", + "field_ref": [ + "field", + "CALENDAR_DATE", + { + "base-type": "type/Date" + } + ], + "name": "CALENDAR_DATE", + "base_type": "type/Date", + "effective_type": "type/Date", + "semantic_type": null, + "fingerprint": { + "global": { + "distinct-count": 30, + "nil%": 0 + }, + "type": { + "type/DateTime": { + "earliest": "2023-12-17T00:00:00Z", + "latest": "2024-01-15T00:00:00Z" + } + } + } + }, + { + "display_name": "REDACTED", + "field_ref": [ + "field", + "REDACTED", + { + "base-type": "type/Number" + } + ], + "name": "REDACTED", + "base_type": "type/Number", + "effective_type": "type/Number", + "semantic_type": null, + "fingerprint": { + "global": { + "distinct-count": 27, + "nil%": 0 + }, + "type": { + "type/Number": { + "min": 682175, + "q1": 738644, + "q3": 805974, + "max": 847312, + "sd": 46783.99996291344, + "avg": 775505.5666666667 + } + } + } }, - "type": { - "type/Text": { - "percent-json": 0.0, - "percent-url": 0.0, - "percent-email": 0.0, - "percent-state": 0.0, - "average-length": 2.926 + { + "display_name": "REDACTEDRS", + "field_ref": [ + "field", + "REDACTEDRS", + { + "base-type": "type/Number" + } + ], + "name": "REDACTEDRS", + "base_type": "type/Number", + "effective_type": "type/Number", + "semantic_type": null, + "fingerprint": { + "global": { + "distinct-count": 27, + "nil%": 0 + }, + "type": { + "type/Number": { + "min": 46173, + "q1": 47556.94427191, + "q3": 48890, + "max": 50769, + "sd": 1164.9989906758983, + "avg": 48354.8 + } + } + } + }, + { + "display_name": "REDACTED/REDACTEDRS", + "field_ref": [ + "field", + "REDACTED/REDACTEDRS", + { + "base-type": "type/Number" + } + ], + "name": "REDACTED/REDACTEDRS", + "base_type": "type/Number", + "effective_type": "type/Number", + "semantic_type": null, + "fingerprint": { + "global": { + "distinct-count": 27, + "nil%": 0 + }, + "type": { + "type/Number": { + "min": 14.706168, + "q1": 15.398378, + "q3": 16.920933, + "max": 17.289964, + "sd": 0.8020030995826715, + "avg": 16.033017833333336 + } + } } } + ], + "can_write": true, + "database_id": 3, + "enable_embedding": false, + "collection_id": 112, + "query_type": "native", + "name": "Redacted redacted per redacted user", + "query_average_duration": 20433, + "creator_id": 1, + "moderation_reviews": [], + "updated_at": "2024-01-16T13:34:29.916788Z", + "made_public_by_id": null, + "embedding_params": null, + "cache_ttl": null, + "dataset_query": { + "type": "native", + "native": { + "query": "with dd as (\nselect distinct calendar_date as calendar_date from TEAMS_PRD.DATA_PLATFORM_MART.MRT__CALENDAR_DATES\nwhere calendar_date>'2022-01-01'\n), \nredacted as\n(\nselect dd.calendar_date, count(distinct auth_account_id) as redacted, max(redacted_ts), min(redacted_ts)\nfrom TEAMS_PRD.REDACTED.MRT_CURR__REDACTED_CUSTOMER t\njoin dd on redacted_ts::date BETWEEN dd.calendar_date-29 and dd.calendar_date\nwhere redacted_type='REGULAR'\nand instrument_type = 'REDACTED'\ngroup by dd.calendar_date\norder by dd.calendar_date desc\n),\nredacted as\n(\nselect dd.calendar_date, count(group_redacted_id) as redacted, max(redacted_ts), min(redacted_ts)\nfrom TEAMS_PRD.REDACTED.MRT_CURR__REDACTED_CUSTOMER t\njoin dd on redacted_ts::date BETWEEN dd.calendar_date-29 and dd.calendar_date\nwhere redacted_type='REGULAR'\nand instrument_type = 'REDACTED'\ngroup by dd.calendar_date\norder by dd.calendar_date desc\n)\nselect dd.calendar_date, redacted, redacted, redacted/redacted\nfrom dd\njoin redacted t on dd.calendar_date=t.calendar_date\njoin redacted tr on dd.calendar_date=tr.calendar_date\ngroup by dd.calendar_date, redacted, redacted, redacted/redacted\norder by dd.calendar_date desc \nlimit 30", + "template-tags": {} + }, + "database": 3 + }, + "id": 2, + "parameter_mappings": [], + "display": "line", + "entity_id": "b1jUcPcQM0XFMuviv4g3K", + "collection_preview": true, + "visualization_settings": { + "graph.dimensions": [ + "CALENDAR_DATE" + ], + "series_settings": { + "REDACTEDRS": { + "axis": "right" + } + }, + "graph.metrics": [ + "REDACTED/REDACTEDRS", + "REDACTEDRS" + ] }, - "base_type": "type/PostgresEnum" - }, { - "name": "count", - "display_name": "Count", - "base_type": "type/BigInteger", - "effective_type": "type/BigInteger", - "semantic_type": "type/Quantity", - "field_ref": ["aggregation", 0], - "fingerprint": { - "global": { - "distinct-count": 5, - "nil%": 0.0 + "metabase_version": "v0.48.3 (80d8323)", + "parameters": [], + "dataset": false, + "created_at": "2024-01-16T09:50:09.487369Z", + "public_uuid": null + }, + "updated_at": "2024-01-16T09:50:34.394488Z", + "col": 12, + "id": 1, + "parameter_mappings": [], + "card_id": 2, + "entity_id": "lXypX5aa14HjkN_Im82C2", + "visualization_settings": {}, + "size_y": 6, + "dashboard_id": 1, + "created_at": "2024-01-16T09:50:34.394488Z", + "row": 0 + }, + { + "size_x": 12, + "dashboard_tab_id": null, + "series": [], + "action_id": null, + "collection_authority_level": null, + "card": { + "description": null, + "archived": false, + "collection_position": null, + "table_id": null, + "result_metadata": [ + { + "display_name": "EVENT_DATE", + "field_ref": [ + "field", + "EVENT_DATE", + { + "base-type": "type/Date" + } + ], + "name": "EVENT_DATE", + "base_type": "type/Date", + "effective_type": "type/Date", + "semantic_type": null, + "fingerprint": { + "global": { + "distinct-count": 11, + "nil%": 0 + }, + "type": { + "type/DateTime": { + "earliest": "2024-01-01T00:00:00Z", + "latest": "2024-01-15T00:00:00Z" + } + } + } + }, + { + "display_name": "KNOCKOUT", + "field_ref": [ + "field", + "KNOCKOUT", + { + "base-type": "type/Number" + } + ], + "name": "KNOCKOUT", + "base_type": "type/Number", + "effective_type": "type/Number", + "semantic_type": null, + "fingerprint": { + "global": { + "distinct-count": 11, + "nil%": 0 + }, + "type": { + "type/Number": { + "min": 175, + "q1": 853.75, + "q3": 1116.75, + "max": 1174, + "sd": 296.0767713709648, + "avg": 916.3636363636364 + } + } + } + }, + { + "display_name": "EXPIRY", + "field_ref": [ + "field", + "EXPIRY", + { + "base-type": "type/Number" + } + ], + "name": "EXPIRY", + "base_type": "type/Number", + "effective_type": "type/Number", + "semantic_type": null, + "fingerprint": { + "global": { + "distinct-count": 10, + "nil%": 0 + }, + "type": { + "type/Number": { + "min": 78, + "q1": 295.5, + "q3": 408.3925271309261, + "max": 431, + "sd": 105.10704500218294, + "avg": 336.90909090909093 + } + } + } }, - "type": { - "type/Number": { - "min": 178.0, - "q1": 190.0, - "q3": 213.25, - "max": 223.0, - "sd": 17.131841699011815, - "avg": 200.0 + { + "display_name": "PRODUCT", + "field_ref": [ + "field", + "PRODUCT", + { + "base-type": "type/Number" + } + ], + "name": "PRODUCT", + "base_type": "type/Number", + "effective_type": "type/Number", + "semantic_type": null, + "fingerprint": { + "global": { + "distinct-count": 9, + "nil%": 0 + }, + "type": { + "type/Number": { + "min": 57, + "q1": 163.75, + "q3": 233, + "max": 255, + "sd": 59.31119777763877, + "avg": 195.27272727272728 + } + } + } + }, + { + "display_name": "ISSUER", + "field_ref": [ + "field", + "ISSUER", + { + "base-type": "type/Number" + } + ], + "name": "ISSUER", + "base_type": "type/Number", + "effective_type": "type/Number", + "semantic_type": null, + "fingerprint": { + "global": { + "distinct-count": 10, + "nil%": 0 + }, + "type": { + "type/Number": { + "min": 43, + "q1": 214, + "q3": 292.25, + "max": 304, + "sd": 79.35879397910594, + "avg": 245.72727272727272 + } + } } } - } - }], - "database_id": 2, - "enable_embedding": false, - "collection_id": null, - "query_type": "query", - "name": "Films, Count, Grouped by Rating, Filtered by Release Year, Sorted by [Unknown Field] descending", - "query_average_duration": 25, - "creator_id": 1, - "moderation_reviews": [], - "updated_at": "2021-12-13T17:48:39.999", - "made_public_by_id": null, - "embedding_params": null, - "cache_ttl": null, - "dataset_query": { - "query": { - "source-table": 21, - "breakout": [ - ["field", 131, null] - ], - "aggregation": [ - ["count"] - ], - "order-by": [ - ["desc", ["aggregation", 0]] + ], + "can_write": true, + "database_id": 3, + "enable_embedding": false, + "collection_id": 112, + "query_type": "native", + "name": "Filter popularity", + "query_average_duration": 2830, + "creator_id": 1, + "moderation_reviews": [], + "updated_at": "2024-01-16T13:34:30.128815Z", + "made_public_by_id": null, + "embedding_params": null, + "cache_ttl": null, + "dataset_query": { + "type": "native", + "native": { + "query": "with issuer as\n(\n select event_date, count(*) as issuer_clicks, count(distinct auth_account_id) as issuer\n from TEAMS_PRD.REDACTED.MRT_CURR__MPARTICLE_EVENTS\n where event_name='redacted_search_filter_button_tapped' \n and event_attributes:filter_option::varchar='issuer'\n and event_date>'2023-12-31'\n and platform='Android'\n and dayofweekiso(event_date) NOT IN (6,7)\n and event_attributes:redacted_type::varchar='knock_out_product'\n group by 1\n order by 1 desc\n), expiry as\n(\n select event_date, count(*) as expiry_clicks, count(distinct auth_account_id) as expiry\n from TEAMS_PRD.REDACTED.MRT_CURR__MPARTICLE_EVENTS\n where event_name='redacted_search_filter_button_tapped' \n and event_attributes:filter_option::varchar='expiry'\n and event_date>'2023-12-31'\n and platform='Android'\n and dayofweekiso(event_date) NOT IN (6,7)\n and event_attributes:redacted_type::varchar='knock_out_product'\n group by 1\n order by 1 desc\n), product as\n(\n select event_date, count(*) as product_clicks, count(distinct auth_account_id) as product\n from TEAMS_PRD.REDACTED.MRT_CURR__MPARTICLE_EVENTS\n where event_name='redacted_search_filter_button_tapped' \n and event_attributes:filter_option::varchar='product'\n and event_date>'2023-12-31'\n and platform='Android'\n and dayofweekiso(event_date) NOT IN (6,7)\n and event_attributes:redacted_type::varchar='knock_out_product'\n group by 1\n order by 1 desc\n), knockout as \n(\n select event_date, count(*) as knockout_clicks, count(distinct auth_account_id) as knockout\n from TEAMS_PRD.SCHEMA.MRT_CURR__MPARTICLE_EVENTS\n where event_name='redacted_search_filter_button_tapped' \n and event_attributes:filter_option::varchar='knockout'\n and event_date>'2023-12-31'\n and platform='Android'\n and dayofweekiso(event_date) NOT IN (6,7)\n and event_attributes:redacted_type::varchar='knock_out_product'\n group by 1\n order by 1 desc\n)\nselect k.event_date, knockout, expiry, product, issuer\nfrom knockout k\njoin expiry e on k.event_date=e.event_date\njoin issuer i on k.event_date=i.event_date\njoin product p on k.event_date=p.event_date\nwhere k.event_date Date: Fri, 26 Jan 2024 20:54:06 +0200 Subject: [PATCH 03/14] fix(ingest/glue): Profiling breaks for non-partitioned tables due to absent `Table.PartitionKeys` (#9591) --- metadata-ingestion/src/datahub/ingestion/source/aws/glue.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py b/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py index 826c18f69fd013..93601533bf8d6d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py +++ b/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py @@ -833,9 +833,8 @@ def get_profile_if_enabled( **{k: v for k, v in kwargs.items() if v} ) - partition_keys = response["Table"]["PartitionKeys"] - # check if this table is partitioned + partition_keys = response["Table"].get("PartitionKeys") if partition_keys: # ingest data profile with partitions # for cross-account ingestion From 051f570c47386540266e088d396feed70784f9d5 Mon Sep 17 00:00:00 2001 From: RyanHolstien Date: Fri, 26 Jan 2024 14:17:14 -0600 Subject: [PATCH 04/14] fix(search): fix filters for hasX and numValues fields (#9729) --- .../metadata/models/ConfigEntitySpec.java | 12 ++++ .../metadata/models/DefaultEntitySpec.java | 12 ++++ .../linkedin/metadata/models/EntitySpec.java | 45 +++++++++++---- .../elasticsearch/query/ESBrowseDAO.java | 8 +-- .../elasticsearch/query/ESSearchDAO.java | 3 +- .../request/AutocompleteRequestHandler.java | 10 ++-- .../query/request/SearchRequestHandler.java | 13 +++-- .../metadata/search/utils/ESUtils.java | 55 +++++++++---------- .../ElasticSearchTimeseriesAspectService.java | 32 ++++++----- .../elastic/query/ESAggregatedStatsDAO.java | 2 +- .../fixtures/SampleDataFixtureTestBase.java | 54 ++++++++++++++++++ 11 files changed, 175 insertions(+), 71 deletions(-) diff --git a/entity-registry/src/main/java/com/linkedin/metadata/models/ConfigEntitySpec.java b/entity-registry/src/main/java/com/linkedin/metadata/models/ConfigEntitySpec.java index b235e2adcae11a..8bd89071e299d2 100644 --- a/entity-registry/src/main/java/com/linkedin/metadata/models/ConfigEntitySpec.java +++ b/entity-registry/src/main/java/com/linkedin/metadata/models/ConfigEntitySpec.java @@ -3,10 +3,12 @@ import com.linkedin.data.schema.RecordDataSchema; import com.linkedin.data.schema.TyperefDataSchema; import com.linkedin.metadata.models.annotation.EntityAnnotation; +import com.linkedin.metadata.models.annotation.SearchableAnnotation; import java.util.ArrayList; import java.util.Collection; import java.util.List; import java.util.Map; +import java.util.Set; import java.util.function.Function; import java.util.stream.Collectors; import javax.annotation.Nonnull; @@ -19,6 +21,7 @@ public class ConfigEntitySpec implements EntitySpec { private final Map _aspectSpecs; private List _searchableFieldSpecs; + private Map> searchableFieldTypeMap; public ConfigEntitySpec( @Nonnull final String entityName, @@ -89,4 +92,13 @@ public List getSearchableFieldSpecs() { return _searchableFieldSpecs; } + + @Override + public Map> getSearchableFieldTypes() { + if (searchableFieldTypeMap == null) { + searchableFieldTypeMap = EntitySpec.super.getSearchableFieldTypes(); + } + + return searchableFieldTypeMap; + } } diff --git a/entity-registry/src/main/java/com/linkedin/metadata/models/DefaultEntitySpec.java b/entity-registry/src/main/java/com/linkedin/metadata/models/DefaultEntitySpec.java index 5db8ca264f69dd..2546674f9835cb 100644 --- a/entity-registry/src/main/java/com/linkedin/metadata/models/DefaultEntitySpec.java +++ b/entity-registry/src/main/java/com/linkedin/metadata/models/DefaultEntitySpec.java @@ -3,10 +3,12 @@ import com.linkedin.data.schema.RecordDataSchema; import com.linkedin.data.schema.TyperefDataSchema; import com.linkedin.metadata.models.annotation.EntityAnnotation; +import com.linkedin.metadata.models.annotation.SearchableAnnotation; import java.util.ArrayList; import java.util.Collection; import java.util.List; import java.util.Map; +import java.util.Set; import java.util.function.Function; import java.util.stream.Collectors; import javax.annotation.Nonnull; @@ -24,6 +26,7 @@ public class DefaultEntitySpec implements EntitySpec { private final TyperefDataSchema _aspectTyperefSchema; private List _searchableFieldSpecs; + private Map> searchableFieldTypeMap; public DefaultEntitySpec( @Nonnull final Collection aspectSpecs, @@ -102,4 +105,13 @@ public List getSearchableFieldSpecs() { return _searchableFieldSpecs; } + + @Override + public Map> getSearchableFieldTypes() { + if (searchableFieldTypeMap == null) { + searchableFieldTypeMap = EntitySpec.super.getSearchableFieldTypes(); + } + + return searchableFieldTypeMap; + } } diff --git a/entity-registry/src/main/java/com/linkedin/metadata/models/EntitySpec.java b/entity-registry/src/main/java/com/linkedin/metadata/models/EntitySpec.java index fac08c7e206463..9a75cc1f751d3b 100644 --- a/entity-registry/src/main/java/com/linkedin/metadata/models/EntitySpec.java +++ b/entity-registry/src/main/java/com/linkedin/metadata/models/EntitySpec.java @@ -3,7 +3,9 @@ import com.linkedin.data.schema.RecordDataSchema; import com.linkedin.data.schema.TyperefDataSchema; import com.linkedin.metadata.models.annotation.EntityAnnotation; +import com.linkedin.metadata.models.annotation.SearchableAnnotation; import java.util.Collections; +import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; @@ -39,16 +41,39 @@ default List getSearchableFieldSpecs() { .collect(Collectors.toList()); } - default Map> getSearchableFieldSpecMap() { - return getSearchableFieldSpecs().stream() - .collect( - Collectors.toMap( - searchableFieldSpec -> searchableFieldSpec.getSearchableAnnotation().getFieldName(), - searchableFieldSpec -> new HashSet<>(Collections.singleton(searchableFieldSpec)), - (set1, set2) -> { - set1.addAll(set2); - return set1; - })); + default Map> getSearchableFieldTypes() { + // Get additional fields and mint SearchableFieldSpecs for them + Map> fieldSpecMap = new HashMap<>(); + for (SearchableFieldSpec fieldSpec : getSearchableFieldSpecs()) { + SearchableAnnotation searchableAnnotation = fieldSpec.getSearchableAnnotation(); + if (searchableAnnotation.getNumValuesFieldName().isPresent()) { + String fieldName = searchableAnnotation.getNumValuesFieldName().get(); + Set fieldSet = new HashSet<>(); + fieldSet.add(SearchableAnnotation.FieldType.COUNT); + fieldSpecMap.put(fieldName, fieldSet); + } + if (searchableAnnotation.getHasValuesFieldName().isPresent()) { + String fieldName = searchableAnnotation.getHasValuesFieldName().get(); + Set fieldSet = new HashSet<>(); + fieldSet.add(SearchableAnnotation.FieldType.BOOLEAN); + fieldSpecMap.put(fieldName, fieldSet); + } + } + fieldSpecMap.putAll( + getSearchableFieldSpecs().stream() + .collect( + Collectors.toMap( + searchableFieldSpec -> + searchableFieldSpec.getSearchableAnnotation().getFieldName(), + searchableFieldSpec -> + new HashSet<>( + Collections.singleton( + searchableFieldSpec.getSearchableAnnotation().getFieldType())), + (set1, set2) -> { + set1.addAll(set2); + return set1; + }))); + return fieldSpecMap; } default List getSearchScoreFieldSpecs() { diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/ESBrowseDAO.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/ESBrowseDAO.java index d610ea4b4e028a..0a9a9fbbad0867 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/ESBrowseDAO.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/ESBrowseDAO.java @@ -19,7 +19,7 @@ import com.linkedin.metadata.config.search.SearchConfiguration; import com.linkedin.metadata.config.search.custom.CustomSearchConfiguration; import com.linkedin.metadata.models.EntitySpec; -import com.linkedin.metadata.models.SearchableFieldSpec; +import com.linkedin.metadata.models.annotation.SearchableAnnotation; import com.linkedin.metadata.models.registry.EntityRegistry; import com.linkedin.metadata.query.filter.Filter; import com.linkedin.metadata.search.elasticsearch.query.request.SearchRequestHandler; @@ -557,7 +557,7 @@ private QueryBuilder buildQueryStringV2( queryBuilder.filter(QueryBuilders.rangeQuery(BROWSE_PATH_V2_DEPTH).gt(browseDepthVal)); queryBuilder.filter( - SearchRequestHandler.getFilterQuery(filter, entitySpec.getSearchableFieldSpecMap())); + SearchRequestHandler.getFilterQuery(filter, entitySpec.getSearchableFieldTypes())); return queryBuilder; } @@ -583,9 +583,9 @@ private QueryBuilder buildQueryStringBrowseAcrossEntities( queryBuilder.filter(QueryBuilders.rangeQuery(BROWSE_PATH_V2_DEPTH).gt(browseDepthVal)); - Map> searchableFields = + Map> searchableFields = entitySpecs.stream() - .flatMap(entitySpec -> entitySpec.getSearchableFieldSpecMap().entrySet().stream()) + .flatMap(entitySpec -> entitySpec.getSearchableFieldTypes().entrySet().stream()) .collect( Collectors.toMap( Map.Entry::getKey, diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/ESSearchDAO.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/ESSearchDAO.java index 1ec90ed6f61e29..7de2770626ae34 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/ESSearchDAO.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/ESSearchDAO.java @@ -78,8 +78,7 @@ public long docCount(@Nonnull String entityName) { EntitySpec entitySpec = entityRegistry.getEntitySpec(entityName); CountRequest countRequest = new CountRequest(indexConvention.getIndexName(entitySpec)) - .query( - SearchRequestHandler.getFilterQuery(null, entitySpec.getSearchableFieldSpecMap())); + .query(SearchRequestHandler.getFilterQuery(null, entitySpec.getSearchableFieldTypes())); try (Timer.Context ignored = MetricUtils.timer(this.getClass(), "docCount").time()) { return client.count(countRequest, RequestOptions.DEFAULT).getCount(); } catch (IOException e) { diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/AutocompleteRequestHandler.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/AutocompleteRequestHandler.java index 333d9602734d25..38350322478741 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/AutocompleteRequestHandler.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/AutocompleteRequestHandler.java @@ -41,7 +41,7 @@ public class AutocompleteRequestHandler { private final List _defaultAutocompleteFields; - private final Map> searchableFields; + private final Map> searchableFieldTypes; private static final Map AUTOCOMPLETE_QUERY_BUILDER_BY_ENTITY_NAME = new ConcurrentHashMap<>(); @@ -56,14 +56,16 @@ public AutocompleteRequestHandler(@Nonnull EntitySpec entitySpec) { .map(SearchableAnnotation::getFieldName), Stream.of("urn")) .collect(Collectors.toList()); - searchableFields = + searchableFieldTypes = fieldSpecs.stream() .collect( Collectors.toMap( searchableFieldSpec -> searchableFieldSpec.getSearchableAnnotation().getFieldName(), searchableFieldSpec -> - new HashSet<>(Collections.singleton(searchableFieldSpec)), + new HashSet<>( + Collections.singleton( + searchableFieldSpec.getSearchableAnnotation().getFieldType())), (set1, set2) -> { set1.addAll(set2); return set1; @@ -81,7 +83,7 @@ public SearchRequest getSearchRequest( SearchSourceBuilder searchSourceBuilder = new SearchSourceBuilder(); searchSourceBuilder.size(limit); searchSourceBuilder.query(getQuery(input, field)); - searchSourceBuilder.postFilter(ESUtils.buildFilterQuery(filter, false, searchableFields)); + searchSourceBuilder.postFilter(ESUtils.buildFilterQuery(filter, false, searchableFieldTypes)); searchSourceBuilder.highlighter(getHighlights(field)); searchRequest.source(searchSourceBuilder); return searchRequest; diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandler.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandler.java index e6ee909c80dae4..277e15e1334d56 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandler.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandler.java @@ -97,7 +97,7 @@ public class SearchRequestHandler { private final SearchConfiguration _configs; private final SearchQueryBuilder _searchQueryBuilder; private final AggregationQueryBuilder _aggregationQueryBuilder; - private final Map> searchableFields; + private final Map> searchableFieldTypes; private SearchRequestHandler( @Nonnull EntitySpec entitySpec, @@ -122,9 +122,9 @@ private SearchRequestHandler( _searchQueryBuilder = new SearchQueryBuilder(configs, customSearchConfiguration); _aggregationQueryBuilder = new AggregationQueryBuilder(configs, annotations); _configs = configs; - searchableFields = + searchableFieldTypes = _entitySpecs.stream() - .flatMap(entitySpec -> entitySpec.getSearchableFieldSpecMap().entrySet().stream()) + .flatMap(entitySpec -> entitySpec.getSearchableFieldTypes().entrySet().stream()) .collect( Collectors.toMap( Map.Entry::getKey, @@ -182,12 +182,13 @@ private BinaryOperator mapMerger() { } public BoolQueryBuilder getFilterQuery(@Nullable Filter filter) { - return getFilterQuery(filter, searchableFields); + return getFilterQuery(filter, searchableFieldTypes); } public static BoolQueryBuilder getFilterQuery( - @Nullable Filter filter, Map> searchableFields) { - BoolQueryBuilder filterQuery = ESUtils.buildFilterQuery(filter, false, searchableFields); + @Nullable Filter filter, + Map> searchableFieldTypes) { + BoolQueryBuilder filterQuery = ESUtils.buildFilterQuery(filter, false, searchableFieldTypes); return filterSoftDeletedByDefault(filter, filterQuery); } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/utils/ESUtils.java b/metadata-io/src/main/java/com/linkedin/metadata/search/utils/ESUtils.java index 77a67f100895c8..4d74bfb66b8dbc 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/utils/ESUtils.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/utils/ESUtils.java @@ -132,7 +132,7 @@ private ESUtils() {} public static BoolQueryBuilder buildFilterQuery( @Nullable Filter filter, boolean isTimeseries, - final Map> searchableFields) { + final Map> searchableFieldTypes) { BoolQueryBuilder finalQueryBuilder = QueryBuilders.boolQuery(); if (filter == null) { return finalQueryBuilder; @@ -144,7 +144,7 @@ public static BoolQueryBuilder buildFilterQuery( .forEach( or -> finalQueryBuilder.should( - ESUtils.buildConjunctiveFilterQuery(or, isTimeseries, searchableFields))); + ESUtils.buildConjunctiveFilterQuery(or, isTimeseries, searchableFieldTypes))); } else if (filter.getCriteria() != null) { // Otherwise, build boolean query from the deprecated "criteria" field. log.warn("Received query Filter with a deprecated field 'criteria'. Use 'or' instead."); @@ -157,7 +157,7 @@ public static BoolQueryBuilder buildFilterQuery( || criterion.hasValues() || criterion.getCondition() == Condition.IS_NULL) { andQueryBuilder.must( - getQueryBuilderFromCriterion(criterion, isTimeseries, searchableFields)); + getQueryBuilderFromCriterion(criterion, isTimeseries, searchableFieldTypes)); } }); finalQueryBuilder.should(andQueryBuilder); @@ -169,7 +169,7 @@ public static BoolQueryBuilder buildFilterQuery( public static BoolQueryBuilder buildConjunctiveFilterQuery( @Nonnull ConjunctiveCriterion conjunctiveCriterion, boolean isTimeseries, - Map> searchableFields) { + Map> searchableFieldTypes) { final BoolQueryBuilder andQueryBuilder = new BoolQueryBuilder(); conjunctiveCriterion .getAnd() @@ -181,10 +181,10 @@ public static BoolQueryBuilder buildConjunctiveFilterQuery( if (!criterion.isNegated()) { // `filter` instead of `must` (enables caching and bypasses scoring) andQueryBuilder.filter( - getQueryBuilderFromCriterion(criterion, isTimeseries, searchableFields)); + getQueryBuilderFromCriterion(criterion, isTimeseries, searchableFieldTypes)); } else { andQueryBuilder.mustNot( - getQueryBuilderFromCriterion(criterion, isTimeseries, searchableFields)); + getQueryBuilderFromCriterion(criterion, isTimeseries, searchableFieldTypes)); } } }); @@ -222,7 +222,7 @@ public static BoolQueryBuilder buildConjunctiveFilterQuery( public static QueryBuilder getQueryBuilderFromCriterion( @Nonnull final Criterion criterion, boolean isTimeseries, - final Map> searchableFields) { + final Map> searchableFieldTypes) { final String fieldName = toFacetField(criterion.getField()); if (fieldName.startsWith(STRUCTURED_PROPERTY_MAPPING_FIELD)) { criterion.setField(fieldName); @@ -241,10 +241,11 @@ public static QueryBuilder getQueryBuilderFromCriterion( if (maybeFieldToExpand.isPresent()) { return getQueryBuilderFromCriterionForFieldToExpand( - maybeFieldToExpand.get(), criterion, isTimeseries, searchableFields); + maybeFieldToExpand.get(), criterion, isTimeseries, searchableFieldTypes); } - return getQueryBuilderFromCriterionForSingleField(criterion, isTimeseries, searchableFields); + return getQueryBuilderFromCriterionForSingleField( + criterion, isTimeseries, searchableFieldTypes); } public static String getElasticTypeForFieldType(SearchableAnnotation.FieldType fieldType) { @@ -446,7 +447,7 @@ private static QueryBuilder getQueryBuilderFromCriterionForFieldToExpand( @Nonnull final List fields, @Nonnull final Criterion criterion, final boolean isTimeseries, - final Map> searchableFields) { + final Map> searchableFieldTypes) { final BoolQueryBuilder orQueryBuilder = new BoolQueryBuilder(); for (String field : fields) { Criterion criterionToQuery = new Criterion(); @@ -461,7 +462,7 @@ private static QueryBuilder getQueryBuilderFromCriterionForFieldToExpand( criterionToQuery.setField(toKeywordField(field, isTimeseries)); orQueryBuilder.should( getQueryBuilderFromCriterionForSingleField( - criterionToQuery, isTimeseries, searchableFields)); + criterionToQuery, isTimeseries, searchableFieldTypes)); } return orQueryBuilder; } @@ -470,7 +471,7 @@ private static QueryBuilder getQueryBuilderFromCriterionForFieldToExpand( private static QueryBuilder getQueryBuilderFromCriterionForSingleField( @Nonnull Criterion criterion, boolean isTimeseries, - final Map> searchableFields) { + final Map> searchableFieldTypes) { final Condition condition = criterion.getCondition(); final String fieldName = toFacetField(criterion.getField()); @@ -485,10 +486,10 @@ private static QueryBuilder getQueryBuilderFromCriterionForSingleField( } else if (criterion.hasValues() || criterion.hasValue()) { if (condition == Condition.EQUAL) { return buildEqualsConditionFromCriterion( - fieldName, criterion, isTimeseries, searchableFields); + fieldName, criterion, isTimeseries, searchableFieldTypes); } else if (RANGE_QUERY_CONDITIONS.contains(condition)) { return buildRangeQueryFromCriterion( - criterion, fieldName, searchableFields, condition, isTimeseries); + criterion, fieldName, searchableFieldTypes, condition, isTimeseries); } else if (condition == Condition.CONTAIN) { return QueryBuilders.wildcardQuery( toKeywordField(criterion.getField(), isTimeseries), @@ -513,14 +514,14 @@ private static QueryBuilder buildEqualsConditionFromCriterion( @Nonnull final String fieldName, @Nonnull final Criterion criterion, final boolean isTimeseries, - final Map> searchableFields) { + final Map> searchableFieldTypes) { /* * If the newer 'values' field of Criterion.pdl is set, then we * handle using the following code to allow multi-match. */ if (!criterion.getValues().isEmpty()) { return buildEqualsConditionFromCriterionWithValues( - fieldName, criterion, isTimeseries, searchableFields); + fieldName, criterion, isTimeseries, searchableFieldTypes); } /* * Otherwise, we are likely using the deprecated 'value' field. @@ -537,8 +538,8 @@ private static QueryBuilder buildEqualsConditionFromCriterionWithValues( @Nonnull final String fieldName, @Nonnull final Criterion criterion, final boolean isTimeseries, - final Map> searchableFields) { - Set fieldTypes = getFieldTypes(searchableFields, fieldName); + final Map> searchableFieldTypes) { + Set fieldTypes = getFieldTypes(searchableFieldTypes, fieldName); if (fieldTypes.size() > 1) { log.warn( "Multiple field types for field name {}, determining best fit for set: {}", @@ -563,31 +564,27 @@ private static QueryBuilder buildEqualsConditionFromCriterionWithValues( } private static Set getFieldTypes( - Map> searchableFields, String fieldName) { - Set fieldSpecs = + Map> searchableFields, String fieldName) { + Set fieldTypes = searchableFields.getOrDefault(fieldName, Collections.emptySet()); - Set fieldTypes = - fieldSpecs.stream() - .map(SearchableFieldSpec::getSearchableAnnotation) - .map(SearchableAnnotation::getFieldType) - .map(ESUtils::getElasticTypeForFieldType) - .collect(Collectors.toSet()); + Set finalFieldTypes = + fieldTypes.stream().map(ESUtils::getElasticTypeForFieldType).collect(Collectors.toSet()); if (fieldTypes.size() > 1) { log.warn( "Multiple field types for field name {}, determining best fit for set: {}", fieldName, fieldTypes); } - return fieldTypes; + return finalFieldTypes; } private static RangeQueryBuilder buildRangeQueryFromCriterion( Criterion criterion, String fieldName, - Map> searchableFields, + Map> searchableFieldTypes, Condition condition, boolean isTimeseries) { - Set fieldTypes = getFieldTypes(searchableFields, fieldName); + Set fieldTypes = getFieldTypes(searchableFieldTypes, fieldName); // Determine criterion value, range query only accepts single value so take first value in // values if multiple diff --git a/metadata-io/src/main/java/com/linkedin/metadata/timeseries/elastic/ElasticSearchTimeseriesAspectService.java b/metadata-io/src/main/java/com/linkedin/metadata/timeseries/elastic/ElasticSearchTimeseriesAspectService.java index 6cf8e92d61929f..cb06dc75c70bc9 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/timeseries/elastic/ElasticSearchTimeseriesAspectService.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/timeseries/elastic/ElasticSearchTimeseriesAspectService.java @@ -14,7 +14,7 @@ import com.linkedin.metadata.aspect.EnvelopedAspect; import com.linkedin.metadata.models.AspectSpec; import com.linkedin.metadata.models.EntitySpec; -import com.linkedin.metadata.models.SearchableFieldSpec; +import com.linkedin.metadata.models.annotation.SearchableAnnotation; import com.linkedin.metadata.models.registry.EntityRegistry; import com.linkedin.metadata.query.filter.Condition; import com.linkedin.metadata.query.filter.Criterion; @@ -296,7 +296,7 @@ public long countByFilter( ESUtils.buildFilterQuery( filter, true, - _entityRegistry.getEntitySpec(entityName).getSearchableFieldSpecMap())); + _entityRegistry.getEntitySpec(entityName).getSearchableFieldTypes())); CountRequest countRequest = new CountRequest(); countRequest.query(filterQueryBuilder); countRequest.indices(indexName); @@ -319,10 +319,11 @@ public List getAspectValues( @Nullable final Integer limit, @Nullable final Filter filter, @Nullable final SortCriterion sort) { - Map> searchableFields = - _entityRegistry.getEntitySpec(entityName).getSearchableFieldSpecMap(); + Map> searchableFieldTypes = + _entityRegistry.getEntitySpec(entityName).getSearchableFieldTypes(); final BoolQueryBuilder filterQueryBuilder = - QueryBuilders.boolQuery().must(ESUtils.buildFilterQuery(filter, true, searchableFields)); + QueryBuilders.boolQuery() + .must(ESUtils.buildFilterQuery(filter, true, searchableFieldTypes)); filterQueryBuilder.must(QueryBuilders.matchQuery("urn", urn.toString())); // NOTE: We are interested only in the un-exploded rows as only they carry the `event` payload. filterQueryBuilder.mustNot(QueryBuilders.termQuery(MappingsBuilder.IS_EXPLODED_FIELD, true)); @@ -333,7 +334,7 @@ public List getAspectValues( .setCondition(Condition.GREATER_THAN_OR_EQUAL_TO) .setValue(startTimeMillis.toString()); filterQueryBuilder.must( - ESUtils.getQueryBuilderFromCriterion(startTimeCriterion, true, searchableFields)); + ESUtils.getQueryBuilderFromCriterion(startTimeCriterion, true, searchableFieldTypes)); } if (endTimeMillis != null) { Criterion endTimeCriterion = @@ -342,7 +343,7 @@ public List getAspectValues( .setCondition(Condition.LESS_THAN_OR_EQUAL_TO) .setValue(endTimeMillis.toString()); filterQueryBuilder.must( - ESUtils.getQueryBuilderFromCriterion(endTimeCriterion, true, searchableFields)); + ESUtils.getQueryBuilderFromCriterion(endTimeCriterion, true, searchableFieldTypes)); } final SearchSourceBuilder searchSourceBuilder = new SearchSourceBuilder(); searchSourceBuilder.query(filterQueryBuilder); @@ -412,7 +413,7 @@ public DeleteAspectValuesResult deleteAspectValues( final String indexName = _indexConvention.getTimeseriesAspectIndexName(entityName, aspectName); final BoolQueryBuilder filterQueryBuilder = ESUtils.buildFilterQuery( - filter, true, _entityRegistry.getEntitySpec(entityName).getSearchableFieldSpecMap()); + filter, true, _entityRegistry.getEntitySpec(entityName).getSearchableFieldTypes()); final Optional result = _bulkProcessor @@ -440,7 +441,7 @@ public String deleteAspectValuesAsync( final String indexName = _indexConvention.getTimeseriesAspectIndexName(entityName, aspectName); final BoolQueryBuilder filterQueryBuilder = ESUtils.buildFilterQuery( - filter, true, _entityRegistry.getEntitySpec(entityName).getSearchableFieldSpecMap()); + filter, true, _entityRegistry.getEntitySpec(entityName).getSearchableFieldTypes()); final int batchSize = options.getBatchSize() > 0 ? options.getBatchSize() : DEFAULT_LIMIT; TimeValue timeout = options.getTimeoutSeconds() > 0 @@ -466,7 +467,7 @@ public String reindexAsync( final String indexName = _indexConvention.getTimeseriesAspectIndexName(entityName, aspectName); final BoolQueryBuilder filterQueryBuilder = ESUtils.buildFilterQuery( - filter, true, _entityRegistry.getEntitySpec(entityName).getSearchableFieldSpecMap()); + filter, true, _entityRegistry.getEntitySpec(entityName).getSearchableFieldTypes()); try { return this.reindexAsync(indexName, filterQueryBuilder, options); } catch (Exception e) { @@ -515,10 +516,11 @@ public TimeseriesScrollResult scrollAspects( @Nullable Long startTimeMillis, @Nullable Long endTimeMillis) { - Map> searchableFields = - _entityRegistry.getEntitySpec(entityName).getSearchableFieldSpecMap(); + Map> searchableFieldTypes = + _entityRegistry.getEntitySpec(entityName).getSearchableFieldTypes(); final BoolQueryBuilder filterQueryBuilder = - QueryBuilders.boolQuery().filter(ESUtils.buildFilterQuery(filter, true, searchableFields)); + QueryBuilders.boolQuery() + .filter(ESUtils.buildFilterQuery(filter, true, searchableFieldTypes)); if (startTimeMillis != null) { Criterion startTimeCriterion = @@ -527,7 +529,7 @@ public TimeseriesScrollResult scrollAspects( .setCondition(Condition.GREATER_THAN_OR_EQUAL_TO) .setValue(startTimeMillis.toString()); filterQueryBuilder.filter( - ESUtils.getQueryBuilderFromCriterion(startTimeCriterion, true, searchableFields)); + ESUtils.getQueryBuilderFromCriterion(startTimeCriterion, true, searchableFieldTypes)); } if (endTimeMillis != null) { Criterion endTimeCriterion = @@ -536,7 +538,7 @@ public TimeseriesScrollResult scrollAspects( .setCondition(Condition.LESS_THAN_OR_EQUAL_TO) .setValue(endTimeMillis.toString()); filterQueryBuilder.filter( - ESUtils.getQueryBuilderFromCriterion(endTimeCriterion, true, searchableFields)); + ESUtils.getQueryBuilderFromCriterion(endTimeCriterion, true, searchableFieldTypes)); } SearchResponse response = diff --git a/metadata-io/src/main/java/com/linkedin/metadata/timeseries/elastic/query/ESAggregatedStatsDAO.java b/metadata-io/src/main/java/com/linkedin/metadata/timeseries/elastic/query/ESAggregatedStatsDAO.java index f8b2cd85523576..580888e54b7007 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/timeseries/elastic/query/ESAggregatedStatsDAO.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/timeseries/elastic/query/ESAggregatedStatsDAO.java @@ -379,7 +379,7 @@ public GenericTable getAggregatedStats( // Setup the filter query builder using the input filter provided. final BoolQueryBuilder filterQueryBuilder = ESUtils.buildFilterQuery( - filter, true, _entityRegistry.getEntitySpec(entityName).getSearchableFieldSpecMap()); + filter, true, _entityRegistry.getEntitySpec(entityName).getSearchableFieldTypes()); AspectSpec aspectSpec = getTimeseriesAspectSpec(entityName, aspectName); // Build and attach the grouping aggregations diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/fixtures/SampleDataFixtureTestBase.java b/metadata-io/src/test/java/com/linkedin/metadata/search/fixtures/SampleDataFixtureTestBase.java index a1af2325ee0ed8..4742115b16e1bd 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/fixtures/SampleDataFixtureTestBase.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/fixtures/SampleDataFixtureTestBase.java @@ -14,8 +14,10 @@ import com.datahub.authentication.Actor; import com.datahub.authentication.ActorType; import com.datahub.authentication.Authentication; +import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; import com.linkedin.common.urn.Urn; +import com.linkedin.data.template.StringArray; import com.linkedin.datahub.graphql.generated.AutoCompleteResults; import com.linkedin.datahub.graphql.types.chart.ChartType; import com.linkedin.datahub.graphql.types.container.ContainerType; @@ -45,6 +47,7 @@ import com.linkedin.r2.RemoteInvocationException; import java.io.IOException; import java.util.ArrayList; +import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; @@ -64,6 +67,7 @@ import org.opensearch.search.sort.FieldSortBuilder; import org.opensearch.search.sort.SortBuilder; import org.springframework.test.context.testng.AbstractTestNGSpringContextTests; +import org.testng.AssertJUnit; import org.testng.annotations.Test; public abstract class SampleDataFixtureTestBase extends AbstractTestNGSpringContextTests { @@ -1936,6 +1940,56 @@ public void testSortOrdering() { String.format("%s - Expected search results to have at least two results", query)); } + @Test + public void testFilterOnHasValuesField() { + AssertJUnit.assertNotNull(getSearchService()); + Filter filter = + new Filter() + .setOr( + new ConjunctiveCriterionArray( + new ConjunctiveCriterion() + .setAnd( + new CriterionArray( + ImmutableList.of( + new Criterion() + .setField("hasOwners") + .setValue("") + .setValues(new StringArray(ImmutableList.of("true")))))))); + SearchResult searchResult = + searchAcrossEntities( + getSearchService(), + "*", + SEARCHABLE_ENTITIES, + filter, + Collections.singletonList(DATASET_ENTITY_NAME)); + assertEquals(searchResult.getEntities().size(), 8); + } + + @Test + public void testFilterOnNumValuesField() { + AssertJUnit.assertNotNull(getSearchService()); + Filter filter = + new Filter() + .setOr( + new ConjunctiveCriterionArray( + new ConjunctiveCriterion() + .setAnd( + new CriterionArray( + ImmutableList.of( + new Criterion() + .setField("numInputDatasets") + .setValue("") + .setValues(new StringArray(ImmutableList.of("1")))))))); + SearchResult searchResult = + searchAcrossEntities( + getSearchService(), + "*", + SEARCHABLE_ENTITIES, + filter, + Collections.singletonList(DATA_JOB_ENTITY_NAME)); + assertEquals(searchResult.getEntities().size(), 4); + } + private Stream getTokens(AnalyzeRequest request) throws IOException { return getSearchClient() From 388b3ec0ac10f7e3d142c9bcbf9c89be6ea92853 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Fri, 26 Jan 2024 14:01:48 -0800 Subject: [PATCH 05/14] fix(ingest/airflow): fix plugin support for airflow 2.5.0 (#9719) --- .../src/datahub_airflow_plugin/_datahub_listener_module.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/_datahub_listener_module.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/_datahub_listener_module.py index e16563400e397f..0e1ef69ebf18c7 100644 --- a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/_datahub_listener_module.py +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/_datahub_listener_module.py @@ -29,6 +29,6 @@ def on_task_instance_failed(previous_state, task_instance, session): if hasattr(_listener, "on_dag_run_running"): @hookimpl - def on_dag_run_running(dag_run, session): + def on_dag_run_running(dag_run, msg): assert _listener - _listener.on_dag_run_running(dag_run, session) + _listener.on_dag_run_running(dag_run, msg) From 5adb799f137a00c315144715786179ef4a6b2405 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Fri, 26 Jan 2024 14:02:52 -0800 Subject: [PATCH 06/14] fix(cli): fix example data contract yaml + update airflow codecov (#9707) --- .github/workflows/airflow-plugin.yml | 4 +- .../airflow-plugin/build.gradle | 2 +- .../airflow-plugin/tests/conftest.py | 11 +++++ .../pet_of_the_week.dhub.dc.yaml | 42 +++++++++++-------- 4 files changed, 39 insertions(+), 20 deletions(-) diff --git a/.github/workflows/airflow-plugin.yml b/.github/workflows/airflow-plugin.yml index 7ae7b87b0f5ceb..c5c75de4f7aeec 100644 --- a/.github/workflows/airflow-plugin.yml +++ b/.github/workflows/airflow-plugin.yml @@ -87,8 +87,8 @@ jobs: token: ${{ secrets.CODECOV_TOKEN }} directory: . fail_ci_if_error: false - flags: airflow-${{ matrix.python-version }}-${{ matrix.extraPythonRequirement }} - name: pytest-airflow + flags: airflow,airflow-${{ matrix.extra_pip_extras }} + name: pytest-airflow-${{ matrix.python-version }}-${{ matrix.extra_pip_requirements }} verbose: true event-file: diff --git a/metadata-ingestion-modules/airflow-plugin/build.gradle b/metadata-ingestion-modules/airflow-plugin/build.gradle index dacf12dc020df4..9555f92c8831dd 100644 --- a/metadata-ingestion-modules/airflow-plugin/build.gradle +++ b/metadata-ingestion-modules/airflow-plugin/build.gradle @@ -108,7 +108,7 @@ task testQuick(type: Exec, dependsOn: installDevTest) { inputs.files(project.fileTree(dir: "src/", include: "**/*.py")) inputs.files(project.fileTree(dir: "tests/")) commandLine 'bash', '-x', '-c', - "source ${venv_name}/bin/activate && pytest -vv --continue-on-collection-errors --junit-xml=junit.quick.xml" + "source ${venv_name}/bin/activate && pytest --cov-config=setup.cfg --cov-report xml:coverage_quick.xml -vv --continue-on-collection-errors --junit-xml=junit.quick.xml" } diff --git a/metadata-ingestion-modules/airflow-plugin/tests/conftest.py b/metadata-ingestion-modules/airflow-plugin/tests/conftest.py index d2c45e723f1b00..994816ff037c8d 100644 --- a/metadata-ingestion-modules/airflow-plugin/tests/conftest.py +++ b/metadata-ingestion-modules/airflow-plugin/tests/conftest.py @@ -1,6 +1,17 @@ +import pathlib +import site + + def pytest_addoption(parser): parser.addoption( "--update-golden-files", action="store_true", default=False, ) + + +# See https://coverage.readthedocs.io/en/latest/subprocess.html#configuring-python-for-sub-process-measurement +coverage_startup_code = "import coverage; coverage.process_startup()" +site_packages_dir = pathlib.Path(site.getsitepackages()[0]) +pth_file_path = site_packages_dir / "datahub_coverage_startup.pth" +pth_file_path.write_text(coverage_startup_code) diff --git a/metadata-ingestion/examples/data_contract/pet_of_the_week.dhub.dc.yaml b/metadata-ingestion/examples/data_contract/pet_of_the_week.dhub.dc.yaml index c73904403f678d..bd081172b2a27b 100644 --- a/metadata-ingestion/examples/data_contract/pet_of_the_week.dhub.dc.yaml +++ b/metadata-ingestion/examples/data_contract/pet_of_the_week.dhub.dc.yaml @@ -1,21 +1,29 @@ -# id: pet_details_dc # Optional: This is the unique identifier for the data contract -display_name: Data Contract for SampleHiveDataset +version: 1 # datahub yaml format version + +# Note: this data contract yaml format is still in development, and will likely +# change in backwards-incompatible ways in the future. + entity: urn:li:dataset:(urn:li:dataPlatform:hive,SampleHiveDataset,PROD) freshness: - time: 0700 - granularity: DAILY + type: cron + cron: 0 7 * * * # 7am daily + timezone: America/Los_Angeles schema: - properties: - field_foo: - type: string - native_type: VARCHAR(100) - field_bar: - type: boolean - required: - - field_bar + type: json-schema + json-schema: + properties: + field_foo: + type: string + native_type: VARCHAR(100) + field_bar: + type: boolean + required: + - field_bar data_quality: - - type: column_range - config: - column: field_foo - min: 0 - max: 100 + - type: unique + column: field_foo + - type: custom_sql + sql: SELECT COUNT(*) FROM SampleHiveDataset + operator: + type: greater_than + value: 100 From 2bb4b73f98ef46446e8025cd3657289bb24ff0df Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Fri, 26 Jan 2024 14:03:16 -0800 Subject: [PATCH 07/14] fix(ingest/metabase): add missing sql parser dep (#9725) --- metadata-ingestion/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index 1fb570d76120e8..c1a5da5826ead9 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -340,7 +340,7 @@ "ldap": {"python-ldap>=2.4"}, "looker": looker_common, "lookml": looker_common, - "metabase": {"requests"} | sqllineage_lib, + "metabase": {"requests"} | sqlglot_lib, "mlflow": {"mlflow-skinny>=2.3.0"}, "mode": {"requests", "tenacity>=8.0.1"} | sqllineage_lib, "mongodb": {"pymongo[srv]>=3.11", "packaging"}, From dc16c73937dcb4a287653090faf3c32807257872 Mon Sep 17 00:00:00 2001 From: gaurav2733 <77378510+gaurav2733@users.noreply.github.com> Date: Mon, 29 Jan 2024 15:26:14 +0530 Subject: [PATCH 08/14] feat(ui): include parent term groups, domains in glossary, domain dropdown (#9715) --- .../profile/AddRelatedTermsModal.tsx | 13 ++++- .../EntityDropdown/NodeParentSelect.tsx | 21 +++++-- .../profile/sidebar/Domain/SetDomainModal.tsx | 13 ++++- .../glossary/GloassarySearchResultItem.tsx | 56 +++++++++++++++++++ .../src/app/glossary/GlossarySearch.tsx | 55 ++++++++---------- datahub-web-react/src/app/glossary/utils.ts | 8 ++- .../src/app/shared/DomainLabel.tsx | 2 +- .../src/app/shared/tags/AddTagsTermsModal.tsx | 13 ++++- 8 files changed, 138 insertions(+), 43 deletions(-) create mode 100644 datahub-web-react/src/app/glossary/GloassarySearchResultItem.tsx diff --git a/datahub-web-react/src/app/entity/glossaryTerm/profile/AddRelatedTermsModal.tsx b/datahub-web-react/src/app/entity/glossaryTerm/profile/AddRelatedTermsModal.tsx index 5b303f75e2985a..f97f3c327676b9 100644 --- a/datahub-web-react/src/app/entity/glossaryTerm/profile/AddRelatedTermsModal.tsx +++ b/datahub-web-react/src/app/entity/glossaryTerm/profile/AddRelatedTermsModal.tsx @@ -10,11 +10,19 @@ import { BrowserWrapper } from '../../../shared/tags/AddTagsTermsModal'; import TermLabel from '../../../shared/TermLabel'; import { useEntityRegistry } from '../../../useEntityRegistry'; import { useEntityData, useRefetch } from '../../shared/EntityContext'; +import ParentEntities from '../../../search/filters/ParentEntities'; +import { getParentEntities } from '../../../search/filters/utils'; const StyledSelect = styled(Select)` width: 480px; `; +const SearchResultContainer = styled.div` + display: flex; + flex-direction: column; + justify-content: center; +`; + interface Props { onClose: () => void; relationshipType: TermRelationshipType; @@ -68,7 +76,10 @@ function AddRelatedTermsModal(props: Props) { return ( - + + + + ); }); diff --git a/datahub-web-react/src/app/entity/shared/EntityDropdown/NodeParentSelect.tsx b/datahub-web-react/src/app/entity/shared/EntityDropdown/NodeParentSelect.tsx index e7f5827e33dcc7..7227354a465695 100644 --- a/datahub-web-react/src/app/entity/shared/EntityDropdown/NodeParentSelect.tsx +++ b/datahub-web-react/src/app/entity/shared/EntityDropdown/NodeParentSelect.tsx @@ -1,5 +1,6 @@ import React from 'react'; import { Select } from 'antd'; +import styled from 'styled-components'; import { EntityType, GlossaryNode, SearchResult } from '../../../../types.generated'; import { useEntityRegistry } from '../../../useEntityRegistry'; import { useEntityData } from '../EntityContext'; @@ -7,6 +8,14 @@ import ClickOutside from '../../../shared/ClickOutside'; import GlossaryBrowser from '../../../glossary/GlossaryBrowser/GlossaryBrowser'; import { BrowserWrapper } from '../../../shared/tags/AddTagsTermsModal'; import useParentSelector from './useParentSelector'; +import ParentEntities from '../../../search/filters/ParentEntities'; +import { getParentGlossary } from '../../../glossary/utils'; + +const SearchResultContainer = styled.div` + display: flex; + flex-direction: column; + justify-content: center; +`; // filter out entity itself and its children export function filterResultsForMove(entity: GlossaryNode, entityUrn: string) { @@ -46,10 +55,9 @@ function NodeParentSelect(props: Props) { setSelectedParentUrn, }); - let nodeSearchResults: SearchResult[] = []; - if (isMoving) { - nodeSearchResults = searchResults.filter((r) => filterResultsForMove(r.entity as GlossaryNode, entityDataUrn)); - } + const nodeSearchResults: SearchResult[] = searchResults.filter((r) => + filterResultsForMove(r.entity as GlossaryNode, entityDataUrn), + ); const isShowingGlossaryBrowser = !searchQuery && isFocusedOnInput; const shouldHideSelf = isMoving && entityType === EntityType.GlossaryNode; @@ -70,7 +78,10 @@ function NodeParentSelect(props: Props) { > {nodeSearchResults?.map((result) => ( - {entityRegistry.getDisplayName(result.entity.type, result.entity)} + + + {entityRegistry.getDisplayName(result.entity.type, result.entity)} + ))} diff --git a/datahub-web-react/src/app/entity/shared/containers/profile/sidebar/Domain/SetDomainModal.tsx b/datahub-web-react/src/app/entity/shared/containers/profile/sidebar/Domain/SetDomainModal.tsx index 9b512d2d679e94..ab63553c6376b8 100644 --- a/datahub-web-react/src/app/entity/shared/containers/profile/sidebar/Domain/SetDomainModal.tsx +++ b/datahub-web-react/src/app/entity/shared/containers/profile/sidebar/Domain/SetDomainModal.tsx @@ -16,6 +16,8 @@ import DomainNavigator from '../../../../../../domain/nestedDomains/domainNaviga import ClickOutside from '../../../../../../shared/ClickOutside'; import { ANTD_GRAY } from '../../../../constants'; import { getModalDomContainer } from '../../../../../../../utils/focus'; +import ParentEntities from '../../../../../../search/filters/ParentEntities'; +import { getParentDomains } from '../../../../../../domain/utils'; type Props = { urns: string[]; @@ -44,6 +46,12 @@ const LoadingWrapper = styled.div` } `; +const SearchResultContainer = styled.div` + display: flex; + flex-direction: column; + justify-content: center; +`; + export const SetDomainModal = ({ urns, onCloseModal, refetch, defaultValue, onOkOverride, titleOverride }: Props) => { const entityRegistry = useEntityRegistry(); const [isFocusedOnInput, setIsFocusedOnInput] = useState(false); @@ -88,7 +96,10 @@ export const SetDomainModal = ({ urns, onCloseModal, refetch, defaultValue, onOk const displayName = entityRegistry.getDisplayName(entity.type, entity); return ( - + + + + ); }; diff --git a/datahub-web-react/src/app/glossary/GloassarySearchResultItem.tsx b/datahub-web-react/src/app/glossary/GloassarySearchResultItem.tsx new file mode 100644 index 00000000000000..03a384aab4bd52 --- /dev/null +++ b/datahub-web-react/src/app/glossary/GloassarySearchResultItem.tsx @@ -0,0 +1,56 @@ +// Create a new component called SearchResultItem.js +import React from 'react'; +import { Link } from 'react-router-dom'; +import Highlight from 'react-highlighter'; +import styled from 'styled-components/macro'; +import { Entity } from '../../types.generated'; +import { IconStyleType } from '../entity/Entity'; +import { ANTD_GRAY } from '../entity/shared/constants'; +import ParentEntities from '../search/filters/ParentEntities'; +import { getParentGlossary } from './utils'; +import EntityRegistry from '../entity/EntityRegistry'; + +type Props = { + entity: Entity; + entityRegistry: EntityRegistry; + query: string; + onResultClick: () => void; +}; + +const SearchResult = styled(Link)` + color: #262626; + display: flex; + align-items: center; + gap: 8px; + height: 100%; + padding: 6px 8px; + width: 100%; + &:hover { + background-color: ${ANTD_GRAY[3]}; + color: #262626; + } +`; + +const IconWrapper = styled.span``; + +const highlightMatchStyle = { + fontWeight: 'bold', + background: 'none', + padding: 0, +}; + +function GlossarySearchResultItem({ entity, entityRegistry, query, onResultClick }: Props) { + return ( + + {entityRegistry.getIcon(entity.type, 12, IconStyleType.TAB_VIEW)} +
+ + + {entityRegistry.getDisplayName(entity.type, entity)} + +
+
+ ); +} + +export default GlossarySearchResultItem; diff --git a/datahub-web-react/src/app/glossary/GlossarySearch.tsx b/datahub-web-react/src/app/glossary/GlossarySearch.tsx index 75cd3b10d581e8..321c218c38fe33 100644 --- a/datahub-web-react/src/app/glossary/GlossarySearch.tsx +++ b/datahub-web-react/src/app/glossary/GlossarySearch.tsx @@ -1,13 +1,12 @@ import React, { useState } from 'react'; -import { Link } from 'react-router-dom'; import styled from 'styled-components/macro'; import { useGetSearchResultsForMultipleQuery } from '../../graphql/search.generated'; import { EntityType } from '../../types.generated'; -import { IconStyleType } from '../entity/Entity'; import { ANTD_GRAY } from '../entity/shared/constants'; import { SearchBar } from '../search/SearchBar'; import ClickOutside from '../shared/ClickOutside'; import { useEntityRegistry } from '../useEntityRegistry'; +import GloassarySearchResultItem from './GloassarySearchResultItem'; const GlossarySearchWrapper = styled.div` position: relative; @@ -28,20 +27,10 @@ const ResultsWrapper = styled.div` top: 45px; `; -const SearchResult = styled(Link)` - color: #262626; - display: inline-block; - height: 100%; - padding: 6px 8px; - width: 100%; - &:hover { - background-color: ${ANTD_GRAY[3]}; - color: #262626; - } -`; - -const IconWrapper = styled.span` - margin-right: 8px; +const TermNodeName = styled.span` + margin-top: 12px; + color: ${ANTD_GRAY[8]}; + font-weight: bold; `; function GlossarySearch() { @@ -63,6 +52,21 @@ function GlossarySearch() { const searchResults = data?.searchAcrossEntities?.searchResults; + const renderSearchResults = () => ( + + Glossary Terms + {searchResults?.map((result) => ( + setIsSearchBarFocused(false)} + /> + ))} + + ); + return ( setIsSearchBarFocused(false)}> @@ -84,23 +88,8 @@ function GlossarySearch() { entityRegistry={entityRegistry} onFocus={() => setIsSearchBarFocused(true)} /> - {isSearchBarFocused && searchResults && !!searchResults.length && ( - - {searchResults.map((result) => { - return ( - setIsSearchBarFocused(false)} - > - - {entityRegistry.getIcon(result.entity.type, 12, IconStyleType.ACCENT)} - - {entityRegistry.getDisplayName(result.entity.type, result.entity)} - - ); - })} - - )} + {isSearchBarFocused && searchResults && !!searchResults.length && renderSearchResults()} + ); diff --git a/datahub-web-react/src/app/glossary/utils.ts b/datahub-web-react/src/app/glossary/utils.ts index 60f71d7b2f9ef1..4cfbb06b8a4f3a 100644 --- a/datahub-web-react/src/app/glossary/utils.ts +++ b/datahub-web-react/src/app/glossary/utils.ts @@ -1,4 +1,5 @@ -import { EntityType } from '../../types.generated'; +import { Entity, EntityType } from '../../types.generated'; +import EntityRegistry from '../entity/EntityRegistry'; import { GenericEntityProperties } from '../entity/shared/types'; export const ROOT_NODES = 'rootNodes'; @@ -25,3 +26,8 @@ export function updateGlossarySidebar( ) { setUrnsToUpdate([...urnsToUpdate, ...parentNodesToUpdate]); } + +export function getParentGlossary(node: T, entityRegistry: EntityRegistry) { + const props = entityRegistry.getGenericEntityProperties(EntityType.GlossaryNode, node); + return props?.parentNodes?.nodes ?? []; +} diff --git a/datahub-web-react/src/app/shared/DomainLabel.tsx b/datahub-web-react/src/app/shared/DomainLabel.tsx index 40208026d4369f..f71975b23e5179 100644 --- a/datahub-web-react/src/app/shared/DomainLabel.tsx +++ b/datahub-web-react/src/app/shared/DomainLabel.tsx @@ -5,7 +5,7 @@ const DomainContainerWrapper = styled.div` display: flex; justify-content: space-between; align-items: center; - padding: 12px; + padding: 2px; `; const DomainContentWrapper = styled.div` diff --git a/datahub-web-react/src/app/shared/tags/AddTagsTermsModal.tsx b/datahub-web-react/src/app/shared/tags/AddTagsTermsModal.tsx index d486ee02dae3eb..73e99c319441d6 100644 --- a/datahub-web-react/src/app/shared/tags/AddTagsTermsModal.tsx +++ b/datahub-web-react/src/app/shared/tags/AddTagsTermsModal.tsx @@ -20,6 +20,8 @@ import { FORBIDDEN_URN_CHARS_REGEX, handleBatchError } from '../../entity/shared import { TagTermLabel } from './TagTermLabel'; import { ENTER_KEY_CODE } from '../constants'; import { getModalDomContainer } from '../../../utils/focus'; +import ParentEntities from '../../search/filters/ParentEntities'; +import { getParentEntities } from '../../search/filters/utils'; export enum OperationType { ADD, @@ -69,6 +71,12 @@ export const BrowserWrapper = styled.div<{ isHidden: boolean; width?: string; ma `} `; +const SearchResultContainer = styled.div` + display: flex; + flex-direction: column; + justify-content: center; +`; + const CREATE_TAG_VALUE = '____reserved____.createTagValue'; const isValidTagName = (tagName: string) => { @@ -139,7 +147,10 @@ export default function EditTagTermsModal({ const tagOrTermComponent = ; return ( - {tagOrTermComponent} + + + {tagOrTermComponent} + ); }; From 90c88082b11cdfb6252eaebf11737887a38a0ee3 Mon Sep 17 00:00:00 2001 From: Tamas Nemeth Date: Mon, 29 Jan 2024 14:14:34 +0100 Subject: [PATCH 09/14] fix(ingest/s3): Not sorting schema fields to keep original order (#9349) --- .../source/data_lake_common/path_spec.py | 20 +++++++++++++++---- .../src/datahub/ingestion/source/s3/config.py | 5 +++++ .../src/datahub/ingestion/source/s3/source.py | 3 ++- .../ingestion/source/schema_inference/json.py | 2 +- .../unit/data_lake/test_schema_inference.py | 16 +++++---------- 5 files changed, 29 insertions(+), 17 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/data_lake_common/path_spec.py b/metadata-ingestion/src/datahub/ingestion/source/data_lake_common/path_spec.py index 05b1b6b7cc0403..a4b3779b73803a 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/data_lake_common/path_spec.py +++ b/metadata-ingestion/src/datahub/ingestion/source/data_lake_common/path_spec.py @@ -63,6 +63,11 @@ class Config: description="Not listing all the files but only taking a handful amount of sample file to infer the schema. File count and file size calculation will be disabled. This can affect performance significantly if enabled", ) + allow_double_stars: bool = Field( + default=False, + description="Allow double stars in the include path. This can affect performance significantly if enabled", + ) + def allowed(self, path: str) -> bool: logger.debug(f"Checking file to inclusion: {path}") if not pathlib.PurePath(path).globmatch( @@ -126,11 +131,18 @@ def get_parsable_include(cls, include: str) -> str: def get_named_vars(self, path: str) -> Union[None, parse.Result, parse.Match]: return self.compiled_include.parse(path) - @pydantic.validator("include") - def validate_no_double_stars(cls, v: str) -> str: - if "**" in v: + @pydantic.root_validator() + def validate_no_double_stars(cls, values: Dict) -> Dict: + if "include" not in values: + return values + + if ( + values.get("include") + and "**" in values["include"] + and not values.get("allow_double_stars") + ): raise ValueError("path_spec.include cannot contain '**'") - return v + return values @pydantic.validator("file_types", always=True) def validate_file_types(cls, v: Optional[List[str]]) -> List[str]: diff --git a/metadata-ingestion/src/datahub/ingestion/source/s3/config.py b/metadata-ingestion/src/datahub/ingestion/source/s3/config.py index f752a33b42d9c5..55e340e2850d55 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/s3/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/s3/config.py @@ -93,6 +93,11 @@ class DataLakeSourceConfig( "path_spec", "path_specs", lambda path_spec: [path_spec] ) + sort_schema_fields: bool = Field( + default=False, + description="Whether to sort schema fields by fieldPath when inferring schemas.", + ) + def is_profiling_enabled(self) -> bool: return self.profiling.enabled and is_profiling_enabled( self.profiling.operation_config diff --git a/metadata-ingestion/src/datahub/ingestion/source/s3/source.py b/metadata-ingestion/src/datahub/ingestion/source/s3/source.py index 94c571eabad11a..41fc5782352c94 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/s3/source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/s3/source.py @@ -458,7 +458,8 @@ def get_fields(self, table_data: TableData, path_spec: PathSpec) -> List: ) file.close() logger.debug(f"Extracted fields in schema: {fields}") - fields = sorted(fields, key=lambda f: f.fieldPath) + if self.source_config.sort_schema_fields: + fields = sorted(fields, key=lambda f: f.fieldPath) if self.source_config.add_partition_columns_to_schema: self.add_partition_columns_to_schema( diff --git a/metadata-ingestion/src/datahub/ingestion/source/schema_inference/json.py b/metadata-ingestion/src/datahub/ingestion/source/schema_inference/json.py index c53c64be4cba80..251d136fe92ee7 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/schema_inference/json.py +++ b/metadata-ingestion/src/datahub/ingestion/source/schema_inference/json.py @@ -48,7 +48,7 @@ def infer_schema(self, file: IO[bytes]) -> List[SchemaField]: schema = construct_schema(datastore, delimiter=".") fields: List[SchemaField] = [] - for schema_field in sorted(schema.values(), key=lambda x: x["delimited_name"]): + for schema_field in schema.values(): mapped_type = _field_type_mapping.get(schema_field["type"], NullTypeClass) native_type = schema_field["type"] diff --git a/metadata-ingestion/tests/unit/data_lake/test_schema_inference.py b/metadata-ingestion/tests/unit/data_lake/test_schema_inference.py index 4a69deb572fbd7..de88deec9b9cb0 100644 --- a/metadata-ingestion/tests/unit/data_lake/test_schema_inference.py +++ b/metadata-ingestion/tests/unit/data_lake/test_schema_inference.py @@ -18,23 +18,23 @@ from tests.unit.test_schema_util import assert_field_paths_match expected_field_paths = [ - "boolean_field", "integer_field", + "boolean_field", "string_field", ] expected_field_paths_avro = [ - "[version=2.0].[type=test].[type=boolean].boolean_field", "[version=2.0].[type=test].[type=int].integer_field", + "[version=2.0].[type=test].[type=boolean].boolean_field", "[version=2.0].[type=test].[type=string].string_field", ] -expected_field_types = [BooleanTypeClass, NumberTypeClass, StringTypeClass] +expected_field_types = [NumberTypeClass, BooleanTypeClass, StringTypeClass] test_table = pd.DataFrame( { - "boolean_field": [True, False, True], "integer_field": [1, 2, 3], + "boolean_field": [True, False, True], "string_field": ["a", "b", "c"], } ) @@ -54,7 +54,6 @@ def test_infer_schema_csv(): file.seek(0) fields = csv_tsv.CsvInferrer(max_rows=100).infer_schema(file) - fields.sort(key=lambda x: x.fieldPath) assert_field_paths_match(fields, expected_field_paths) assert_field_types_match(fields, expected_field_types) @@ -70,7 +69,6 @@ def test_infer_schema_tsv(): file.seek(0) fields = csv_tsv.TsvInferrer(max_rows=100).infer_schema(file) - fields.sort(key=lambda x: x.fieldPath) assert_field_paths_match(fields, expected_field_paths) assert_field_types_match(fields, expected_field_types) @@ -82,7 +80,6 @@ def test_infer_schema_json(): file.seek(0) fields = json.JsonInferrer().infer_schema(file) - fields.sort(key=lambda x: x.fieldPath) assert_field_paths_match(fields, expected_field_paths) assert_field_types_match(fields, expected_field_types) @@ -92,9 +89,7 @@ def test_infer_schema_parquet(): with tempfile.TemporaryFile(mode="w+b") as file: test_table.to_parquet(file) file.seek(0) - fields = parquet.ParquetInferrer().infer_schema(file) - fields.sort(key=lambda x: x.fieldPath) assert_field_paths_match(fields, expected_field_paths) assert_field_types_match(fields, expected_field_types) @@ -108,8 +103,8 @@ def test_infer_schema_avro(): "type": "record", "name": "test", "fields": [ - {"name": "boolean_field", "type": "boolean"}, {"name": "integer_field", "type": "int"}, + {"name": "boolean_field", "type": "boolean"}, {"name": "string_field", "type": "string"}, ], } @@ -124,7 +119,6 @@ def test_infer_schema_avro(): file.seek(0) fields = AvroInferrer().infer_schema(file) - fields.sort(key=lambda x: x.fieldPath) assert_field_paths_match(fields, expected_field_paths_avro) assert_field_types_match(fields, expected_field_types) From 5735eb3a55f49c966d68b4bfca95b1965b34292b Mon Sep 17 00:00:00 2001 From: Tamas Nemeth Date: Mon, 29 Jan 2024 15:12:30 +0100 Subject: [PATCH 10/14] fix(ingest/test): Fixing breaking change in moto 5.0 library (#9736) --- metadata-ingestion/setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index c1a5da5826ead9..af2b54ba1cefa5 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -234,7 +234,8 @@ # ujson 5.2.0 has the JSONDecodeError exception type, which we need for error handling. "ujson>=5.2.0", "smart-open[s3]>=5.2.1", - "moto[s3]", + # moto 5.0.0 drops support for Python 3.7 + "moto[s3]<5.0.0", *path_spec_common, } From fdf929b3f4284753fef9ff59b5018134b874c56b Mon Sep 17 00:00:00 2001 From: Shirshanka Das Date: Mon, 29 Jan 2024 06:14:34 -0800 Subject: [PATCH 11/14] build(graphql): simplify gradle graphql codegen task (#9734) --- datahub-graphql-core/build.gradle | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/datahub-graphql-core/build.gradle b/datahub-graphql-core/build.gradle index f273a4dd0eea5f..fe70f2622490d9 100644 --- a/datahub-graphql-core/build.gradle +++ b/datahub-graphql-core/build.gradle @@ -31,30 +31,16 @@ dependencies { graphqlCodegen { // For options: https://github.com/kobylynskyi/graphql-java-codegen/blob/master/docs/codegen-options.md - graphqlSchemaPaths = [ - "$projectDir/src/main/resources/entity.graphql".toString(), - "$projectDir/src/main/resources/app.graphql".toString(), - "$projectDir/src/main/resources/search.graphql".toString(), - "$projectDir/src/main/resources/analytics.graphql".toString(), - "$projectDir/src/main/resources/recommendation.graphql".toString(), - "$projectDir/src/main/resources/ingestion.graphql".toString(), - "$projectDir/src/main/resources/auth.graphql".toString(), - "$projectDir/src/main/resources/timeline.graphql".toString(), - "$projectDir/src/main/resources/tests.graphql".toString(), - "$projectDir/src/main/resources/properties.graphql".toString(), - "$projectDir/src/main/resources/step.graphql".toString(), - "$projectDir/src/main/resources/lineage.graphql".toString(), - "$projectDir/src/main/resources/forms.graphql".toString() - ] - outputDir = new File("$projectDir/src/mainGeneratedGraphQL/java") + graphqlSchemaPaths = fileTree(dir: "${projectDir}/src/main/resources", include: '**/*.graphql').collect { it.absolutePath } + outputDir = new File("${projectDir}/src/mainGeneratedGraphQL/java") packageName = "com.linkedin.datahub.graphql.generated" generateToString = true generateApis = true generateParameterizedFieldsResolvers = false modelValidationAnnotation = "@javax.annotation.Nonnull" customTypesMapping = [ - Long: "Long", - Float: "Float" + Long: "Long", + Float: "Float" ] } From f3cc4e068a51c0124f1b4dc55713ddd5344ebcb8 Mon Sep 17 00:00:00 2001 From: gaurav2733 <77378510+gaurav2733@users.noreply.github.com> Date: Mon, 29 Jan 2024 21:56:16 +0530 Subject: [PATCH 12/14] feat(ui/secret): support to edit secrets (#9737) --- .../app/ingest/secret/SecretBuilderModal.tsx | 73 ++++++++++++++----- .../src/app/ingest/secret/SecretsList.tsx | 69 +++++++++++++++++- .../src/app/ingest/secret/cacheUtils.ts | 45 ++++++++++++ .../src/app/ingest/secret/types.ts | 4 + 4 files changed, 170 insertions(+), 21 deletions(-) diff --git a/datahub-web-react/src/app/ingest/secret/SecretBuilderModal.tsx b/datahub-web-react/src/app/ingest/secret/SecretBuilderModal.tsx index c099d9a580efab..2d20ac77891ea0 100644 --- a/datahub-web-react/src/app/ingest/secret/SecretBuilderModal.tsx +++ b/datahub-web-react/src/app/ingest/secret/SecretBuilderModal.tsx @@ -1,5 +1,5 @@ import { Button, Form, Input, Modal, Typography } from 'antd'; -import React, { useState } from 'react'; +import React, { useEffect, useState } from 'react'; import { useEnterKeyListener } from '../../shared/useEnterKeyListener'; import { SecretBuilderState } from './types'; @@ -9,12 +9,14 @@ const VALUE_FIELD_NAME = 'value'; type Props = { initialState?: SecretBuilderState; + editSecret?: SecretBuilderState; visible: boolean; onSubmit?: (source: SecretBuilderState, resetState: () => void) => void; + onUpdate?: (source: SecretBuilderState, resetState: () => void) => void; onCancel?: () => void; }; -export const SecretBuilderModal = ({ initialState, visible, onSubmit, onCancel }: Props) => { +export const SecretBuilderModal = ({ initialState, editSecret, visible, onSubmit, onUpdate, onCancel }: Props) => { const [createButtonEnabled, setCreateButtonEnabled] = useState(false); const [form] = Form.useForm(); @@ -23,38 +25,69 @@ export const SecretBuilderModal = ({ initialState, visible, onSubmit, onCancel } querySelectorToExecuteClick: '#createSecretButton', }); + useEffect(() => { + if (editSecret) { + form.setFieldsValue({ + name: editSecret.name, + description: editSecret.description, + value: editSecret.value, + }); + } + }, [editSecret, form]); + function resetValues() { + setCreateButtonEnabled(false); form.resetFields(); } + const onCloseModal = () => { + setCreateButtonEnabled(false); + form.resetFields(); + onCancel?.(); + }; + + const titleText = editSecret ? 'Edit Secret' : 'Create a new Secret'; + return ( Create a new Secret} + title={{titleText}} visible={visible} - onCancel={onCancel} + onCancel={onCloseModal} zIndex={1051} // one higher than other modals - needed for managed ingestion forms footer={ <> - } @@ -81,11 +114,15 @@ export const SecretBuilderModal = ({ initialState, visible, onSubmit, onCancel } }, { whitespace: false }, { min: 1, max: 50 }, - { pattern: /^[a-zA-Z_]+[a-zA-Z0-9_]*$/, message: 'Please start the secret name with a letter, followed by letters, digits, or underscores only.' }, + { + pattern: /^[a-zA-Z_]+[a-zA-Z0-9_]*$/, + message: + 'Please start the secret name with a letter, followed by letters, digits, or underscores only.', + }, ]} hasFeedback > - + Value}> diff --git a/datahub-web-react/src/app/ingest/secret/SecretsList.tsx b/datahub-web-react/src/app/ingest/secret/SecretsList.tsx index 1a960997e6beeb..2219b6147d9e06 100644 --- a/datahub-web-react/src/app/ingest/secret/SecretsList.tsx +++ b/datahub-web-react/src/app/ingest/secret/SecretsList.tsx @@ -9,6 +9,7 @@ import { useCreateSecretMutation, useDeleteSecretMutation, useListSecretsQuery, + useUpdateSecretMutation, } from '../../../graphql/ingestion.generated'; import { Message } from '../../shared/Message'; import TabToolbar from '../../entity/shared/components/styled/TabToolbar'; @@ -18,7 +19,11 @@ import { StyledTable } from '../../entity/shared/components/styled/StyledTable'; import { SearchBar } from '../../search/SearchBar'; import { useEntityRegistry } from '../../useEntityRegistry'; import { scrollToTop } from '../../shared/searchUtils'; -import { addSecretToListSecretsCache, removeSecretFromListSecretsCache } from './cacheUtils'; +import { + addSecretToListSecretsCache, + removeSecretFromListSecretsCache, + updateSecretInListSecretsCache, +} from './cacheUtils'; import { ONE_SECOND_IN_MS } from '../../entity/shared/tabs/Dataset/Queries/utils/constants'; const DeleteButtonContainer = styled.div` @@ -48,10 +53,12 @@ export const SecretsList = () => { // Whether or not there is an urn to show in the modal const [isCreatingSecret, setIsCreatingSecret] = useState(false); + const [editSecret, setEditSecret] = useState(undefined); const [deleteSecretMutation] = useDeleteSecretMutation(); const [createSecretMutation] = useCreateSecretMutation(); - const { loading, error, data, client } = useListSecretsQuery({ + const [updateSecretMutation] = useUpdateSecretMutation(); + const { loading, error, data, client, refetch } = useListSecretsQuery({ variables: { input: { start, @@ -125,6 +132,47 @@ export const SecretsList = () => { }); }); }; + const onUpdate = (state: SecretBuilderState, resetBuilderState: () => void) => { + updateSecretMutation({ + variables: { + input: { + urn: state.urn as string, + name: state.name as string, + value: state.value as string, + description: state.description as string, + }, + }, + }) + .then(() => { + message.success({ + content: `Successfully updated Secret!`, + duration: 3, + }); + resetBuilderState(); + setIsCreatingSecret(false); + setEditSecret(undefined); + updateSecretInListSecretsCache( + { + urn: state.urn, + name: state.name, + description: state.description, + }, + client, + pageSize, + page, + ); + setTimeout(() => { + refetch(); + }, 2000); + }) + .catch((e) => { + message.destroy(); + message.error({ + content: `Failed to update Secret!: \n ${e.message || ''}`, + duration: 3, + }); + }); + }; const onDeleteSecret = (urn: string) => { Modal.confirm({ @@ -140,6 +188,16 @@ export const SecretsList = () => { }); }; + const onEditSecret = (urnData: any) => { + setIsCreatingSecret(true); + setEditSecret(urnData); + }; + + const onCancel = () => { + setIsCreatingSecret(false); + setEditSecret(undefined); + }; + const tableColumns = [ { title: 'Name', @@ -161,6 +219,9 @@ export const SecretsList = () => { key: 'x', render: (_, record: any) => ( + @@ -234,8 +295,10 @@ export const SecretsList = () => { setIsCreatingSecret(false)} + onCancel={onCancel} /> ); diff --git a/datahub-web-react/src/app/ingest/secret/cacheUtils.ts b/datahub-web-react/src/app/ingest/secret/cacheUtils.ts index 72e287f8846edb..b3a3a45f33892c 100644 --- a/datahub-web-react/src/app/ingest/secret/cacheUtils.ts +++ b/datahub-web-react/src/app/ingest/secret/cacheUtils.ts @@ -64,6 +64,51 @@ export const addSecretToListSecretsCache = (secret, client, pageSize) => { }); }; +export const updateSecretInListSecretsCache = (updatedSecret, client, pageSize, page) => { + const currData: ListSecretsQuery | null = client.readQuery({ + query: ListSecretsDocument, + variables: { + input: { + start: (page - 1) * pageSize, + count: pageSize, + }, + }, + }); + + const updatedSecretIndex = (currData?.listSecrets?.secrets || []) + .map((secret, index) => { + if (secret.urn === updatedSecret.urn) { + return index; + } + return -1; + }) + .find((index) => index !== -1); + + if (updatedSecretIndex !== undefined) { + const newSecrets = (currData?.listSecrets?.secrets || []).map((secret, index) => { + return index === updatedSecretIndex ? updatedSecret : secret; + }); + + client.writeQuery({ + query: ListSecretsDocument, + variables: { + input: { + start: (page - 1) * pageSize, + count: pageSize, + }, + }, + data: { + listSecrets: { + start: currData?.listSecrets?.start || 0, + count: currData?.listSecrets?.count || 1, + total: currData?.listSecrets?.total || 1, + secrets: newSecrets, + }, + }, + }); + } +}; + export const clearSecretListCache = (client) => { // Remove any caching of 'listSecrets' client.cache.evict({ id: 'ROOT_QUERY', fieldName: 'listSecrets' }); diff --git a/datahub-web-react/src/app/ingest/secret/types.ts b/datahub-web-react/src/app/ingest/secret/types.ts index 23e45cab9b1790..e0dbc8d443d9bb 100644 --- a/datahub-web-react/src/app/ingest/secret/types.ts +++ b/datahub-web-react/src/app/ingest/secret/types.ts @@ -2,6 +2,10 @@ * The object represents the state of the Ingestion Source Builder form. */ export interface SecretBuilderState { + /** + * The name of the secret. + */ + urn?: string; /** * The name of the secret. */ From 1498c36875450b1a1f44d53e8e8c47c41a91dc69 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Mon, 29 Jan 2024 10:50:47 -0800 Subject: [PATCH 13/14] chore(cli): drop support for python 3.7 (#9731) --- .github/workflows/metadata-ingestion.yml | 4 +- docs/cli.md | 2 +- docs/how/updating-datahub.md | 11 +- docs/quickstart.md | 2 +- .../airflow-plugin/setup.py | 14 +- .../airflow-plugin/tests/unit/test_airflow.py | 204 ++-- metadata-ingestion/build.gradle | 2 +- metadata-ingestion/cli-ingestion.md | 21 +- metadata-ingestion/developing.md | 6 +- metadata-ingestion/setup.py | 23 +- metadata-ingestion/src/datahub/__init__.py | 11 +- .../src/datahub/ingestion/api/report.py | 21 +- .../src/datahub/ingestion/source/feast.py | 5 - .../ingestion/source/iceberg/iceberg.py | 5 - .../src/datahub/ingestion/source/mlflow.py | 6 - .../source/schema_inference/object.py | 2 +- .../feast/test_feast_repository.py | 7 - .../tests/integration/iceberg/test_iceberg.py | 9 +- .../integration/mlflow/test_mlflow_source.py | 184 ++-- .../integration/sql_server/test_sql_server.py | 5 - metadata-ingestion/tests/unit/test_iceberg.py | 899 +++++++++--------- .../tests/unit/test_mlflow_source.py | 225 ++--- 22 files changed, 805 insertions(+), 863 deletions(-) diff --git a/.github/workflows/metadata-ingestion.yml b/.github/workflows/metadata-ingestion.yml index 1da08b14b8b5b2..e7d6b7b97c0993 100644 --- a/.github/workflows/metadata-ingestion.yml +++ b/.github/workflows/metadata-ingestion.yml @@ -31,7 +31,7 @@ jobs: # DATAHUB_LOOKML_GIT_TEST_SSH_KEY: ${{ secrets.DATAHUB_LOOKML_GIT_TEST_SSH_KEY }} strategy: matrix: - python-version: ["3.7", "3.10"] + python-version: ["3.8", "3.10"] command: [ "testQuick", @@ -40,7 +40,7 @@ jobs: "testIntegrationBatch2", ] include: - - python-version: "3.7" + - python-version: "3.8" - python-version: "3.10" fail-fast: false steps: diff --git a/docs/cli.md b/docs/cli.md index cb5077db429061..927270b42259d4 100644 --- a/docs/cli.md +++ b/docs/cli.md @@ -24,7 +24,7 @@ source venv/bin/activate # activate the environment Once inside the virtual environment, install `datahub` using the following commands ```shell -# Requires Python 3.7+ +# Requires Python 3.8+ python3 -m pip install --upgrade pip wheel setuptools python3 -m pip install --upgrade acryl-datahub # validate that the install was successful diff --git a/docs/how/updating-datahub.md b/docs/how/updating-datahub.md index b671e2fc5d123c..6b6903b04f383e 100644 --- a/docs/how/updating-datahub.md +++ b/docs/how/updating-datahub.md @@ -10,8 +10,10 @@ This file documents any backwards-incompatible changes in DataHub and assists pe - Neo4j 5.x, may require migration from 4.x - Build requires JDK17 (Runtime Java 11) - Build requires Docker Compose > 2.20 +- #9731 - The `acryl-datahub` CLI now requires Python 3.8+ - #9601 - The Unity Catalog(UC) ingestion source config `include_metastore` is now disabled by default. This change will affect the urns of all entities in the workspace.
- Entity Hierarchy with `include_metastore: true` (Old) + Entity Hierarchy with `include_metastore: true` (Old) + ``` - UC Metastore - Catalog @@ -19,16 +21,19 @@ This file documents any backwards-incompatible changes in DataHub and assists pe - Table ``` - Entity Hierarchy with `include_metastore: false` (New) + Entity Hierarchy with `include_metastore: false` (New) + ``` - Catalog - Schema - Table ``` + We recommend using `platform_instance` for differentiating across metastores. If stateful ingestion is enabled, running ingestion with latest cli version will perform all required cleanup. Otherwise, we recommend soft deleting all databricks data via the DataHub CLI: - `datahub delete --platform databricks --soft` and then reingesting with latest cli version. + `datahub delete --platform databricks --soft` and then reingesting with latest cli version. + - #9601 - The Unity Catalog(UC) ingestion source config `include_hive_metastore` is now enabled by default. This requires config `warehouse_id` to be set. You can disable `include_hive_metastore` by setting it to `False` to avoid ingesting legacy hive metastore catalog in Databricks. ### Potential Downtime diff --git a/docs/quickstart.md b/docs/quickstart.md index 5856ef84c0074e..507be6ba05471a 100644 --- a/docs/quickstart.md +++ b/docs/quickstart.md @@ -22,7 +22,7 @@ If you're interested in a managed version, [Acryl Data](https://www.acryldata.io | Linux | [Docker for Linux](https://docs.docker.com/desktop/install/linux-install/) and [Docker Compose](https://docs.docker.com/compose/install/linux/) | - **Launch the Docker engine** from command line or the desktop app. -- Ensure you have **Python 3.7+** installed & configured. (Check using `python3 --version`). +- Ensure you have **Python 3.8+** installed & configured. (Check using `python3 --version`). :::note Docker Resource Allocation diff --git a/metadata-ingestion-modules/airflow-plugin/setup.py b/metadata-ingestion-modules/airflow-plugin/setup.py index 838322f83833bb..1a3e844cedc1ff 100644 --- a/metadata-ingestion-modules/airflow-plugin/setup.py +++ b/metadata-ingestion-modules/airflow-plugin/setup.py @@ -18,16 +18,10 @@ def get_long_description(): _self_pin = f"=={_version}" if not _version.endswith("dev0") else "" -rest_common = {"requests", "requests_file"} - base_requirements = { - # Compatibility. - "dataclasses>=0.6; python_version < '3.7'", - "mypy_extensions>=0.4.3", + f"acryl-datahub[datahub-rest]{_self_pin}", # Actual dependencies. - "pydantic>=1.5.1", "apache-airflow >= 2.0.2", - *rest_common, } plugins: Dict[str, Set[str]] = { @@ -42,9 +36,8 @@ def get_long_description(): }, "plugin-v1": set(), "plugin-v2": { - # The v2 plugin requires Python 3.8+. f"acryl-datahub[sql-parser]{_self_pin}", - "openlineage-airflow==1.2.0; python_version >= '3.8'", + "openlineage-airflow==1.2.0", }, } @@ -144,7 +137,6 @@ def get_long_description(): "Programming Language :: Python", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3 :: Only", - "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", @@ -161,7 +153,7 @@ def get_long_description(): ], # Package info. zip_safe=False, - python_requires=">=3.7", + python_requires=">=3.8", package_data={ "datahub_airflow_plugin": ["py.typed"], }, diff --git a/metadata-ingestion-modules/airflow-plugin/tests/unit/test_airflow.py b/metadata-ingestion-modules/airflow-plugin/tests/unit/test_airflow.py index 93b4af0501985e..b484713e18faf6 100644 --- a/metadata-ingestion-modules/airflow-plugin/tests/unit/test_airflow.py +++ b/metadata-ingestion-modules/airflow-plugin/tests/unit/test_airflow.py @@ -1,7 +1,6 @@ import datetime import json import os -import sys from contextlib import contextmanager from typing import Iterator from unittest import mock @@ -318,137 +317,134 @@ def test_lineage_backend(mock_emit, inlets, outlets, capture_executions): # Check that the right things were emitted. assert mock_emitter.emit.call_count == 17 if capture_executions else 9 - # Running further checks based on python version because args only exists in python 3.8+ - if sys.version_info > (3, 8): - assert mock_emitter.method_calls[0].args[0].aspectName == "dataFlowInfo" + # TODO: Replace this with a golden file-based comparison. + assert mock_emitter.method_calls[0].args[0].aspectName == "dataFlowInfo" + assert ( + mock_emitter.method_calls[0].args[0].entityUrn + == "urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod)" + ) + + assert mock_emitter.method_calls[1].args[0].aspectName == "ownership" + assert ( + mock_emitter.method_calls[1].args[0].entityUrn + == "urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod)" + ) + + assert mock_emitter.method_calls[2].args[0].aspectName == "globalTags" + assert ( + mock_emitter.method_calls[2].args[0].entityUrn + == "urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod)" + ) + + assert mock_emitter.method_calls[3].args[0].aspectName == "dataJobInfo" + assert ( + mock_emitter.method_calls[3].args[0].entityUrn + == "urn:li:dataJob:(urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod),task2)" + ) + + assert mock_emitter.method_calls[4].args[0].aspectName == "dataJobInputOutput" + assert ( + mock_emitter.method_calls[4].args[0].entityUrn + == "urn:li:dataJob:(urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod),task2)" + ) + assert ( + mock_emitter.method_calls[4].args[0].aspect.inputDatajobs[0] + == "urn:li:dataJob:(urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod),task1_upstream)" + ) + assert ( + mock_emitter.method_calls[4].args[0].aspect.inputDatajobs[1] + == "urn:li:dataJob:(urn:li:dataFlow:(airflow,testDag,PROD),testTask)" + ) + assert ( + mock_emitter.method_calls[4].args[0].aspect.inputDatasets[0] + == "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableConsumed,PROD)" + ) + assert ( + mock_emitter.method_calls[4].args[0].aspect.outputDatasets[0] + == "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableProduced,PROD)" + ) + + assert mock_emitter.method_calls[5].args[0].aspectName == "status" + assert ( + mock_emitter.method_calls[5].args[0].entityUrn + == "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableConsumed,PROD)" + ) + + assert mock_emitter.method_calls[6].args[0].aspectName == "status" + assert ( + mock_emitter.method_calls[6].args[0].entityUrn + == "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableProduced,PROD)" + ) + + assert mock_emitter.method_calls[7].args[0].aspectName == "ownership" + assert ( + mock_emitter.method_calls[7].args[0].entityUrn + == "urn:li:dataJob:(urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod),task2)" + ) + + assert mock_emitter.method_calls[8].args[0].aspectName == "globalTags" + assert ( + mock_emitter.method_calls[8].args[0].entityUrn + == "urn:li:dataJob:(urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod),task2)" + ) + + if capture_executions: assert ( - mock_emitter.method_calls[0].args[0].entityUrn - == "urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod)" + mock_emitter.method_calls[9].args[0].aspectName + == "dataProcessInstanceProperties" ) - - assert mock_emitter.method_calls[1].args[0].aspectName == "ownership" assert ( - mock_emitter.method_calls[1].args[0].entityUrn - == "urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod)" + mock_emitter.method_calls[9].args[0].entityUrn + == "urn:li:dataProcessInstance:5e274228107f44cc2dd7c9782168cc29" ) - assert mock_emitter.method_calls[2].args[0].aspectName == "globalTags" assert ( - mock_emitter.method_calls[2].args[0].entityUrn - == "urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod)" + mock_emitter.method_calls[10].args[0].aspectName + == "dataProcessInstanceRelationships" ) - - assert mock_emitter.method_calls[3].args[0].aspectName == "dataJobInfo" assert ( - mock_emitter.method_calls[3].args[0].entityUrn - == "urn:li:dataJob:(urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod),task2)" + mock_emitter.method_calls[10].args[0].entityUrn + == "urn:li:dataProcessInstance:5e274228107f44cc2dd7c9782168cc29" ) - assert ( - mock_emitter.method_calls[4].args[0].aspectName == "dataJobInputOutput" + mock_emitter.method_calls[11].args[0].aspectName + == "dataProcessInstanceInput" ) assert ( - mock_emitter.method_calls[4].args[0].entityUrn - == "urn:li:dataJob:(urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod),task2)" + mock_emitter.method_calls[11].args[0].entityUrn + == "urn:li:dataProcessInstance:5e274228107f44cc2dd7c9782168cc29" ) assert ( - mock_emitter.method_calls[4].args[0].aspect.inputDatajobs[0] - == "urn:li:dataJob:(urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod),task1_upstream)" + mock_emitter.method_calls[12].args[0].aspectName + == "dataProcessInstanceOutput" ) assert ( - mock_emitter.method_calls[4].args[0].aspect.inputDatajobs[1] - == "urn:li:dataJob:(urn:li:dataFlow:(airflow,testDag,PROD),testTask)" + mock_emitter.method_calls[12].args[0].entityUrn + == "urn:li:dataProcessInstance:5e274228107f44cc2dd7c9782168cc29" ) + assert mock_emitter.method_calls[13].args[0].aspectName == "status" assert ( - mock_emitter.method_calls[4].args[0].aspect.inputDatasets[0] + mock_emitter.method_calls[13].args[0].entityUrn == "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableConsumed,PROD)" ) + assert mock_emitter.method_calls[14].args[0].aspectName == "status" assert ( - mock_emitter.method_calls[4].args[0].aspect.outputDatasets[0] + mock_emitter.method_calls[14].args[0].entityUrn == "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableProduced,PROD)" ) - - assert mock_emitter.method_calls[5].args[0].aspectName == "status" assert ( - mock_emitter.method_calls[5].args[0].entityUrn - == "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableConsumed,PROD)" + mock_emitter.method_calls[15].args[0].aspectName + == "dataProcessInstanceRunEvent" ) - - assert mock_emitter.method_calls[6].args[0].aspectName == "status" assert ( - mock_emitter.method_calls[6].args[0].entityUrn - == "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableProduced,PROD)" + mock_emitter.method_calls[15].args[0].entityUrn + == "urn:li:dataProcessInstance:5e274228107f44cc2dd7c9782168cc29" ) - - assert mock_emitter.method_calls[7].args[0].aspectName == "ownership" assert ( - mock_emitter.method_calls[7].args[0].entityUrn - == "urn:li:dataJob:(urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod),task2)" + mock_emitter.method_calls[16].args[0].aspectName + == "dataProcessInstanceRunEvent" ) - - assert mock_emitter.method_calls[8].args[0].aspectName == "globalTags" assert ( - mock_emitter.method_calls[8].args[0].entityUrn - == "urn:li:dataJob:(urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod),task2)" + mock_emitter.method_calls[16].args[0].entityUrn + == "urn:li:dataProcessInstance:5e274228107f44cc2dd7c9782168cc29" ) - - if capture_executions: - assert ( - mock_emitter.method_calls[9].args[0].aspectName - == "dataProcessInstanceProperties" - ) - assert ( - mock_emitter.method_calls[9].args[0].entityUrn - == "urn:li:dataProcessInstance:5e274228107f44cc2dd7c9782168cc29" - ) - - assert ( - mock_emitter.method_calls[10].args[0].aspectName - == "dataProcessInstanceRelationships" - ) - assert ( - mock_emitter.method_calls[10].args[0].entityUrn - == "urn:li:dataProcessInstance:5e274228107f44cc2dd7c9782168cc29" - ) - assert ( - mock_emitter.method_calls[11].args[0].aspectName - == "dataProcessInstanceInput" - ) - assert ( - mock_emitter.method_calls[11].args[0].entityUrn - == "urn:li:dataProcessInstance:5e274228107f44cc2dd7c9782168cc29" - ) - assert ( - mock_emitter.method_calls[12].args[0].aspectName - == "dataProcessInstanceOutput" - ) - assert ( - mock_emitter.method_calls[12].args[0].entityUrn - == "urn:li:dataProcessInstance:5e274228107f44cc2dd7c9782168cc29" - ) - assert mock_emitter.method_calls[13].args[0].aspectName == "status" - assert ( - mock_emitter.method_calls[13].args[0].entityUrn - == "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableConsumed,PROD)" - ) - assert mock_emitter.method_calls[14].args[0].aspectName == "status" - assert ( - mock_emitter.method_calls[14].args[0].entityUrn - == "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableProduced,PROD)" - ) - assert ( - mock_emitter.method_calls[15].args[0].aspectName - == "dataProcessInstanceRunEvent" - ) - assert ( - mock_emitter.method_calls[15].args[0].entityUrn - == "urn:li:dataProcessInstance:5e274228107f44cc2dd7c9782168cc29" - ) - assert ( - mock_emitter.method_calls[16].args[0].aspectName - == "dataProcessInstanceRunEvent" - ) - assert ( - mock_emitter.method_calls[16].args[0].entityUrn - == "urn:li:dataProcessInstance:5e274228107f44cc2dd7c9782168cc29" - ) diff --git a/metadata-ingestion/build.gradle b/metadata-ingestion/build.gradle index b3cc350cc109fa..8338124288ec99 100644 --- a/metadata-ingestion/build.gradle +++ b/metadata-ingestion/build.gradle @@ -17,7 +17,7 @@ def get_coverage_arg(test_name) { task checkPythonVersion(type: Exec) { commandLine python_executable, '-c', - 'import sys; assert (3, 11) > sys.version_info >= (3, 7), f"Python version {sys.version_info[:2]} not allowed"' + 'import sys; assert (3, 11) > sys.version_info >= (3, 8), f"Python version {sys.version_info[:2]} not allowed"' } task environmentSetup(type: Exec, dependsOn: checkPythonVersion) { diff --git a/metadata-ingestion/cli-ingestion.md b/metadata-ingestion/cli-ingestion.md index cbdde2cd301678..48cc4ef09db910 100644 --- a/metadata-ingestion/cli-ingestion.md +++ b/metadata-ingestion/cli-ingestion.md @@ -2,26 +2,31 @@ ## Installing the CLI -Make sure you have installed DataHub CLI before following this guide. +Make sure you have installed DataHub CLI before following this guide. + ```shell -# Requires Python 3.7+ +# Requires Python 3.8+ python3 -m pip install --upgrade pip wheel setuptools python3 -m pip install --upgrade acryl-datahub # validate that the install was successful datahub version # If you see "command not found", try running this instead: python3 -m datahub version ``` -Check out the [CLI Installation Guide](../docs/cli.md#installation) for more installation options and troubleshooting tips. + +Check out the [CLI Installation Guide](../docs/cli.md#installation) for more installation options and troubleshooting tips. After that, install the required plugin for the ingestion. ```shell pip install 'acryl-datahub[datahub-rest]' # install the required plugin ``` -Check out the [alternative installation options](../docs/cli.md#alternate-installation-options) for more reference. + +Check out the [alternative installation options](../docs/cli.md#alternate-installation-options) for more reference. ## Configuring a Recipe + Create a recipe.yml file that defines the source and sink for metadata, as shown below. + ```yaml # my_reipe.yml source: @@ -29,7 +34,7 @@ source: config: option_1: ... - + sink: type: config: @@ -39,7 +44,8 @@ sink: For more information and examples on configuring recipes, please refer to [Recipes](recipe_overview.md). ## Ingesting Metadata -You can run ingestion using `datahub ingest` like below. + +You can run ingestion using `datahub ingest` like below. ```shell datahub ingest -c @@ -48,6 +54,7 @@ datahub ingest -c ## Reference Please refer the following pages for advanced guids on CLI ingestion. + - [Reference for `datahub ingest` command](../docs/cli.md#ingest) - [UI Ingestion Guide](../docs/ui-ingestion.md) @@ -56,4 +63,4 @@ DataHub server uses a 3 digit versioning scheme, while the CLI uses a 4 digit sc We do this because we do CLI releases at a much higher frequency than server releases, usually every few days vs twice a month. For ingestion sources, any breaking changes will be highlighted in the [release notes](../docs/how/updating-datahub.md). When fields are deprecated or otherwise changed, we will try to maintain backwards compatibility for two server releases, which is about 4-6 weeks. The CLI will also print warnings whenever deprecated options are used. -::: \ No newline at end of file +::: diff --git a/metadata-ingestion/developing.md b/metadata-ingestion/developing.md index fc3a689124b2c1..47e325171ddcc6 100644 --- a/metadata-ingestion/developing.md +++ b/metadata-ingestion/developing.md @@ -9,10 +9,10 @@ Also take a look at the guide to [adding a source](./adding-source.md). ### Requirements -1. Python 3.7+ must be installed in your host environment. +1. Python 3.8+ must be installed in your host environment. 2. Java 17 (gradle won't work with newer or older versions) -4. On Debian/Ubuntu: `sudo apt install python3-dev python3-venv` -5. On Fedora (if using LDAP source integration): `sudo yum install openldap-devel` +3. On Debian/Ubuntu: `sudo apt install python3-dev python3-venv` +4. On Fedora (if using LDAP source integration): `sudo yum install openldap-devel` ### Set up your Python environment diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index af2b54ba1cefa5..f8d51997330a9d 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -1,4 +1,3 @@ -import sys from typing import Dict, Set import setuptools @@ -11,7 +10,6 @@ base_requirements = { # Typing extension should be >=3.10.0.2 ideally but we can't restrict due to a Airflow 2.1 dependency conflict. "typing_extensions>=3.7.4.3", - "mypy_extensions>=0.4.3", # Actual dependencies. "typing-inspect", # pydantic 1.8.2 is incompatible with mypy 0.910. @@ -48,9 +46,7 @@ "click-spinner", "requests_file", "jsonref", - # jsonschema drops python 3.7 support in v4.18.0 - "jsonschema<=4.17.3; python_version < '3.8'", - "jsonschema; python_version >= '3.8'", + "jsonschema", "ruamel.yaml", } @@ -463,7 +459,7 @@ "black==22.12.0", "coverage>=5.1", "faker>=18.4.0", - "flake8>=3.8.3", # DEPRECATION: Once we drop Python 3.7, we can pin to 6.x. + "flake8>=6.0.0", "flake8-tidy-imports>=4.3.0", "flake8-bugbear==23.3.12", "isort>=5.7.0", @@ -489,9 +485,9 @@ "delta-lake", "druid", "elasticsearch", - "feast" if sys.version_info >= (3, 8) else None, - "iceberg" if sys.version_info >= (3, 8) else None, - "mlflow" if sys.version_info >= (3, 8) else None, + "feast", + "iceberg", + "mlflow", "json-schema", "ldap", "looker", @@ -544,14 +540,14 @@ "clickhouse", "delta-lake", "druid", - "feast" if sys.version_info >= (3, 8) else None, + "feast", "hana", "hive", - "iceberg" if sys.version_info >= (3, 8) else None, + "iceberg", "kafka-connect", "ldap", "mongodb", - "mssql" if sys.version_info >= (3, 8) else None, + "mssql", "mysql", "mariadb", "redash", @@ -699,7 +695,6 @@ "Programming Language :: Python", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3 :: Only", - "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", @@ -716,7 +711,7 @@ ], # Package info. zip_safe=False, - python_requires=">=3.7", + python_requires=">=3.8", package_dir={"": "src"}, packages=setuptools.find_namespace_packages(where="./src"), package_data={ diff --git a/metadata-ingestion/src/datahub/__init__.py b/metadata-ingestion/src/datahub/__init__.py index a470de7b500be3..b254deb7fa30e5 100644 --- a/metadata-ingestion/src/datahub/__init__.py +++ b/metadata-ingestion/src/datahub/__init__.py @@ -16,16 +16,9 @@ def nice_version_name() -> str: return __version__ -if sys.version_info < (3, 7): +if sys.version_info < (3, 8): warnings.warn( - "DataHub requires Python 3.7 or newer. " - "Please upgrade your Python version to continue using DataHub.", - FutureWarning, - stacklevel=2, - ) -elif sys.version_info < (3, 8): - warnings.warn( - "DataHub will require Python 3.8 or newer soon. " + "DataHub requires Python 3.8 or newer. " "Please upgrade your Python version to continue using DataHub.", FutureWarning, stacklevel=2, diff --git a/metadata-ingestion/src/datahub/ingestion/api/report.py b/metadata-ingestion/src/datahub/ingestion/api/report.py index fcca7675917746..08b20d9e856911 100644 --- a/metadata-ingestion/src/datahub/ingestion/api/report.py +++ b/metadata-ingestion/src/datahub/ingestion/api/report.py @@ -2,11 +2,10 @@ import json import logging import pprint -import sys from dataclasses import dataclass from datetime import datetime, timedelta from enum import Enum -from typing import Any, Dict, Optional +from typing import Any, Optional import humanfriendly import pydantic @@ -19,12 +18,6 @@ logger = logging.getLogger(__name__) LogLevel = Literal["ERROR", "WARNING", "INFO", "DEBUG"] -# The sort_dicts option was added in Python 3.8. -if sys.version_info >= (3, 8): - PPRINT_OPTIONS = {"sort_dicts": False} -else: - PPRINT_OPTIONS: Dict = {} - @runtime_checkable class SupportsAsObj(Protocol): @@ -32,14 +25,6 @@ def as_obj(self) -> dict: ... -def _stacklevel_if_supported(level: int) -> dict: - # The logging module added support for stacklevel in Python 3.8. - if sys.version_info >= (3, 8): - return {"stacklevel": level} - else: - return {} - - @dataclass class Report(SupportsAsObj): @staticmethod @@ -95,7 +80,7 @@ def as_obj(self) -> dict: } def as_string(self) -> str: - return pprint.pformat(self.as_obj(), width=150, **PPRINT_OPTIONS) + return pprint.pformat(self.as_obj(), width=150, sort_dicts=False) def as_json(self) -> str: return json.dumps(self.as_obj()) @@ -118,7 +103,7 @@ def logger_sev(self) -> int: return log_levels[self.severity] def log(self, msg: str) -> None: - logger.log(level=self.logger_sev, msg=msg, **_stacklevel_if_supported(3)) + logger.log(level=self.logger_sev, msg=msg, stacklevel=3) class EntityFilterReport(ReportAttribute): diff --git a/metadata-ingestion/src/datahub/ingestion/source/feast.py b/metadata-ingestion/src/datahub/ingestion/source/feast.py index 8faba7d1133729..db0c8e9c39e7bf 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/feast.py +++ b/metadata-ingestion/src/datahub/ingestion/source/feast.py @@ -1,8 +1,3 @@ -import sys - -if sys.version_info < (3, 8): - raise ImportError("Feast is only supported on Python 3.8+") - from dataclasses import dataclass from typing import Dict, Iterable, List, Optional, Tuple, Union diff --git a/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg.py b/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg.py index cc7f646dcb884c..2585260434a384 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg.py +++ b/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg.py @@ -1,8 +1,3 @@ -import sys - -if sys.version_info < (3, 8): - raise ImportError("Iceberg is only supported on Python 3.8+") - import json import logging import uuid diff --git a/metadata-ingestion/src/datahub/ingestion/source/mlflow.py b/metadata-ingestion/src/datahub/ingestion/source/mlflow.py index 0668defe7b0c6c..cef6d2b1bb5774 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/mlflow.py +++ b/metadata-ingestion/src/datahub/ingestion/source/mlflow.py @@ -1,9 +1,3 @@ -import sys - -if sys.version_info < (3, 8): - raise ImportError("MLflow is only supported on Python 3.8+") - - from dataclasses import dataclass from typing import Any, Callable, Iterable, Optional, TypeVar, Union diff --git a/metadata-ingestion/src/datahub/ingestion/source/schema_inference/object.py b/metadata-ingestion/src/datahub/ingestion/source/schema_inference/object.py index b58bdf41ccaa5a..5a11d020547e8e 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/schema_inference/object.py +++ b/metadata-ingestion/src/datahub/ingestion/source/schema_inference/object.py @@ -1,7 +1,7 @@ from collections import Counter from typing import Any, Counter as CounterType, Dict, Sequence, Tuple, Union -from mypy_extensions import TypedDict +from typing_extensions import TypedDict class BasicSchemaDescription(TypedDict): diff --git a/metadata-ingestion/tests/integration/feast/test_feast_repository.py b/metadata-ingestion/tests/integration/feast/test_feast_repository.py index eab37f67ed155d..a6bdce67222896 100644 --- a/metadata-ingestion/tests/integration/feast/test_feast_repository.py +++ b/metadata-ingestion/tests/integration/feast/test_feast_repository.py @@ -1,6 +1,3 @@ -import sys - -import pytest from freezegun import freeze_time from datahub.ingestion.run.pipeline import Pipeline @@ -8,10 +5,6 @@ FROZEN_TIME = "2020-04-14 07:00:00" -pytestmark = pytest.mark.skipif( - sys.version_info < (3, 8), reason="requires python 3.8 or higher" -) - @freeze_time(FROZEN_TIME) def test_feast_repository_ingest(pytestconfig, tmp_path, mock_time): diff --git a/metadata-ingestion/tests/integration/iceberg/test_iceberg.py b/metadata-ingestion/tests/integration/iceberg/test_iceberg.py index 65ede11c3f1c01..a9ab43169405de 100644 --- a/metadata-ingestion/tests/integration/iceberg/test_iceberg.py +++ b/metadata-ingestion/tests/integration/iceberg/test_iceberg.py @@ -1,5 +1,4 @@ import subprocess -import sys from typing import Any, Dict, List from unittest.mock import patch @@ -15,13 +14,7 @@ validate_all_providers_have_committed_successfully, ) -pytestmark = [ - pytest.mark.integration_batch_1, - # Skip tests if not on Python 3.8 or higher. - pytest.mark.skipif( - sys.version_info < (3, 8), reason="Requires python 3.8 or higher" - ), -] +pytestmark = pytest.mark.integration_batch_1 FROZEN_TIME = "2020-04-14 07:00:00" GMS_PORT = 8080 GMS_SERVER = f"http://localhost:{GMS_PORT}" diff --git a/metadata-ingestion/tests/integration/mlflow/test_mlflow_source.py b/metadata-ingestion/tests/integration/mlflow/test_mlflow_source.py index 76af666526555a..155199d5a04e97 100644 --- a/metadata-ingestion/tests/integration/mlflow/test_mlflow_source.py +++ b/metadata-ingestion/tests/integration/mlflow/test_mlflow_source.py @@ -1,104 +1,106 @@ -import sys +from pathlib import Path +from typing import Any, Dict, TypeVar -if sys.version_info >= (3, 8): - from pathlib import Path - from typing import Any, Dict, TypeVar +import pytest +from mlflow import MlflowClient - import pytest - from mlflow import MlflowClient +from datahub.ingestion.run.pipeline import Pipeline +from tests.test_helpers import mce_helpers - from datahub.ingestion.run.pipeline import Pipeline - from tests.test_helpers import mce_helpers +T = TypeVar("T") - T = TypeVar("T") - @pytest.fixture - def tracking_uri(tmp_path: Path) -> str: - return str(tmp_path / "mlruns") +@pytest.fixture +def tracking_uri(tmp_path: Path) -> str: + return str(tmp_path / "mlruns") - @pytest.fixture - def sink_file_path(tmp_path: Path) -> str: - return str(tmp_path / "mlflow_source_mcps.json") - @pytest.fixture - def pipeline_config(tracking_uri: str, sink_file_path: str) -> Dict[str, Any]: - source_type = "mlflow" - return { - "run_id": "mlflow-source-test", - "source": { - "type": source_type, - "config": { - "tracking_uri": tracking_uri, - }, +@pytest.fixture +def sink_file_path(tmp_path: Path) -> str: + return str(tmp_path / "mlflow_source_mcps.json") + + +@pytest.fixture +def pipeline_config(tracking_uri: str, sink_file_path: str) -> Dict[str, Any]: + source_type = "mlflow" + return { + "run_id": "mlflow-source-test", + "source": { + "type": source_type, + "config": { + "tracking_uri": tracking_uri, }, - "sink": { - "type": "file", - "config": { - "filename": sink_file_path, - }, + }, + "sink": { + "type": "file", + "config": { + "filename": sink_file_path, }, - } + }, + } + + +@pytest.fixture +def generate_mlflow_data(tracking_uri: str) -> None: + client = MlflowClient(tracking_uri=tracking_uri) + experiment_name = "test-experiment" + run_name = "test-run" + model_name = "test-model" + test_experiment_id = client.create_experiment(experiment_name) + test_run = client.create_run( + experiment_id=test_experiment_id, + run_name=run_name, + ) + client.log_param( + run_id=test_run.info.run_id, + key="p", + value=1, + ) + client.log_metric( + run_id=test_run.info.run_id, + key="m", + value=0.85, + ) + client.create_registered_model( + name=model_name, + tags=dict( + model_id=1, + model_env="test", + ), + description="This a test registered model", + ) + client.create_model_version( + name=model_name, + source="dummy_dir/dummy_file", + run_id=test_run.info.run_id, + tags=dict(model_version_id=1), + ) + client.transition_model_version_stage( + name=model_name, + version="1", + stage="Archived", + ) - @pytest.fixture - def generate_mlflow_data(tracking_uri: str) -> None: - client = MlflowClient(tracking_uri=tracking_uri) - experiment_name = "test-experiment" - run_name = "test-run" - model_name = "test-model" - test_experiment_id = client.create_experiment(experiment_name) - test_run = client.create_run( - experiment_id=test_experiment_id, - run_name=run_name, - ) - client.log_param( - run_id=test_run.info.run_id, - key="p", - value=1, - ) - client.log_metric( - run_id=test_run.info.run_id, - key="m", - value=0.85, - ) - client.create_registered_model( - name=model_name, - tags=dict( - model_id=1, - model_env="test", - ), - description="This a test registered model", - ) - client.create_model_version( - name=model_name, - source="dummy_dir/dummy_file", - run_id=test_run.info.run_id, - tags=dict(model_version_id=1), - ) - client.transition_model_version_stage( - name=model_name, - version="1", - stage="Archived", - ) - def test_ingestion( - pytestconfig, - mock_time, - sink_file_path, - pipeline_config, - generate_mlflow_data, - ): - print(f"MCPs file path: {sink_file_path}") - golden_file_path = ( - pytestconfig.rootpath / "tests/integration/mlflow/mlflow_mcps_golden.json" - ) +def test_ingestion( + pytestconfig, + mock_time, + sink_file_path, + pipeline_config, + generate_mlflow_data, +): + print(f"MCPs file path: {sink_file_path}") + golden_file_path = ( + pytestconfig.rootpath / "tests/integration/mlflow/mlflow_mcps_golden.json" + ) - pipeline = Pipeline.create(pipeline_config) - pipeline.run() - pipeline.pretty_print_summary() - pipeline.raise_from_status() + pipeline = Pipeline.create(pipeline_config) + pipeline.run() + pipeline.pretty_print_summary() + pipeline.raise_from_status() - mce_helpers.check_golden_file( - pytestconfig=pytestconfig, - output_path=sink_file_path, - golden_path=golden_file_path, - ) + mce_helpers.check_golden_file( + pytestconfig=pytestconfig, + output_path=sink_file_path, + golden_path=golden_file_path, + ) diff --git a/metadata-ingestion/tests/integration/sql_server/test_sql_server.py b/metadata-ingestion/tests/integration/sql_server/test_sql_server.py index 5ed672d527264a..f439a322c26771 100644 --- a/metadata-ingestion/tests/integration/sql_server/test_sql_server.py +++ b/metadata-ingestion/tests/integration/sql_server/test_sql_server.py @@ -1,6 +1,5 @@ import os import subprocess -import sys import time import pytest @@ -9,10 +8,6 @@ from tests.test_helpers.click_helpers import run_datahub_cmd from tests.test_helpers.docker_helpers import cleanup_image, wait_for_port -pytestmark = pytest.mark.skipif( - sys.version_info < (3, 8), reason="requires python 3.8 or higher" -) - @pytest.fixture(scope="module") def mssql_runner(docker_compose_runner, pytestconfig): diff --git a/metadata-ingestion/tests/unit/test_iceberg.py b/metadata-ingestion/tests/unit/test_iceberg.py index 768d4f958af1fb..e2b463004f5a13 100644 --- a/metadata-ingestion/tests/unit/test_iceberg.py +++ b/metadata-ingestion/tests/unit/test_iceberg.py @@ -1,482 +1,477 @@ -import sys import uuid from decimal import Decimal from typing import Any, Optional import pytest from pydantic import ValidationError +from pyiceberg.schema import Schema +from pyiceberg.types import ( + BinaryType, + BooleanType, + DateType, + DecimalType, + DoubleType, + FixedType, + FloatType, + IcebergType, + IntegerType, + ListType, + LongType, + MapType, + NestedField, + PrimitiveType, + StringType, + StructType, + TimestampType, + TimestamptzType, + TimeType, + UUIDType, +) + +from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.source.iceberg.iceberg import ( + IcebergProfiler, + IcebergSource, + IcebergSourceConfig, +) +from datahub.ingestion.source.iceberg.iceberg_common import IcebergCatalogConfig +from datahub.metadata.com.linkedin.pegasus2avro.schema import ArrayType, SchemaField +from datahub.metadata.schema_classes import ( + ArrayTypeClass, + BooleanTypeClass, + BytesTypeClass, + DateTypeClass, + FixedTypeClass, + NumberTypeClass, + RecordTypeClass, + StringTypeClass, + TimeTypeClass, +) -if sys.version_info >= (3, 8): - from pyiceberg.schema import Schema - from pyiceberg.types import ( - BinaryType, - BooleanType, - DateType, - DecimalType, - DoubleType, - FixedType, - FloatType, - IcebergType, - IntegerType, - ListType, - LongType, - MapType, - NestedField, - PrimitiveType, - StringType, - StructType, - TimestampType, - TimestamptzType, - TimeType, - UUIDType, - ) - from datahub.ingestion.api.common import PipelineContext - from datahub.ingestion.source.iceberg.iceberg import ( - IcebergProfiler, - IcebergSource, - IcebergSourceConfig, +def with_iceberg_source() -> IcebergSource: + catalog: IcebergCatalogConfig = IcebergCatalogConfig( + name="test", type="rest", config={} ) - from datahub.ingestion.source.iceberg.iceberg_common import IcebergCatalogConfig - from datahub.metadata.com.linkedin.pegasus2avro.schema import ArrayType, SchemaField - from datahub.metadata.schema_classes import ( - ArrayTypeClass, - BooleanTypeClass, - BytesTypeClass, - DateTypeClass, - FixedTypeClass, - NumberTypeClass, - RecordTypeClass, - StringTypeClass, - TimeTypeClass, + return IcebergSource( + ctx=PipelineContext(run_id="iceberg-source-test"), + config=IcebergSourceConfig(catalog=catalog), ) - pytestmark = pytest.mark.skipif( - sys.version_info < (3, 8), reason="requires python 3.8 or higher" + +def with_iceberg_profiler() -> IcebergProfiler: + iceberg_source_instance = with_iceberg_source() + return IcebergProfiler( + iceberg_source_instance.report, iceberg_source_instance.config.profiling ) - def with_iceberg_source() -> IcebergSource: - catalog: IcebergCatalogConfig = IcebergCatalogConfig( - name="test", type="rest", config={} - ) - return IcebergSource( - ctx=PipelineContext(run_id="iceberg-source-test"), - config=IcebergSourceConfig(catalog=catalog), - ) - def with_iceberg_profiler() -> IcebergProfiler: - iceberg_source_instance = with_iceberg_source() - return IcebergProfiler( - iceberg_source_instance.report, iceberg_source_instance.config.profiling - ) +def assert_field( + schema_field: SchemaField, + expected_description: Optional[str], + expected_nullable: bool, + expected_type: Any, +) -> None: + assert ( + schema_field.description == expected_description + ), f"Field description '{schema_field.description}' is different from expected description '{expected_description}'" + assert ( + schema_field.nullable == expected_nullable + ), f"Field nullable '{schema_field.nullable}' is different from expected nullable '{expected_nullable}'" + assert isinstance( + schema_field.type.type, expected_type + ), f"Field type {schema_field.type.type} is different from expected type {expected_type}" - def assert_field( - schema_field: SchemaField, - expected_description: Optional[str], - expected_nullable: bool, - expected_type: Any, - ) -> None: - assert ( - schema_field.description == expected_description - ), f"Field description '{schema_field.description}' is different from expected description '{expected_description}'" + +def test_config_no_catalog(): + """ + Test when no Iceberg catalog is provided. + """ + with pytest.raises(ValidationError, match="catalog"): + IcebergSourceConfig() # type: ignore + + +def test_config_catalog_not_configured(): + """ + Test when an Iceberg catalog is provided, but not properly configured. + """ + with pytest.raises(ValidationError): + IcebergCatalogConfig() # type: ignore + + with pytest.raises(ValidationError, match="conf"): + IcebergCatalogConfig(type="a type") # type: ignore + + with pytest.raises(ValidationError, match="type"): + IcebergCatalogConfig(conf={}) # type: ignore + + +def test_config_for_tests(): + """ + Test valid iceberg source that will be used in unit tests. + """ + with_iceberg_source() + + +@pytest.mark.parametrize( + "iceberg_type, expected_schema_field_type", + [ + (BinaryType(), BytesTypeClass), + (BooleanType(), BooleanTypeClass), + (DateType(), DateTypeClass), + ( + DecimalType(3, 2), + NumberTypeClass, + ), + (DoubleType(), NumberTypeClass), + (FixedType(4), FixedTypeClass), + (FloatType(), NumberTypeClass), + (IntegerType(), NumberTypeClass), + (LongType(), NumberTypeClass), + (StringType(), StringTypeClass), + ( + TimestampType(), + TimeTypeClass, + ), + ( + TimestamptzType(), + TimeTypeClass, + ), + (TimeType(), TimeTypeClass), + ( + UUIDType(), + StringTypeClass, + ), + ], +) +def test_iceberg_primitive_type_to_schema_field( + iceberg_type: PrimitiveType, expected_schema_field_type: Any +) -> None: + """ + Test converting a primitive typed Iceberg field to a SchemaField + """ + iceberg_source_instance = with_iceberg_source() + for column in [ + NestedField( + 1, "required_field", iceberg_type, True, "required field documentation" + ), + NestedField( + 1, "optional_field", iceberg_type, False, "optional field documentation" + ), + ]: + schema = Schema(column) + schema_fields = iceberg_source_instance._get_schema_fields_for_schema(schema) assert ( - schema_field.nullable == expected_nullable - ), f"Field nullable '{schema_field.nullable}' is different from expected nullable '{expected_nullable}'" - assert isinstance( - schema_field.type.type, expected_type - ), f"Field type {schema_field.type.type} is different from expected type {expected_type}" - - def test_config_no_catalog(): - """ - Test when no Iceberg catalog is provided. - """ - with pytest.raises(ValidationError, match="catalog"): - IcebergSourceConfig() # type: ignore - - def test_config_catalog_not_configured(): - """ - Test when an Iceberg catalog is provided, but not properly configured. - """ - with pytest.raises(ValidationError): - IcebergCatalogConfig() # type: ignore - - with pytest.raises(ValidationError, match="conf"): - IcebergCatalogConfig(type="a type") # type: ignore - - with pytest.raises(ValidationError, match="type"): - IcebergCatalogConfig(conf={}) # type: ignore - - def test_config_for_tests(): - """ - Test valid iceberg source that will be used in unit tests. - """ - with_iceberg_source() - - @pytest.mark.parametrize( - "iceberg_type, expected_schema_field_type", - [ - (BinaryType(), BytesTypeClass), - (BooleanType(), BooleanTypeClass), - (DateType(), DateTypeClass), - ( - DecimalType(3, 2), - NumberTypeClass, - ), - (DoubleType(), NumberTypeClass), - (FixedType(4), FixedTypeClass), - (FloatType(), NumberTypeClass), - (IntegerType(), NumberTypeClass), - (LongType(), NumberTypeClass), - (StringType(), StringTypeClass), - ( - TimestampType(), - TimeTypeClass, - ), - ( - TimestamptzType(), - TimeTypeClass, - ), - (TimeType(), TimeTypeClass), - ( - UUIDType(), - StringTypeClass, - ), - ], - ) - def test_iceberg_primitive_type_to_schema_field( - iceberg_type: PrimitiveType, expected_schema_field_type: Any - ) -> None: - """ - Test converting a primitive typed Iceberg field to a SchemaField - """ + len(schema_fields) == 1 + ), f"Expected 1 field, but got {len(schema_fields)}" + assert_field( + schema_fields[0], + column.doc, + column.optional, + expected_schema_field_type, + ) + + +@pytest.mark.parametrize( + "iceberg_type, expected_array_nested_type", + [ + (BinaryType(), "bytes"), + (BooleanType(), "boolean"), + (DateType(), "date"), + ( + DecimalType(3, 2), + "decimal", + ), + (DoubleType(), "double"), + (FixedType(4), "fixed"), + (FloatType(), "float"), + (IntegerType(), "int"), + (LongType(), "long"), + (StringType(), "string"), + ( + TimestampType(), + "timestamp-micros", + ), + ( + TimestamptzType(), + "timestamp-micros", + ), + (TimeType(), "time-micros"), + ( + UUIDType(), + "uuid", + ), + ], +) +def test_iceberg_list_to_schema_field( + iceberg_type: PrimitiveType, expected_array_nested_type: Any +) -> None: + """ + Test converting a list typed Iceberg field to an ArrayType SchemaField, including the list nested type. + """ + for list_column in [ + NestedField( + 1, + "listField", + ListType(2, iceberg_type, True), + True, + "required field, required element documentation", + ), + NestedField( + 1, + "listField", + ListType(2, iceberg_type, False), + True, + "required field, optional element documentation", + ), + NestedField( + 1, + "listField", + ListType(2, iceberg_type, True), + False, + "optional field, required element documentation", + ), + NestedField( + 1, + "listField", + ListType(2, iceberg_type, False), + False, + "optional field, optional element documentation", + ), + ]: iceberg_source_instance = with_iceberg_source() - for column in [ - NestedField( - 1, "required_field", iceberg_type, True, "required field documentation" - ), - NestedField( - 1, "optional_field", iceberg_type, False, "optional field documentation" - ), - ]: - schema = Schema(column) - schema_fields = iceberg_source_instance._get_schema_fields_for_schema( - schema - ) - assert ( - len(schema_fields) == 1 - ), f"Expected 1 field, but got {len(schema_fields)}" - assert_field( - schema_fields[0], - column.doc, - column.optional, - expected_schema_field_type, - ) - - @pytest.mark.parametrize( - "iceberg_type, expected_array_nested_type", - [ - (BinaryType(), "bytes"), - (BooleanType(), "boolean"), - (DateType(), "date"), - ( - DecimalType(3, 2), - "decimal", - ), - (DoubleType(), "double"), - (FixedType(4), "fixed"), - (FloatType(), "float"), - (IntegerType(), "int"), - (LongType(), "long"), - (StringType(), "string"), - ( - TimestampType(), - "timestamp-micros", - ), - ( - TimestamptzType(), - "timestamp-micros", - ), - (TimeType(), "time-micros"), - ( - UUIDType(), - "uuid", - ), - ], - ) - def test_iceberg_list_to_schema_field( - iceberg_type: PrimitiveType, expected_array_nested_type: Any - ) -> None: - """ - Test converting a list typed Iceberg field to an ArrayType SchemaField, including the list nested type. - """ - for list_column in [ - NestedField( - 1, - "listField", - ListType(2, iceberg_type, True), - True, - "required field, required element documentation", - ), - NestedField( - 1, - "listField", - ListType(2, iceberg_type, False), - True, - "required field, optional element documentation", - ), - NestedField( - 1, - "listField", - ListType(2, iceberg_type, True), - False, - "optional field, required element documentation", - ), - NestedField( - 1, - "listField", - ListType(2, iceberg_type, False), - False, - "optional field, optional element documentation", - ), - ]: - iceberg_source_instance = with_iceberg_source() - schema = Schema(list_column) - schema_fields = iceberg_source_instance._get_schema_fields_for_schema( - schema - ) - assert ( - len(schema_fields) == 1 - ), f"Expected 1 field, but got {len(schema_fields)}" - assert_field( - schema_fields[0], list_column.doc, list_column.optional, ArrayTypeClass - ) - assert isinstance( - schema_fields[0].type.type, ArrayType - ), f"Field type {schema_fields[0].type.type} was expected to be {ArrayType}" - arrayType: ArrayType = schema_fields[0].type.type - assert arrayType.nestedType == [ - expected_array_nested_type - ], f"List Field nested type {arrayType.nestedType} was expected to be {expected_array_nested_type}" - - @pytest.mark.parametrize( - "iceberg_type, expected_map_type", - [ - (BinaryType(), BytesTypeClass), - (BooleanType(), BooleanTypeClass), - (DateType(), DateTypeClass), - ( - DecimalType(3, 2), - NumberTypeClass, - ), - (DoubleType(), NumberTypeClass), - (FixedType(4), FixedTypeClass), - (FloatType(), NumberTypeClass), - (IntegerType(), NumberTypeClass), - (LongType(), NumberTypeClass), - (StringType(), StringTypeClass), - ( - TimestampType(), - TimeTypeClass, - ), - ( - TimestamptzType(), - TimeTypeClass, - ), - (TimeType(), TimeTypeClass), - ( - UUIDType(), - StringTypeClass, - ), - ], - ) - def test_iceberg_map_to_schema_field( - iceberg_type: PrimitiveType, expected_map_type: Any - ) -> None: - """ - Test converting a map typed Iceberg field to a MapType SchemaField, where the key is the same type as the value. - """ - for map_column in [ - NestedField( - 1, - "mapField", - MapType(11, iceberg_type, 12, iceberg_type, True), - True, - "required field, required value documentation", - ), - NestedField( - 1, - "mapField", - MapType(11, iceberg_type, 12, iceberg_type, False), - True, - "required field, optional value documentation", - ), - NestedField( - 1, - "mapField", - MapType(11, iceberg_type, 12, iceberg_type, True), - False, - "optional field, required value documentation", - ), - NestedField( - 1, - "mapField", - MapType(11, iceberg_type, 12, iceberg_type, False), - False, - "optional field, optional value documentation", - ), - ]: - iceberg_source_instance = with_iceberg_source() - schema = Schema(map_column) - schema_fields = iceberg_source_instance._get_schema_fields_for_schema( - schema - ) - # Converting an Iceberg Map type will be done by creating an array of struct(key, value) records. - # The first field will be the array. - assert ( - len(schema_fields) == 3 - ), f"Expected 3 fields, but got {len(schema_fields)}" - assert_field( - schema_fields[0], map_column.doc, map_column.optional, ArrayTypeClass - ) - - # The second field will be the key type - assert_field(schema_fields[1], None, False, expected_map_type) - - # The third field will be the value type - assert_field( - schema_fields[2], - None, - not map_column.field_type.value_required, - expected_map_type, - ) - - @pytest.mark.parametrize( - "iceberg_type, expected_schema_field_type", - [ - (BinaryType(), BytesTypeClass), - (BooleanType(), BooleanTypeClass), - (DateType(), DateTypeClass), - ( - DecimalType(3, 2), - NumberTypeClass, - ), - (DoubleType(), NumberTypeClass), - (FixedType(4), FixedTypeClass), - (FloatType(), NumberTypeClass), - (IntegerType(), NumberTypeClass), - (LongType(), NumberTypeClass), - (StringType(), StringTypeClass), - ( - TimestampType(), - TimeTypeClass, - ), - ( - TimestamptzType(), - TimeTypeClass, - ), - (TimeType(), TimeTypeClass), - ( - UUIDType(), - StringTypeClass, - ), - ], - ) - def test_iceberg_struct_to_schema_field( - iceberg_type: PrimitiveType, expected_schema_field_type: Any - ) -> None: - """ - Test converting a struct typed Iceberg field to a RecordType SchemaField. - """ - field1 = NestedField(11, "field1", iceberg_type, True, "field documentation") - struct_column = NestedField( - 1, "structField", StructType(field1), True, "struct documentation" + schema = Schema(list_column) + schema_fields = iceberg_source_instance._get_schema_fields_for_schema(schema) + assert ( + len(schema_fields) == 1 + ), f"Expected 1 field, but got {len(schema_fields)}" + assert_field( + schema_fields[0], list_column.doc, list_column.optional, ArrayTypeClass ) + assert isinstance( + schema_fields[0].type.type, ArrayType + ), f"Field type {schema_fields[0].type.type} was expected to be {ArrayType}" + arrayType: ArrayType = schema_fields[0].type.type + assert arrayType.nestedType == [ + expected_array_nested_type + ], f"List Field nested type {arrayType.nestedType} was expected to be {expected_array_nested_type}" + + +@pytest.mark.parametrize( + "iceberg_type, expected_map_type", + [ + (BinaryType(), BytesTypeClass), + (BooleanType(), BooleanTypeClass), + (DateType(), DateTypeClass), + ( + DecimalType(3, 2), + NumberTypeClass, + ), + (DoubleType(), NumberTypeClass), + (FixedType(4), FixedTypeClass), + (FloatType(), NumberTypeClass), + (IntegerType(), NumberTypeClass), + (LongType(), NumberTypeClass), + (StringType(), StringTypeClass), + ( + TimestampType(), + TimeTypeClass, + ), + ( + TimestamptzType(), + TimeTypeClass, + ), + (TimeType(), TimeTypeClass), + ( + UUIDType(), + StringTypeClass, + ), + ], +) +def test_iceberg_map_to_schema_field( + iceberg_type: PrimitiveType, expected_map_type: Any +) -> None: + """ + Test converting a map typed Iceberg field to a MapType SchemaField, where the key is the same type as the value. + """ + for map_column in [ + NestedField( + 1, + "mapField", + MapType(11, iceberg_type, 12, iceberg_type, True), + True, + "required field, required value documentation", + ), + NestedField( + 1, + "mapField", + MapType(11, iceberg_type, 12, iceberg_type, False), + True, + "required field, optional value documentation", + ), + NestedField( + 1, + "mapField", + MapType(11, iceberg_type, 12, iceberg_type, True), + False, + "optional field, required value documentation", + ), + NestedField( + 1, + "mapField", + MapType(11, iceberg_type, 12, iceberg_type, False), + False, + "optional field, optional value documentation", + ), + ]: iceberg_source_instance = with_iceberg_source() - schema = Schema(struct_column) + schema = Schema(map_column) schema_fields = iceberg_source_instance._get_schema_fields_for_schema(schema) + # Converting an Iceberg Map type will be done by creating an array of struct(key, value) records. + # The first field will be the array. assert ( - len(schema_fields) == 2 - ), f"Expected 2 fields, but got {len(schema_fields)}" + len(schema_fields) == 3 + ), f"Expected 3 fields, but got {len(schema_fields)}" assert_field( - schema_fields[0], struct_column.doc, struct_column.optional, RecordTypeClass + schema_fields[0], map_column.doc, map_column.optional, ArrayTypeClass ) + + # The second field will be the key type + assert_field(schema_fields[1], None, False, expected_map_type) + + # The third field will be the value type assert_field( - schema_fields[1], field1.doc, field1.optional, expected_schema_field_type + schema_fields[2], + None, + not map_column.field_type.value_required, + expected_map_type, ) - @pytest.mark.parametrize( - "value_type, value, expected_value", - [ - (BinaryType(), bytes([1, 2, 3, 4, 5]), "b'\\x01\\x02\\x03\\x04\\x05'"), - (BooleanType(), True, "True"), - (DateType(), 19543, "2023-07-05"), - (DecimalType(3, 2), Decimal((0, (3, 1, 4), -2)), "3.14"), - (DoubleType(), 3.4, "3.4"), - (FixedType(4), bytes([1, 2, 3, 4]), "b'\\x01\\x02\\x03\\x04'"), - (FloatType(), 3.4, "3.4"), - (IntegerType(), 3, "3"), - (LongType(), 4294967295000, "4294967295000"), - (StringType(), "a string", "a string"), - ( - TimestampType(), - 1688559488157000, - "2023-07-05T12:18:08.157000", - ), - ( - TimestamptzType(), - 1688559488157000, - "2023-07-05T12:18:08.157000+00:00", - ), - (TimeType(), 40400000000, "11:13:20"), - ( - UUIDType(), - uuid.UUID("00010203-0405-0607-0809-0a0b0c0d0e0f"), - "00010203-0405-0607-0809-0a0b0c0d0e0f", - ), - ], + +@pytest.mark.parametrize( + "iceberg_type, expected_schema_field_type", + [ + (BinaryType(), BytesTypeClass), + (BooleanType(), BooleanTypeClass), + (DateType(), DateTypeClass), + ( + DecimalType(3, 2), + NumberTypeClass, + ), + (DoubleType(), NumberTypeClass), + (FixedType(4), FixedTypeClass), + (FloatType(), NumberTypeClass), + (IntegerType(), NumberTypeClass), + (LongType(), NumberTypeClass), + (StringType(), StringTypeClass), + ( + TimestampType(), + TimeTypeClass, + ), + ( + TimestamptzType(), + TimeTypeClass, + ), + (TimeType(), TimeTypeClass), + ( + UUIDType(), + StringTypeClass, + ), + ], +) +def test_iceberg_struct_to_schema_field( + iceberg_type: PrimitiveType, expected_schema_field_type: Any +) -> None: + """ + Test converting a struct typed Iceberg field to a RecordType SchemaField. + """ + field1 = NestedField(11, "field1", iceberg_type, True, "field documentation") + struct_column = NestedField( + 1, "structField", StructType(field1), True, "struct documentation" + ) + iceberg_source_instance = with_iceberg_source() + schema = Schema(struct_column) + schema_fields = iceberg_source_instance._get_schema_fields_for_schema(schema) + assert len(schema_fields) == 2, f"Expected 2 fields, but got {len(schema_fields)}" + assert_field( + schema_fields[0], struct_column.doc, struct_column.optional, RecordTypeClass + ) + assert_field( + schema_fields[1], field1.doc, field1.optional, expected_schema_field_type ) - def test_iceberg_profiler_value_render( - value_type: IcebergType, value: Any, expected_value: Optional[str] - ) -> None: - iceberg_profiler_instance = with_iceberg_profiler() - assert ( - iceberg_profiler_instance._render_value("a.dataset", value_type, value) - == expected_value - ) - def test_avro_decimal_bytes_nullable() -> None: - """ - The following test exposes a problem with decimal (bytes) not preserving extra attributes like _nullable. Decimal (fixed) and Boolean for example do. - NOTE: This bug was by-passed by mapping the Decimal type to fixed instead of bytes. - """ - import avro.schema - - decimal_avro_schema_string = """{"type": "record", "name": "__struct_", "fields": [{"type": {"type": "bytes", "precision": 3, "scale": 2, "logicalType": "decimal", "native_data_type": "decimal(3, 2)", "_nullable": false}, "name": "required_field", "doc": "required field documentation"}]}""" - decimal_avro_schema = avro.schema.parse(decimal_avro_schema_string) - print("\nDecimal (bytes)") - print( - f"Original avro schema string: {decimal_avro_schema_string}" - ) - print( - f"After avro parsing, _nullable attribute is missing: {decimal_avro_schema}" - ) - decimal_fixed_avro_schema_string = """{"type": "record", "name": "__struct_", "fields": [{"type": {"type": "fixed", "logicalType": "decimal", "precision": 3, "scale": 2, "native_data_type": "decimal(3, 2)", "_nullable": false, "name": "bogusName", "size": 16}, "name": "required_field", "doc": "required field documentation"}]}""" - decimal_fixed_avro_schema = avro.schema.parse(decimal_fixed_avro_schema_string) - print("\nDecimal (fixed)") - print( - f"Original avro schema string: {decimal_fixed_avro_schema_string}" - ) - print( - f"After avro parsing, _nullable attribute is preserved: {decimal_fixed_avro_schema}" - ) +@pytest.mark.parametrize( + "value_type, value, expected_value", + [ + (BinaryType(), bytes([1, 2, 3, 4, 5]), "b'\\x01\\x02\\x03\\x04\\x05'"), + (BooleanType(), True, "True"), + (DateType(), 19543, "2023-07-05"), + (DecimalType(3, 2), Decimal((0, (3, 1, 4), -2)), "3.14"), + (DoubleType(), 3.4, "3.4"), + (FixedType(4), bytes([1, 2, 3, 4]), "b'\\x01\\x02\\x03\\x04'"), + (FloatType(), 3.4, "3.4"), + (IntegerType(), 3, "3"), + (LongType(), 4294967295000, "4294967295000"), + (StringType(), "a string", "a string"), + ( + TimestampType(), + 1688559488157000, + "2023-07-05T12:18:08.157000", + ), + ( + TimestamptzType(), + 1688559488157000, + "2023-07-05T12:18:08.157000+00:00", + ), + (TimeType(), 40400000000, "11:13:20"), + ( + UUIDType(), + uuid.UUID("00010203-0405-0607-0809-0a0b0c0d0e0f"), + "00010203-0405-0607-0809-0a0b0c0d0e0f", + ), + ], +) +def test_iceberg_profiler_value_render( + value_type: IcebergType, value: Any, expected_value: Optional[str] +) -> None: + iceberg_profiler_instance = with_iceberg_profiler() + assert ( + iceberg_profiler_instance._render_value("a.dataset", value_type, value) + == expected_value + ) - boolean_avro_schema_string = """{"type": "record", "name": "__struct_", "fields": [{"type": {"type": "boolean", "native_data_type": "boolean", "_nullable": false}, "name": "required_field", "doc": "required field documentation"}]}""" - boolean_avro_schema = avro.schema.parse(boolean_avro_schema_string) - print("\nBoolean") - print( - f"Original avro schema string: {boolean_avro_schema_string}" - ) - print( - f"After avro parsing, _nullable attribute is preserved: {boolean_avro_schema}" - ) + +def test_avro_decimal_bytes_nullable() -> None: + """ + The following test exposes a problem with decimal (bytes) not preserving extra attributes like _nullable. Decimal (fixed) and Boolean for example do. + NOTE: This bug was by-passed by mapping the Decimal type to fixed instead of bytes. + """ + import avro.schema + + decimal_avro_schema_string = """{"type": "record", "name": "__struct_", "fields": [{"type": {"type": "bytes", "precision": 3, "scale": 2, "logicalType": "decimal", "native_data_type": "decimal(3, 2)", "_nullable": false}, "name": "required_field", "doc": "required field documentation"}]}""" + decimal_avro_schema = avro.schema.parse(decimal_avro_schema_string) + print("\nDecimal (bytes)") + print( + f"Original avro schema string: {decimal_avro_schema_string}" + ) + print(f"After avro parsing, _nullable attribute is missing: {decimal_avro_schema}") + + decimal_fixed_avro_schema_string = """{"type": "record", "name": "__struct_", "fields": [{"type": {"type": "fixed", "logicalType": "decimal", "precision": 3, "scale": 2, "native_data_type": "decimal(3, 2)", "_nullable": false, "name": "bogusName", "size": 16}, "name": "required_field", "doc": "required field documentation"}]}""" + decimal_fixed_avro_schema = avro.schema.parse(decimal_fixed_avro_schema_string) + print("\nDecimal (fixed)") + print( + f"Original avro schema string: {decimal_fixed_avro_schema_string}" + ) + print( + f"After avro parsing, _nullable attribute is preserved: {decimal_fixed_avro_schema}" + ) + + boolean_avro_schema_string = """{"type": "record", "name": "__struct_", "fields": [{"type": {"type": "boolean", "native_data_type": "boolean", "_nullable": false}, "name": "required_field", "doc": "required field documentation"}]}""" + boolean_avro_schema = avro.schema.parse(boolean_avro_schema_string) + print("\nBoolean") + print( + f"Original avro schema string: {boolean_avro_schema_string}" + ) + print( + f"After avro parsing, _nullable attribute is preserved: {boolean_avro_schema}" + ) diff --git a/metadata-ingestion/tests/unit/test_mlflow_source.py b/metadata-ingestion/tests/unit/test_mlflow_source.py index 97b5afd3d6a4ef..374816055b2164 100644 --- a/metadata-ingestion/tests/unit/test_mlflow_source.py +++ b/metadata-ingestion/tests/unit/test_mlflow_source.py @@ -1,133 +1,140 @@ -import sys +import datetime +from pathlib import Path +from typing import Any, TypeVar, Union + +import pytest +from mlflow import MlflowClient +from mlflow.entities.model_registry import RegisteredModel +from mlflow.entities.model_registry.model_version import ModelVersion +from mlflow.store.entities import PagedList + +from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.source.mlflow import MLflowConfig, MLflowSource + +T = TypeVar("T") + + +@pytest.fixture +def tracking_uri(tmp_path: Path) -> str: + return str(tmp_path / "mlruns") + + +@pytest.fixture +def source(tracking_uri: str) -> MLflowSource: + return MLflowSource( + ctx=PipelineContext(run_id="mlflow-source-test"), + config=MLflowConfig(tracking_uri=tracking_uri), + ) + + +@pytest.fixture +def registered_model(source: MLflowSource) -> RegisteredModel: + model_name = "abc" + return RegisteredModel(name=model_name) + + +@pytest.fixture +def model_version( + source: MLflowSource, + registered_model: RegisteredModel, +) -> ModelVersion: + version = "1" + return ModelVersion( + name=registered_model.name, + version=version, + creation_timestamp=datetime.datetime.now(), + ) + + +def dummy_search_func(page_token: Union[None, str], **kwargs: Any) -> PagedList[T]: + dummy_pages = dict( + page_1=PagedList(items=["a", "b"], token="page_2"), + page_2=PagedList(items=["c", "d"], token="page_3"), + page_3=PagedList(items=["e"], token=None), + ) + if page_token is None: + page_to_return = dummy_pages["page_1"] + else: + page_to_return = dummy_pages[page_token] + if kwargs.get("case", "") == "upper": + page_to_return = PagedList( + items=[e.upper() for e in page_to_return.to_list()], + token=page_to_return.token, + ) + return page_to_return -if sys.version_info >= (3, 8): - import datetime - from pathlib import Path - from typing import Any, TypeVar, Union - import pytest - from mlflow import MlflowClient - from mlflow.entities.model_registry import RegisteredModel - from mlflow.entities.model_registry.model_version import ModelVersion - from mlflow.store.entities import PagedList +def test_stages(source): + mlflow_registered_model_stages = { + "Production", + "Staging", + "Archived", + None, + } + workunits = source._get_tags_workunits() + names = [wu.get_metadata()["metadata"].aspect.name for wu in workunits] - from datahub.ingestion.api.common import PipelineContext - from datahub.ingestion.source.mlflow import MLflowConfig, MLflowSource + assert len(names) == len(mlflow_registered_model_stages) + assert set(names) == { + "mlflow_" + str(stage).lower() for stage in mlflow_registered_model_stages + } - T = TypeVar("T") - @pytest.fixture - def tracking_uri(tmp_path: Path) -> str: - return str(tmp_path / "mlruns") +def test_config_model_name_separator(source, model_version): + name_version_sep = "+" + source.config.model_name_separator = name_version_sep + expected_model_name = ( + f"{model_version.name}{name_version_sep}{model_version.version}" + ) + expected_urn = f"urn:li:mlModel:(urn:li:dataPlatform:mlflow,{expected_model_name},{source.config.env})" - @pytest.fixture - def source(tracking_uri: str) -> MLflowSource: - return MLflowSource( - ctx=PipelineContext(run_id="mlflow-source-test"), - config=MLflowConfig(tracking_uri=tracking_uri), - ) + urn = source._make_ml_model_urn(model_version) - @pytest.fixture - def registered_model(source: MLflowSource) -> RegisteredModel: - model_name = "abc" - return RegisteredModel(name=model_name) - - @pytest.fixture - def model_version( - source: MLflowSource, - registered_model: RegisteredModel, - ) -> ModelVersion: - version = "1" - return ModelVersion( - name=registered_model.name, - version=version, - creation_timestamp=datetime.datetime.now(), - ) + assert urn == expected_urn - def dummy_search_func(page_token: Union[None, str], **kwargs: Any) -> PagedList[T]: - dummy_pages = dict( - page_1=PagedList(items=["a", "b"], token="page_2"), - page_2=PagedList(items=["c", "d"], token="page_3"), - page_3=PagedList(items=["e"], token=None), - ) - if page_token is None: - page_to_return = dummy_pages["page_1"] - else: - page_to_return = dummy_pages[page_token] - if kwargs.get("case", "") == "upper": - page_to_return = PagedList( - items=[e.upper() for e in page_to_return.to_list()], - token=page_to_return.token, - ) - return page_to_return - - def test_stages(source): - mlflow_registered_model_stages = { - "Production", - "Staging", - "Archived", - None, - } - workunits = source._get_tags_workunits() - names = [wu.get_metadata()["metadata"].aspect.name for wu in workunits] - - assert len(names) == len(mlflow_registered_model_stages) - assert set(names) == { - "mlflow_" + str(stage).lower() for stage in mlflow_registered_model_stages - } - - def test_config_model_name_separator(source, model_version): - name_version_sep = "+" - source.config.model_name_separator = name_version_sep - expected_model_name = ( - f"{model_version.name}{name_version_sep}{model_version.version}" - ) - expected_urn = f"urn:li:mlModel:(urn:li:dataPlatform:mlflow,{expected_model_name},{source.config.env})" - urn = source._make_ml_model_urn(model_version) +def test_model_without_run(source, registered_model, model_version): + run = source._get_mlflow_run(model_version) + wu = source._get_ml_model_properties_workunit( + registered_model=registered_model, + model_version=model_version, + run=run, + ) + aspect = wu.get_metadata()["metadata"].aspect - assert urn == expected_urn + assert aspect.hyperParams is None + assert aspect.trainingMetrics is None - def test_model_without_run(source, registered_model, model_version): - run = source._get_mlflow_run(model_version) - wu = source._get_ml_model_properties_workunit( - registered_model=registered_model, - model_version=model_version, - run=run, - ) - aspect = wu.get_metadata()["metadata"].aspect - assert aspect.hyperParams is None - assert aspect.trainingMetrics is None +def test_traverse_mlflow_search_func(source): + expected_items = ["a", "b", "c", "d", "e"] - def test_traverse_mlflow_search_func(source): - expected_items = ["a", "b", "c", "d", "e"] + items = list(source._traverse_mlflow_search_func(dummy_search_func)) - items = list(source._traverse_mlflow_search_func(dummy_search_func)) + assert items == expected_items - assert items == expected_items - def test_traverse_mlflow_search_func_with_kwargs(source): - expected_items = ["A", "B", "C", "D", "E"] +def test_traverse_mlflow_search_func_with_kwargs(source): + expected_items = ["A", "B", "C", "D", "E"] + + items = list(source._traverse_mlflow_search_func(dummy_search_func, case="upper")) + + assert items == expected_items - items = list( - source._traverse_mlflow_search_func(dummy_search_func, case="upper") - ) - assert items == expected_items +def test_make_external_link_local(source, model_version): + expected_url = None - def test_make_external_link_local(source, model_version): - expected_url = None + url = source._make_external_url(model_version) - url = source._make_external_url(model_version) + assert url == expected_url - assert url == expected_url - def test_make_external_link_remote(source, model_version): - tracking_uri_remote = "https://dummy-mlflow-tracking-server.org" - source.client = MlflowClient(tracking_uri=tracking_uri_remote) - expected_url = f"{tracking_uri_remote}/#/models/{model_version.name}/versions/{model_version.version}" +def test_make_external_link_remote(source, model_version): + tracking_uri_remote = "https://dummy-mlflow-tracking-server.org" + source.client = MlflowClient(tracking_uri=tracking_uri_remote) + expected_url = f"{tracking_uri_remote}/#/models/{model_version.name}/versions/{model_version.version}" - url = source._make_external_url(model_version) + url = source._make_external_url(model_version) - assert url == expected_url + assert url == expected_url From f378fb6c8066027fae671cb63a4ec3db60dd9744 Mon Sep 17 00:00:00 2001 From: Hyejin Yoon <0327jane@gmail.com> Date: Tue, 30 Jan 2024 04:33:17 +0900 Subject: [PATCH 14/14] docs: Add slack survey page (#9590) Co-authored-by: Harshal Sheth --- docs-website/docusaurus.config.js | 2 +- docs-website/src/pages/slack/index.js | 48 +++++++++++++++++++ .../src/pages/slack/slacksurvey.module.scss | 0 3 files changed, 49 insertions(+), 1 deletion(-) create mode 100644 docs-website/src/pages/slack/index.js create mode 100644 docs-website/src/pages/slack/slacksurvey.module.scss diff --git a/docs-website/docusaurus.config.js b/docs-website/docusaurus.config.js index 22edf749acaede..6138f33244d037 100644 --- a/docs-website/docusaurus.config.js +++ b/docs-website/docusaurus.config.js @@ -62,7 +62,7 @@ module.exports = { position: "right", items: [ { - href: "https://slack.datahubproject.io", + to: "/slack", label: "Join Slack", }, { diff --git a/docs-website/src/pages/slack/index.js b/docs-website/src/pages/slack/index.js new file mode 100644 index 00000000000000..c85a1eefe55450 --- /dev/null +++ b/docs-website/src/pages/slack/index.js @@ -0,0 +1,48 @@ +import React, { useEffect } from 'react'; +import Layout from '@theme/Layout'; +import useDocusaurusContext from '@docusaurus/useDocusaurusContext'; + +function SlackSurvey() { + const { siteConfig = {} } = useDocusaurusContext(); + + useEffect(() => { + const script = document.createElement('script'); + script.src = "//js.hsforms.net/forms/embed/v2.js"; + script.async = true; + script.type = 'text/javascript'; + document.body.appendChild(script); + + script.onload = () => { + if (window.hbspt) { + window.hbspt.forms.create({ + region: "na1", + portalId: "14552909", + formId: "91357965-a8dc-4e20-875e-5f87e6b9defb", + target: '#hubspotForm' // Targeting the div with the specific ID + }); + } + }; + + return () => { + document.body.removeChild(script); + }; + }, []); + + return ( + +
+
+
+

Join the DataHub Slack Community!

+
We will send the link to join our Slack community to your email.
+
+
+
+
+
+ ); +} + +export default SlackSurvey; diff --git a/docs-website/src/pages/slack/slacksurvey.module.scss b/docs-website/src/pages/slack/slacksurvey.module.scss new file mode 100644 index 00000000000000..e69de29bb2d1d6