From b3a76cb30814b4026f7835dece29c9165e3c330c Mon Sep 17 00:00:00 2001 From: Tamas Nemeth Date: Mon, 16 Dec 2024 20:29:13 +0100 Subject: [PATCH 01/21] feat(ingest/airflow): Add way to disable Airflow plugin without a restart (#12098) --- docs/lineage/airflow.md | 31 +++++++++++++++++++ .../datahub_listener.py | 23 ++++++++++++++ 2 files changed, 54 insertions(+) diff --git a/docs/lineage/airflow.md b/docs/lineage/airflow.md index 72b5cbf57592d3..345213a0672d37 100644 --- a/docs/lineage/airflow.md +++ b/docs/lineage/airflow.md @@ -339,6 +339,37 @@ TypeError: on_task_instance_success() missing 3 required positional arguments: ' The solution is to upgrade `acryl-datahub-airflow-plugin>=0.12.0.4` or upgrade `pluggy>=1.2.0`. See this [PR](https://github.com/datahub-project/datahub/pull/9365) for details. +### Disabling the DataHub Plugin v2 + +There are two ways to disable the DataHub Plugin v2: + +#### 1. Disable via Configuration + +Set the `datahub.enabled` configuration property to `False` in the `airflow.cfg` file and restart the Airflow environment to reload the configuration and disable the plugin. + +```ini title="airflow.cfg" +[datahub] +enabled = False +``` + +#### 2. Disable via Airflow Variable (Kill-Switch) + +If a restart is not possible and you need a faster way to disable the plugin, you can use the kill-switch. Create and set the `datahub_airflow_plugin_disable_listener` Airflow variable to `true`. This ensures that the listener won't process anything. + +#### Command Line + +```shell +airflow variables set datahub_airflow_plugin_disable_listener true +``` + +#### Airflow UI + +1. Go to Admin -> Variables. +2. Click the "+" symbol to create a new variable. +3. Set the key to `datahub_airflow_plugin_disable_listener` and the value to `true`. + +This will immediately disable the plugin without requiring a restart. + ## Compatibility We no longer officially support Airflow <2.3. However, you can use older versions of `acryl-datahub-airflow-plugin` with older versions of Airflow. diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/datahub_listener.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/datahub_listener.py index aa7b3108f64f1e..640991a90a1d28 100644 --- a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/datahub_listener.py +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/datahub_listener.py @@ -9,6 +9,7 @@ import airflow import datahub.emitter.mce_builder as builder +from airflow.models import Variable from airflow.models.serialized_dag import SerializedDagModel from datahub.api.entities.datajob import DataJob from datahub.api.entities.dataprocess.dataprocess_instance import InstanceRunResult @@ -78,6 +79,8 @@ def hookimpl(f: _F) -> _F: # type: ignore[misc] # noqa: F811 ) _DATAHUB_CLEANUP_DAG = "Datahub_Cleanup" +KILL_SWITCH_VARIABLE_NAME = "datahub_airflow_plugin_disable_listener" + def get_airflow_plugin_listener() -> Optional["DataHubListener"]: # Using globals instead of functools.lru_cache to make testing easier. 
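For reference, the kill-switch this patch reads is an ordinary Airflow Variable, so besides the CLI and UI paths shown in the docs above it can also be toggled from Python. A minimal sketch, assuming a reachable Airflow metadata database; the `set_datahub_kill_switch` helper is illustrative and not part of this patch:

```python
from airflow.models import Variable

KILL_SWITCH_VARIABLE_NAME = "datahub_airflow_plugin_disable_listener"


def set_datahub_kill_switch(disabled: bool) -> None:
    # The listener lowercases the variable and compares it against "true",
    # so any other value (including "false") re-enables the plugin.
    Variable.set(KILL_SWITCH_VARIABLE_NAME, "true" if disabled else "false")
```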
@@ -364,6 +367,12 @@ def _extract_lineage( redact_with_exclusions(v) ) + def check_kill_switch(self): + if Variable.get(KILL_SWITCH_VARIABLE_NAME, "false").lower() == "true": + logger.debug("DataHub listener disabled by kill switch") + return True + return False + @hookimpl @run_in_thread def on_task_instance_running( @@ -372,6 +381,8 @@ def on_task_instance_running( task_instance: "TaskInstance", session: "Session", # This will always be QUEUED ) -> None: + if self.check_kill_switch(): + return self._set_log_level() # This if statement mirrors the logic in https://github.com/OpenLineage/OpenLineage/pull/508. @@ -454,6 +465,9 @@ def on_task_instance_running( f"DataHub listener finished processing notification about task instance start for {task_instance.task_id}" ) + self.materialize_iolets(datajob) + + def materialize_iolets(self, datajob: DataJob) -> None: if self.config.materialize_iolets: for outlet in datajob.outlets: reported_time: int = int(time.time() * 1000) @@ -541,6 +555,9 @@ def on_task_instance_finish( def on_task_instance_success( self, previous_state: None, task_instance: "TaskInstance", session: "Session" ) -> None: + if self.check_kill_switch(): + return + self._set_log_level() logger.debug( @@ -556,6 +573,9 @@ def on_task_instance_success( def on_task_instance_failed( self, previous_state: None, task_instance: "TaskInstance", session: "Session" ) -> None: + if self.check_kill_switch(): + return + self._set_log_level() logger.debug( @@ -696,6 +716,9 @@ def on_dag_start(self, dag_run: "DagRun") -> None: @hookimpl @run_in_thread def on_dag_run_running(self, dag_run: "DagRun", msg: str) -> None: + if self.check_kill_switch(): + return + self._set_log_level() logger.debug( From c97fd1f8c01c170bdb7aaf176f5bf44f3b3ed4c4 Mon Sep 17 00:00:00 2001 From: sid-acryl <155424659+sid-acryl@users.noreply.github.com> Date: Tue, 17 Dec 2024 01:02:05 +0530 Subject: [PATCH 02/21] fix(ingest/tableau): honor the key projectNameWithin in pagination (#12107) --- .../datahub/ingestion/source/tableau/tableau.py | 1 - .../ingestion/source/tableau/tableau_common.py | 17 ++++++++++++----- .../integration/tableau/test_tableau_ingest.py | 2 ++ .../tests/unit/test_tableau_source.py | 10 ++++++++-- 4 files changed, 22 insertions(+), 8 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py index 6844b8a425a7b6..2940b1f47dd56b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py @@ -1290,7 +1290,6 @@ def get_connection_objects( page_size = page_size_override or self.config.page_size filter_pages = get_filter_pages(query_filter, page_size) - for filter_page in filter_pages: has_next_page = 1 current_cursor: Optional[str] = None diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_common.py b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_common.py index c5d14e0afe15a5..61b56c4bee5bda 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_common.py @@ -975,15 +975,22 @@ def get_filter_pages(query_filter: dict, page_size: int) -> List[dict]: # a few ten thousand, then tableau server responds with empty response # causing below error: # tableauserverclient.server.endpoint.exceptions.NonXMLResponseError: b'' + + # in practice, we only do pagination if 
len(query_filter.keys()) == 1 + if len(query_filter.keys()) != 1: + return filter_pages + + current_key = (list(query_filter.keys()))[0] + if ( - len(query_filter.keys()) == 1 - and query_filter.get(c.ID_WITH_IN) - and isinstance(query_filter[c.ID_WITH_IN], list) + current_key in [c.ID_WITH_IN, c.PROJECT_NAME_WITH_IN] + and query_filter.get(current_key) + and isinstance(query_filter[current_key], list) ): - ids = query_filter[c.ID_WITH_IN] + ids = query_filter[current_key] filter_pages = [ { - c.ID_WITH_IN: ids[ + current_key: ids[ start : ( start + page_size if start + page_size < len(ids) else len(ids) ) diff --git a/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py b/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py index 5b557efdab0bb0..4f7b371c187f0d 100644 --- a/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py +++ b/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py @@ -61,6 +61,7 @@ "projects": ["default", "Project 2", "Samples"], "extract_project_hierarchy": False, "page_size": 1000, + "workbook_page_size": 1000, "ingest_tags": True, "ingest_owner": True, "ingest_tables_external": True, @@ -674,6 +675,7 @@ def test_tableau_ingest_with_platform_instance( "platform_instance": "acryl_site1", "projects": ["default", "Project 2"], "page_size": 1000, + "workbook_page_size": 1000, "ingest_tags": True, "ingest_owner": True, "ingest_tables_external": True, diff --git a/metadata-ingestion/tests/unit/test_tableau_source.py b/metadata-ingestion/tests/unit/test_tableau_source.py index c81aa0bd8a1b1a..44e59decaecbd7 100644 --- a/metadata-ingestion/tests/unit/test_tableau_source.py +++ b/metadata-ingestion/tests/unit/test_tableau_source.py @@ -182,8 +182,14 @@ def test_get_filter_pages_simple(): assert get_filter_pages(filter_dict, 10) == [filter_dict] -def test_get_filter_pages_non_id_large_filter_passthrough(): - projects = [f"project{i}" for i in range(20000)] +def test_get_filter_pages_non_id_large_filter(): + projects = [f"project{i}" for i in range(10)] + filter_dict = {c.PROJECT_NAME_WITH_IN: projects} + assert get_filter_pages(filter_dict, 10) == [filter_dict] + + +def test_get_filter_pages_for_single_key(): + projects = ["project1"] filter_dict = {c.PROJECT_NAME_WITH_IN: projects} assert get_filter_pages(filter_dict, 10) == [filter_dict] From 0ea2e36226c03ebbe10387c18e65b9732803b0cf Mon Sep 17 00:00:00 2001 From: Tamas Nemeth Date: Mon, 16 Dec 2024 20:38:24 +0100 Subject: [PATCH 03/21] fix(ingest/datahub): Use server side cursor instead of local one (#12129) --- .../source/datahub/datahub_database_reader.py | 62 ++++++++++++------- 1 file changed, 41 insertions(+), 21 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_database_reader.py b/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_database_reader.py index faa281097de4cd..80906ca63115f5 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_database_reader.py +++ b/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_database_reader.py @@ -147,6 +147,47 @@ def query(self) -> str: version """ + def execute_server_cursor( + self, query: str, params: Dict[str, Any] + ) -> Iterable[Dict[str, Any]]: + with self.engine.connect() as conn: + if self.engine.dialect.name == "postgresql": + with conn.begin(): # Transaction required for PostgreSQL server-side cursor + conn = conn.execution_options( + stream_results=True, + yield_per=self.config.database_query_batch_size, + ) + result = 
conn.execute(query, params) + for row in result: + yield dict(row) + elif self.engine.dialect.name == "mysql": # MySQL + import MySQLdb + + with contextlib.closing( + conn.connection.cursor(MySQLdb.cursors.SSCursor) + ) as cursor: + logger.debug(f"Using Cursor type: {cursor.__class__.__name__}") + cursor.execute(query, params) + + columns = [desc[0] for desc in cursor.description] + while True: + rows = cursor.fetchmany(self.config.database_query_batch_size) + if not rows: + break # Use break instead of return in generator + for row in rows: + yield dict(zip(columns, row)) + else: + raise ValueError(f"Unsupported dialect: {self.engine.dialect.name}") + + def _get_rows( + self, from_createdon: datetime, stop_time: datetime + ) -> Iterable[Dict[str, Any]]: + params = { + "exclude_aspects": list(self.config.exclude_aspects), + "since_createdon": from_createdon.strftime(DATETIME_FORMAT), + } + yield from self.execute_server_cursor(self.query, params) + def get_aspects( self, from_createdon: datetime, stop_time: datetime ) -> Iterable[Tuple[MetadataChangeProposalWrapper, datetime]]: @@ -159,27 +200,6 @@ def get_aspects( if mcp: yield mcp, row["createdon"] - def _get_rows( - self, from_createdon: datetime, stop_time: datetime - ) -> Iterable[Dict[str, Any]]: - with self.engine.connect() as conn: - with contextlib.closing(conn.connection.cursor()) as cursor: - cursor.execute( - self.query, - { - "exclude_aspects": list(self.config.exclude_aspects), - "since_createdon": from_createdon.strftime(DATETIME_FORMAT), - }, - ) - - columns = [desc[0] for desc in cursor.description] - while True: - rows = cursor.fetchmany(self.config.database_query_batch_size) - if not rows: - return - for row in rows: - yield dict(zip(columns, row)) - def get_soft_deleted_rows(self) -> Iterable[Dict[str, Any]]: """ Fetches all soft-deleted entities from the database. From 74927969aa403e6ca77834d44309a2d49f1fc986 Mon Sep 17 00:00:00 2001 From: sid-acryl <155424659+sid-acryl@users.noreply.github.com> Date: Tue, 17 Dec 2024 01:25:58 +0530 Subject: [PATCH 04/21] feat(ingest/tableau): verify role assignment to user in `test_connection` (#12042) Co-authored-by: Harshal Sheth --- .../ingestion/source/tableau/tableau.py | 44 +++++- .../source/tableau/tableau_constant.py | 2 + .../source/tableau/tableau_server_wrapper.py | 33 ++++ .../source/tableau/tableau_validation.py | 48 ++++++ .../tableau/test_tableau_ingest.py | 147 ++++++++++++------ 5 files changed, 227 insertions(+), 47 deletions(-) create mode 100644 metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_server_wrapper.py create mode 100644 metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_validation.py diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py index 2940b1f47dd56b..6cc2220d90fd93 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py @@ -111,6 +111,8 @@ tableau_field_to_schema_field, workbook_graphql_query, ) +from datahub.ingestion.source.tableau.tableau_server_wrapper import UserInfo +from datahub.ingestion.source.tableau.tableau_validation import check_user_role from datahub.metadata.com.linkedin.pegasus2avro.common import ( AuditStamp, ChangeAuditStamps, @@ -167,7 +169,7 @@ try: # On earlier versions of the tableauserverclient, the NonXMLResponseError - # was thrown when reauthentication was needed. 
We'll keep both exceptions + # was thrown when reauthentication was necessary. We'll keep both exceptions # around for now, but can remove this in the future. from tableauserverclient.server.endpoint.exceptions import ( # type: ignore NotSignedInError, @@ -632,6 +634,33 @@ class TableauSourceReport(StaleEntityRemovalSourceReport): num_upstream_table_lineage_failed_parse_sql: int = 0 num_upstream_fine_grained_lineage_failed_parse_sql: int = 0 num_hidden_assets_skipped: int = 0 + logged_in_user: List[UserInfo] = [] + + +def report_user_role(report: TableauSourceReport, server: Server) -> None: + title: str = "Insufficient Permissions" + message: str = "The user must have the `Site Administrator Explorer` role to perform metadata ingestion." + try: + # TableauSiteSource instance is per site, so each time we need to find-out user detail + # the site-role might be different on another site + logged_in_user: UserInfo = UserInfo.from_server(server=server) + + if not logged_in_user.is_site_administrator_explorer(): + report.warning( + title=title, + message=message, + context=f"user-name={logged_in_user.user_name}, role={logged_in_user.site_role}, site_id={logged_in_user.site_id}", + ) + + report.logged_in_user.append(logged_in_user) + + except Exception as e: + report.warning( + title=title, + message="Failed to verify the user's role. The user must have `Site Administrator Explorer` role.", + context=f"{e}", + exc=e, + ) @platform_name("Tableau") @@ -676,6 +705,7 @@ def _authenticate(self, site_content_url: str) -> None: try: logger.info(f"Authenticated to Tableau site: '{site_content_url}'") self.server = self.config.make_tableau_client(site_content_url) + report_user_role(report=self.report, server=self.server) # Note that we're not catching ConfigurationError, since we want that to throw. except ValueError as e: self.report.failure( @@ -689,9 +719,17 @@ def test_connection(config_dict: dict) -> TestConnectionReport: test_report = TestConnectionReport() try: source_config = TableauConfig.parse_obj_allow_extras(config_dict) - source_config.make_tableau_client(source_config.site) + + server = source_config.make_tableau_client(source_config.site) + test_report.basic_connectivity = CapabilityReport(capable=True) + + test_report.capability_report = check_user_role( + logged_in_user=UserInfo.from_server(server=server) + ) + except Exception as e: + logger.warning(f"{e}", exc_info=e) test_report.basic_connectivity = CapabilityReport( capable=False, failure_reason=str(e) ) @@ -831,6 +869,8 @@ def __init__( # when emitting custom SQL data sources. 
self.custom_sql_ids_being_used: List[str] = [] + report_user_role(report=report, server=server) + @property def no_env_browse_prefix(self) -> str: # Prefix to use with browse path (v1) diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_constant.py b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_constant.py index d1dd0d92819991..ea0878143ef354 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_constant.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_constant.py @@ -81,3 +81,5 @@ PROJECT = "Project" SITE = "Site" IS_UNSUPPORTED_CUSTOM_SQL = "isUnsupportedCustomSql" +SITE_PERMISSION = "sitePermission" +SITE_ROLE = "SiteAdministratorExplorer" diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_server_wrapper.py b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_server_wrapper.py new file mode 100644 index 00000000000000..f309622d12b91b --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_server_wrapper.py @@ -0,0 +1,33 @@ +from dataclasses import dataclass + +from tableauserverclient import Server, UserItem + +from datahub.ingestion.source.tableau import tableau_constant as c + + +@dataclass +class UserInfo: + user_name: str + site_role: str + site_id: str + + def is_site_administrator_explorer(self): + return self.site_role == c.SITE_ROLE + + @staticmethod + def from_server(server: Server) -> "UserInfo": + assert server.user_id, "make the connection with tableau" + + user: UserItem = server.users.get_by_id(server.user_id) + + assert user.site_role, "site_role is not available" # to silent the lint + + assert user.name, "user name is not available" # to silent the lint + + assert server.site_id, "site identifier is not available" # to silent the lint + + return UserInfo( + user_name=user.name, + site_role=user.site_role, + site_id=server.site_id, + ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_validation.py b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_validation.py new file mode 100644 index 00000000000000..4a703faf6091b3 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_validation.py @@ -0,0 +1,48 @@ +import logging +from typing import Dict, Union + +from datahub.ingestion.api.source import CapabilityReport, SourceCapability +from datahub.ingestion.source.tableau import tableau_constant as c +from datahub.ingestion.source.tableau.tableau_server_wrapper import UserInfo + +logger = logging.getLogger(__name__) + + +def check_user_role( + logged_in_user: UserInfo, +) -> Dict[Union[SourceCapability, str], CapabilityReport]: + capability_dict: Dict[Union[SourceCapability, str], CapabilityReport] = { + c.SITE_PERMISSION: CapabilityReport( + capable=True, + ) + } + + failure_reason: str = ( + "The user does not have the `Site Administrator Explorer` role." + ) + + mitigation_message_prefix: str = ( + "Assign `Site Administrator Explorer` role to the user" + ) + mitigation_message_suffix: str = "Refer to the setup guide: https://datahubproject.io/docs/quick-ingestion-guides/tableau/setup" + + try: + # TODO: Add check for `Enable Derived Permissions` + if not logged_in_user.is_site_administrator_explorer(): + capability_dict[c.SITE_PERMISSION] = CapabilityReport( + capable=False, + failure_reason=f"{failure_reason} Their current role is {logged_in_user.site_role}.", + mitigation_message=f"{mitigation_message_prefix} `{logged_in_user.user_name}`. 
{mitigation_message_suffix}", + ) + + return capability_dict + + except Exception as e: + logger.warning(msg=e, exc_info=e) + capability_dict[c.SITE_PERMISSION] = CapabilityReport( + capable=False, + failure_reason="Failed to verify user role.", + mitigation_message=f"{mitigation_message_prefix}. {mitigation_message_suffix}", # user is unknown + ) + + return capability_dict diff --git a/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py b/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py index 4f7b371c187f0d..4b2ac96931b950 100644 --- a/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py +++ b/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py @@ -7,6 +7,7 @@ import pytest from freezegun import freeze_time +from pydantic import ValidationError from requests.adapters import ConnectionError from tableauserverclient import PermissionsRule, Server from tableauserverclient.models import ( @@ -21,7 +22,9 @@ from datahub.emitter.mce_builder import DEFAULT_ENV, make_schema_field_urn from datahub.emitter.mcp import MetadataChangeProposalWrapper -from datahub.ingestion.run.pipeline import Pipeline, PipelineContext, PipelineInitError +from datahub.ingestion.api.source import TestConnectionReport +from datahub.ingestion.run.pipeline import Pipeline, PipelineContext +from datahub.ingestion.source.tableau import tableau_constant as c from datahub.ingestion.source.tableau.tableau import ( TableauConfig, TableauSiteSource, @@ -572,52 +575,28 @@ def test_extract_all_project(pytestconfig, tmp_path, mock_datahub_graph): def test_value_error_projects_and_project_pattern( pytestconfig, tmp_path, mock_datahub_graph ): - # Ingestion should raise ValueError - output_file_name: str = "tableau_project_pattern_precedence_mces.json" - golden_file_name: str = "tableau_project_pattern_precedence_mces_golden.json" - new_config = config_source_default.copy() new_config["projects"] = ["default"] new_config["project_pattern"] = {"allow": ["^Samples$"]} with pytest.raises( - PipelineInitError, + ValidationError, match=r".*projects is deprecated. Please use project_path_pattern only.*", ): - tableau_ingest_common( - pytestconfig, - tmp_path, - mock_data(), - golden_file_name, - output_file_name, - mock_datahub_graph, - pipeline_config=new_config, - ) + TableauConfig.parse_obj(new_config) def test_project_pattern_deprecation(pytestconfig, tmp_path, mock_datahub_graph): - # Ingestion should raise ValueError - output_file_name: str = "tableau_project_pattern_deprecation_mces.json" - golden_file_name: str = "tableau_project_pattern_deprecation_mces_golden.json" - new_config = config_source_default.copy() del new_config["projects"] new_config["project_pattern"] = {"allow": ["^Samples$"]} new_config["project_path_pattern"] = {"allow": ["^Samples$"]} with pytest.raises( - PipelineInitError, + ValidationError, match=r".*project_pattern is deprecated. 
Please use project_path_pattern only*", ): - tableau_ingest_common( - pytestconfig, - tmp_path, - mock_data(), - golden_file_name, - output_file_name, - mock_datahub_graph, - pipeline_config=new_config, - ) + TableauConfig.parse_obj(new_config) def test_project_path_pattern_allow(pytestconfig, tmp_path, mock_datahub_graph): @@ -1298,31 +1277,21 @@ def test_hidden_asset_tags(pytestconfig, tmp_path, mock_datahub_graph): @pytest.mark.integration def test_hidden_assets_without_ingest_tags(pytestconfig, tmp_path, mock_datahub_graph): enable_logging() - output_file_name: str = "tableau_hidden_asset_tags_error_mces.json" - golden_file_name: str = "tableau_hidden_asset_tags_error_mces_golden.json" new_config = config_source_default.copy() new_config["tags_for_hidden_assets"] = ["hidden", "private"] new_config["ingest_tags"] = False with pytest.raises( - PipelineInitError, + ValidationError, match=r".*tags_for_hidden_assets is only allowed with ingest_tags enabled.*", ): - tableau_ingest_common( - pytestconfig, - tmp_path, - mock_data(), - golden_file_name, - output_file_name, - mock_datahub_graph, - pipeline_config=new_config, - ) + TableauConfig.parse_obj(new_config) @freeze_time(FROZEN_TIME) @pytest.mark.integration -def test_permission_mode_switched_error(pytestconfig, tmp_path, mock_datahub_graph): +def test_permission_warning(pytestconfig, tmp_path, mock_datahub_graph): with mock.patch( "datahub.ingestion.source.state_provider.datahub_ingestion_checkpointing_provider.DataHubGraph", mock_datahub_graph, @@ -1359,11 +1328,99 @@ def test_permission_mode_switched_error(pytestconfig, tmp_path, mock_datahub_gra warnings = list(reporter.warnings) - assert len(warnings) == 1 + assert len(warnings) == 2 + + assert warnings[0].title == "Insufficient Permissions" - assert warnings[0].title == "Derived Permission Error" + assert warnings[1].title == "Derived Permission Error" - assert warnings[0].message == ( + assert warnings[1].message == ( "Turn on your derived permissions. 
See for details " "https://community.tableau.com/s/question/0D54T00000QnjHbSAJ/how-to-fix-the-permissionsmodeswitched-error" ) + + +@freeze_time(FROZEN_TIME) +@pytest.mark.integration +def test_connection_report_test(requests_mock): + server_info_response = """ + + + foo + 2.4 + + + + """ + + requests_mock.register_uri( + "GET", + "https://do-not-connect/api/2.4/serverInfo", + text=server_info_response, + status_code=200, + headers={"Content-Type": "application/xml"}, + ) + + signin_response = """ + + + + + + + """ + + requests_mock.register_uri( + "POST", + "https://do-not-connect/api/2.4/auth/signin", + text=signin_response, + status_code=200, + headers={"Content-Type": "application/xml"}, + ) + + user_by_id_response = """ + + + + """ + + requests_mock.register_uri( + "GET", + "https://do-not-connect/api/2.4/sites/fake_site_luid/users/fake_user_id", + text=user_by_id_response, + status_code=200, + headers={"Content-Type": "application/xml"}, + ) + + report: TestConnectionReport = TableauSource.test_connection(config_source_default) + + assert report + assert report.capability_report + assert report.capability_report.get(c.SITE_PERMISSION) + assert report.capability_report[c.SITE_PERMISSION].capable + + # Role other than SiteAdministratorExplorer + user_by_id_response = """ + + + + """ + + requests_mock.register_uri( + "GET", + "https://do-not-connect/api/2.4/sites/fake_site_luid/users/fake_user_id", + text=user_by_id_response, + status_code=200, + headers={"Content-Type": "application/xml"}, + ) + + report = TableauSource.test_connection(config_source_default) + + assert report + assert report.capability_report + assert report.capability_report.get(c.SITE_PERMISSION) + assert report.capability_report[c.SITE_PERMISSION].capable is False + assert ( + report.capability_report[c.SITE_PERMISSION].failure_reason + == "The user does not have the `Site Administrator Explorer` role. Their current role is Explorer." 
+ ) From ca6f435d03cfc44cc18c104435d70cd7781fc0f1 Mon Sep 17 00:00:00 2001 From: kousiknandy Date: Mon, 16 Dec 2024 20:08:21 +0000 Subject: [PATCH 05/21] docs(ingest): fix sink recipe to correct config parameter (#12132) --- metadata-ingestion/sink_docs/metadata-file.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metadata-ingestion/sink_docs/metadata-file.md b/metadata-ingestion/sink_docs/metadata-file.md index 49ca3c75397af4..36c868828070ed 100644 --- a/metadata-ingestion/sink_docs/metadata-file.md +++ b/metadata-ingestion/sink_docs/metadata-file.md @@ -25,7 +25,7 @@ source: sink: type: file config: - path: ./path/to/mce/file.json + filename: ./path/to/mce/file.json ``` ## Config details From d5e379a94d861b5b129176b5a41f4b0d609ec35e Mon Sep 17 00:00:00 2001 From: Chris Collins Date: Mon, 16 Dec 2024 15:30:25 -0500 Subject: [PATCH 06/21] feat(ui) Add finishing touches to the structured props feature (#12111) --- .../SchemaFieldPropertiesEntity.tsx | 6 ++---- .../components/styled/DeprecationPill.tsx | 2 -- ...ngInput.tsx => MultipleOpenEndedInput.tsx} | 13 +++++++----- .../styled/StructuredProperty/NumberInput.tsx | 20 ++++++++++++++++++- .../styled/StructuredProperty/StringInput.tsx | 13 +++++++++--- .../StructuredPropertyInput.tsx | 6 +++++- .../profile/header/EntityHeader.tsx | 1 + .../containers/profile/header/EntityName.tsx | 2 -- .../AllowedValuesDrawer.tsx | 1 + .../DisplayPreferences.tsx | 3 ++- .../StructuredPropsDrawer.tsx | 1 + .../govern/structuredProperties/cacheUtils.ts | 1 - .../src/app/preview/DefaultPreviewCard.tsx | 2 +- datahub-web-react/src/graphql/search.graphql | 1 + .../authorization/PoliciesConfig.java | 1 + 15 files changed, 52 insertions(+), 21 deletions(-) rename datahub-web-react/src/app/entity/shared/components/styled/StructuredProperty/{MultipleStringInput.tsx => MultipleOpenEndedInput.tsx} (87%) diff --git a/datahub-web-react/src/app/entity/schemaField/SchemaFieldPropertiesEntity.tsx b/datahub-web-react/src/app/entity/schemaField/SchemaFieldPropertiesEntity.tsx index 2c59c476195d0b..fdc0e33d77a057 100644 --- a/datahub-web-react/src/app/entity/schemaField/SchemaFieldPropertiesEntity.tsx +++ b/datahub-web-react/src/app/entity/schemaField/SchemaFieldPropertiesEntity.tsx @@ -35,11 +35,9 @@ export class SchemaFieldPropertiesEntity implements Entity { // Currently unused. getPathName = () => 'schemaField'; - // Currently unused. - getEntityName = () => 'schemaField'; + getEntityName = () => 'Column'; - // Currently unused. - getCollectionName = () => 'schemaFields'; + getCollectionName = () => 'Columns'; // Currently unused. 
renderProfile = (_: string) => <>; diff --git a/datahub-web-react/src/app/entity/shared/components/styled/DeprecationPill.tsx b/datahub-web-react/src/app/entity/shared/components/styled/DeprecationPill.tsx index 08e9636f760de5..613264709ac23c 100644 --- a/datahub-web-react/src/app/entity/shared/components/styled/DeprecationPill.tsx +++ b/datahub-web-react/src/app/entity/shared/components/styled/DeprecationPill.tsx @@ -19,8 +19,6 @@ const DeprecatedContainer = styled.div` justify-content: center; align-items: center; color: #cd0d24; - margin-left: 0px; - margin-right: 8px; padding-top: 8px; padding-bottom: 8px; padding-right: 4px; diff --git a/datahub-web-react/src/app/entity/shared/components/styled/StructuredProperty/MultipleStringInput.tsx b/datahub-web-react/src/app/entity/shared/components/styled/StructuredProperty/MultipleOpenEndedInput.tsx similarity index 87% rename from datahub-web-react/src/app/entity/shared/components/styled/StructuredProperty/MultipleStringInput.tsx rename to datahub-web-react/src/app/entity/shared/components/styled/StructuredProperty/MultipleOpenEndedInput.tsx index fe6c0bbb99ce22..fe6cd1115419ae 100644 --- a/datahub-web-react/src/app/entity/shared/components/styled/StructuredProperty/MultipleStringInput.tsx +++ b/datahub-web-react/src/app/entity/shared/components/styled/StructuredProperty/MultipleOpenEndedInput.tsx @@ -4,6 +4,8 @@ import React from 'react'; import styled from 'styled-components'; import { ANTD_GRAY_V2 } from '../../../constants'; +const MultiStringWrapper = styled.div``; + const StyledInput = styled(Input)` width: 75%; min-width: 350px; @@ -29,10 +31,11 @@ const DeleteButton = styled(Button)` interface Props { selectedValues: any[]; + inputType?: string; updateSelectedValues: (values: any[]) => void; } -export default function MultipleStringInput({ selectedValues, updateSelectedValues }: Props) { +export default function MultipleOpenEndedInput({ selectedValues, updateSelectedValues, inputType = 'text' }: Props) { function updateInput(text: string, index: number) { const updatedValues = selectedValues.length > 0 ? selectedValues.map((value, i) => (i === index ? text : value)) : [text]; @@ -53,14 +56,14 @@ export default function MultipleStringInput({ selectedValues, updateSelectedValu } return ( -
+ {selectedValues.length > 1 && selectedValues.map((selectedValue, index) => { const key = `${index}`; return ( updateInput(e.target.value, index)} /> @@ -70,7 +73,7 @@ export default function MultipleStringInput({ selectedValues, updateSelectedValu })} {selectedValues.length <= 1 && ( updateInput(e.target.value, 0)} /> @@ -78,6 +81,6 @@ export default function MultipleStringInput({ selectedValues, updateSelectedValu + Add More -
+ ); } diff --git a/datahub-web-react/src/app/entity/shared/components/styled/StructuredProperty/NumberInput.tsx b/datahub-web-react/src/app/entity/shared/components/styled/StructuredProperty/NumberInput.tsx index c56d85db7ef712..f4cedc4cf80ee5 100644 --- a/datahub-web-react/src/app/entity/shared/components/styled/StructuredProperty/NumberInput.tsx +++ b/datahub-web-react/src/app/entity/shared/components/styled/StructuredProperty/NumberInput.tsx @@ -1,7 +1,9 @@ import { Input } from 'antd'; import React, { ChangeEvent } from 'react'; import styled from 'styled-components'; +import { PropertyCardinality } from '@src/types.generated'; import { ANTD_GRAY_V2 } from '../../../constants'; +import MultipleOpenEndedInput from './MultipleOpenEndedInput'; const StyledInput = styled(Input)` border: 1px solid ${ANTD_GRAY_V2[6]}; @@ -10,15 +12,31 @@ const StyledInput = styled(Input)` interface Props { selectedValues: any[]; + cardinality?: PropertyCardinality | null; updateSelectedValues: (values: string[] | number[]) => void; } -export default function NumberInput({ selectedValues, updateSelectedValues }: Props) { +export default function NumberInput({ selectedValues, cardinality, updateSelectedValues }: Props) { function updateInput(event: ChangeEvent) { const number = Number(event.target.value); updateSelectedValues([number]); } + function updateMultipleValues(values: string[] | number[]) { + const numbers = values.map((v) => Number(v)); + updateSelectedValues(numbers); + } + + if (cardinality === PropertyCardinality.Multiple) { + return ( + + ); + } + return ( ; + return ; } - return ; + return ( + + ); } diff --git a/datahub-web-react/src/app/entity/shared/components/styled/StructuredProperty/StructuredPropertyInput.tsx b/datahub-web-react/src/app/entity/shared/components/styled/StructuredProperty/StructuredPropertyInput.tsx index 894a304335b0f6..305347ee0bce80 100644 --- a/datahub-web-react/src/app/entity/shared/components/styled/StructuredProperty/StructuredPropertyInput.tsx +++ b/datahub-web-react/src/app/entity/shared/components/styled/StructuredProperty/StructuredPropertyInput.tsx @@ -60,7 +60,11 @@ export default function StructuredPropertyInput({ )} {!allowedValues && valueType.info.type === StdDataType.Number && ( - + )} {!allowedValues && valueType.info.type === StdDataType.Urn && ( scrollToBottom(), 0); }} color="violet" + type="button" > Add diff --git a/datahub-web-react/src/app/govern/structuredProperties/DisplayPreferences.tsx b/datahub-web-react/src/app/govern/structuredProperties/DisplayPreferences.tsx index 260c91ef93207c..95823de0f27c40 100644 --- a/datahub-web-react/src/app/govern/structuredProperties/DisplayPreferences.tsx +++ b/datahub-web-react/src/app/govern/structuredProperties/DisplayPreferences.tsx @@ -153,7 +153,8 @@ const DisplayPreferences = ({ clickable={false} />  is already being shown on asset previews, but only one property is allowed at a time. - Do you want to replace the current property? This will hide PropVal on all asset previews. + Do you want to replace the current property? This will hide {getDisplayName(badgeProperty)}{' '} + on all asset previews.

} /> diff --git a/datahub-web-react/src/app/govern/structuredProperties/StructuredPropsDrawer.tsx b/datahub-web-react/src/app/govern/structuredProperties/StructuredPropsDrawer.tsx index 4b2bbaaf96826b..debffeac7d583c 100644 --- a/datahub-web-react/src/app/govern/structuredProperties/StructuredPropsDrawer.tsx +++ b/datahub-web-react/src/app/govern/structuredProperties/StructuredPropsDrawer.tsx @@ -192,6 +192,7 @@ const StructuredPropsDrawer = ({ form.validateFields().then(() => { const createInput = { ...form.getFieldsValue(), + qualifiedName: form.getFieldValue('qualifiedName') || undefined, valueType: valueTypes.find((type) => type.value === form.getFieldValue('valueType'))?.urn, allowedValues, cardinality, diff --git a/datahub-web-react/src/app/govern/structuredProperties/cacheUtils.ts b/datahub-web-react/src/app/govern/structuredProperties/cacheUtils.ts index 590189d06e6b16..c8052784c6972a 100644 --- a/datahub-web-react/src/app/govern/structuredProperties/cacheUtils.ts +++ b/datahub-web-react/src/app/govern/structuredProperties/cacheUtils.ts @@ -17,7 +17,6 @@ const addToCache = (existingProperties, newProperty) => { allowedValues: newProperty.definition.allowedValues, created: newProperty.definition.created, lastModified: newProperty.definition.lastModified, - filterStatus: newProperty.definition.filterStatus, }, settings: { isHidden: newProperty.settings.isHidden, diff --git a/datahub-web-react/src/app/preview/DefaultPreviewCard.tsx b/datahub-web-react/src/app/preview/DefaultPreviewCard.tsx index 4c8948a6664e07..a19862e83ae510 100644 --- a/datahub-web-react/src/app/preview/DefaultPreviewCard.tsx +++ b/datahub-web-react/src/app/preview/DefaultPreviewCard.tsx @@ -68,6 +68,7 @@ const TitleContainer = styled.div` const EntityTitleContainer = styled.div` display: flex; align-items: center; + gap: 8px; `; const EntityTitle = styled(Typography.Text)<{ $titleSizePx?: number }>` @@ -77,7 +78,6 @@ const EntityTitle = styled(Typography.Text)<{ $titleSizePx?: number }>` } &&& { - margin-right 8px; font-size: ${(props) => props.$titleSizePx || 16}px; font-weight: 600; vertical-align: middle; diff --git a/datahub-web-react/src/graphql/search.graphql b/datahub-web-react/src/graphql/search.graphql index ce0fde27f4c425..58c9a51f3d7e90 100644 --- a/datahub-web-react/src/graphql/search.graphql +++ b/datahub-web-react/src/graphql/search.graphql @@ -963,6 +963,7 @@ fragment facetFields on FacetMetadata { entity { urn type + ...entityDisplayNameFields ... 
on Tag { name properties { diff --git a/metadata-utils/src/main/java/com/linkedin/metadata/authorization/PoliciesConfig.java b/metadata-utils/src/main/java/com/linkedin/metadata/authorization/PoliciesConfig.java index d701c8fc8be035..80a11ab98bbf4a 100644 --- a/metadata-utils/src/main/java/com/linkedin/metadata/authorization/PoliciesConfig.java +++ b/metadata-utils/src/main/java/com/linkedin/metadata/authorization/PoliciesConfig.java @@ -219,6 +219,7 @@ public class PoliciesConfig { MANAGE_BUSINESS_ATTRIBUTE_PRIVILEGE, MANAGE_CONNECTIONS_PRIVILEGE, MANAGE_STRUCTURED_PROPERTIES_PRIVILEGE, + VIEW_STRUCTURED_PROPERTIES_PAGE_PRIVILEGE, MANAGE_DOCUMENTATION_FORMS_PRIVILEGE, MANAGE_FEATURES_PRIVILEGE, MANAGE_SYSTEM_OPERATIONS_PRIVILEGE); From 6b8d21a2ab49ef01f1bc0096df5b6db42b835bfa Mon Sep 17 00:00:00 2001 From: Andrew Sikowitz Date: Mon, 16 Dec 2024 12:50:25 -0800 Subject: [PATCH 07/21] feat(ingest/sqlite): Support sqlite < 3.24.0 (#12137) --- .../utilities/file_backed_collections.py | 37 ++++++++++++++++++- .../utilities/test_file_backed_collections.py | 21 ++++++++--- 2 files changed, 51 insertions(+), 7 deletions(-) diff --git a/metadata-ingestion/src/datahub/utilities/file_backed_collections.py b/metadata-ingestion/src/datahub/utilities/file_backed_collections.py index b0f5022446de15..b8c27666d7f538 100644 --- a/metadata-ingestion/src/datahub/utilities/file_backed_collections.py +++ b/metadata-ingestion/src/datahub/utilities/file_backed_collections.py @@ -1,6 +1,7 @@ import collections import gzip import logging +import os import pathlib import pickle import shutil @@ -33,6 +34,14 @@ logger: logging.Logger = logging.getLogger(__name__) +OVERRIDE_SQLITE_VERSION_REQUIREMENT_STR = ( + os.environ.get("OVERRIDE_SQLITE_VERSION_REQ") or "" +) +OVERRIDE_SQLITE_VERSION_REQUIREMENT = ( + OVERRIDE_SQLITE_VERSION_REQUIREMENT_STR + and OVERRIDE_SQLITE_VERSION_REQUIREMENT_STR.lower() != "false" +) + _DEFAULT_FILE_NAME = "sqlite.db" _DEFAULT_TABLE_NAME = "data" @@ -212,6 +221,7 @@ class FileBackedDict(MutableMapping[str, _VT], Closeable, Generic[_VT]): _active_object_cache: OrderedDict[str, Tuple[_VT, bool]] = field( init=False, repr=False ) + _use_sqlite_on_conflict: bool = field(repr=False, default=True) def __post_init__(self) -> None: assert ( @@ -232,7 +242,10 @@ def __post_init__(self) -> None: # We use the ON CONFLICT clause to implement UPSERTs with sqlite. # This was added in 3.24.0 from 2018-06-04. # See https://www.sqlite.org/lang_conflict.html - raise RuntimeError("SQLite version 3.24.0 or later is required") + if OVERRIDE_SQLITE_VERSION_REQUIREMENT: + self.use_sqlite_on_conflict = False + else: + raise RuntimeError("SQLite version 3.24.0 or later is required") # We keep a small cache in memory to avoid having to serialize/deserialize # data from the database too often. We use an OrderedDict to build @@ -295,7 +308,7 @@ def _prune_cache(self, num_items_to_prune: int) -> None: values.append(column_serializer(value)) items_to_write.append(tuple(values)) - if items_to_write: + if items_to_write and self._use_sqlite_on_conflict: # Tricky: By using a INSERT INTO ... ON CONFLICT (key) structure, we can # ensure that the rowid remains the same if a value is updated but is # autoincremented when rows are inserted. 
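The two write paths guarded by `_use_sqlite_on_conflict` above and below are equivalent upserts; a standalone sketch of both against plain `sqlite3`, using an illustrative `data` table rather than the dict's real schema:

```python
import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE data (key TEXT PRIMARY KEY, value TEXT)")

# Fast path: native upsert, requires SQLite >= 3.24.0 (2018-06-04).
conn.execute(
    "INSERT INTO data (key, value) VALUES (?, ?) "
    "ON CONFLICT (key) DO UPDATE SET value = excluded.value",
    ("a", "1"),
)

# Fallback path for older SQLite: attempt the insert, and switch to an
# UPDATE when the primary key already exists.
try:
    conn.execute("INSERT INTO data (key, value) VALUES (?, ?)", ("a", "2"))
except sqlite3.IntegrityError:
    conn.execute("UPDATE data SET value = ? WHERE key = ?", ("2", "a"))
```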
@@ -312,6 +325,26 @@ def _prune_cache(self, num_items_to_prune: int) -> None: """, items_to_write, ) + else: + for item in items_to_write: + try: + self._conn.execute( + f"""INSERT INTO {self.tablename} ( + key, + value + {''.join(f', {column_name}' for column_name in self.extra_columns.keys())} + ) + VALUES ({', '.join(['?'] *(2 + len(self.extra_columns)))})""", + item, + ) + except sqlite3.IntegrityError: + self._conn.execute( + f"""UPDATE {self.tablename} SET + value = ? + {''.join(f', {column_name} = ?' for column_name in self.extra_columns.keys())} + WHERE key = ?""", + (*item[1:], item[0]), + ) def flush(self) -> None: self._prune_cache(len(self._active_object_cache)) diff --git a/metadata-ingestion/tests/unit/utilities/test_file_backed_collections.py b/metadata-ingestion/tests/unit/utilities/test_file_backed_collections.py index f4062f9a911453..6230c2e37edc6a 100644 --- a/metadata-ingestion/tests/unit/utilities/test_file_backed_collections.py +++ b/metadata-ingestion/tests/unit/utilities/test_file_backed_collections.py @@ -15,11 +15,13 @@ ) -def test_file_dict() -> None: +@pytest.mark.parametrize("use_sqlite_on_conflict", [True, False]) +def test_file_dict(use_sqlite_on_conflict: bool) -> None: cache = FileBackedDict[int]( tablename="cache", cache_max_size=10, cache_eviction_batch_size=10, + _use_sqlite_on_conflict=use_sqlite_on_conflict, ) for i in range(100): @@ -92,7 +94,8 @@ def test_file_dict() -> None: cache["a"] = 1 -def test_custom_serde() -> None: +@pytest.mark.parametrize("use_sqlite_on_conflict", [True, False]) +def test_custom_serde(use_sqlite_on_conflict: bool) -> None: @dataclass(frozen=True) class Label: a: str @@ -139,6 +142,7 @@ def deserialize(s: str) -> Main: deserializer=deserialize, # Disable the in-memory cache to force all reads/writes to the DB. cache_max_size=0, + _use_sqlite_on_conflict=use_sqlite_on_conflict, ) first = Main(3, {Label("one", 1): 0.1, Label("two", 2): 0.2}) second = Main(-100, {Label("z", 26): 0.26}) @@ -186,7 +190,8 @@ def test_file_dict_stores_counter() -> None: assert in_memory_counters[i].most_common(2) == cache[str(i)].most_common(2) -def test_file_dict_ordering() -> None: +@pytest.mark.parametrize("use_sqlite_on_conflict", [True, False]) +def test_file_dict_ordering(use_sqlite_on_conflict: bool) -> None: """ We require that FileBackedDict maintains insertion order, similar to Python's built-in dict. This test makes one of each and validates that they behave the same. 
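In miniature, the ordering property that docstring describes looks like the following sketch, distilled from the test body rather than copied from it:

```python
from datahub.utilities.file_backed_collections import FileBackedDict

cache = FileBackedDict[int](serializer=str, deserializer=int, cache_max_size=1)
data: dict = {}

# Insert in a deliberately non-sorted order; both mappings must iterate
# back in insertion order.
for i in [3, 1, 2]:
    cache[str(i)] = i
    data[str(i)] = i

assert list(cache.items()) == list(data.items())
```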
@@ -196,6 +201,7 @@ def test_file_dict_ordering() -> None: serializer=str, deserializer=int, cache_max_size=1, + _use_sqlite_on_conflict=use_sqlite_on_conflict, ) data = {} @@ -229,12 +235,14 @@ class Pair: @pytest.mark.parametrize("cache_max_size", [0, 1, 10]) -def test_custom_column(cache_max_size: int) -> None: +@pytest.mark.parametrize("use_sqlite_on_conflict", [True, False]) +def test_custom_column(cache_max_size: int, use_sqlite_on_conflict: bool) -> None: cache = FileBackedDict[Pair]( extra_columns={ "x": lambda m: m.x, }, cache_max_size=cache_max_size, + _use_sqlite_on_conflict=use_sqlite_on_conflict, ) cache["first"] = Pair(3, "a") @@ -275,7 +283,8 @@ def test_custom_column(cache_max_size: int) -> None: ] -def test_shared_connection() -> None: +@pytest.mark.parametrize("use_sqlite_on_conflict", [True, False]) +def test_shared_connection(use_sqlite_on_conflict: bool) -> None: with ConnectionWrapper() as connection: cache1 = FileBackedDict[int]( shared_connection=connection, @@ -283,6 +292,7 @@ def test_shared_connection() -> None: extra_columns={ "v": lambda v: v, }, + _use_sqlite_on_conflict=use_sqlite_on_conflict, ) cache2 = FileBackedDict[Pair]( shared_connection=connection, @@ -291,6 +301,7 @@ def test_shared_connection() -> None: "x": lambda m: m.x, "y": lambda m: m.y, }, + _use_sqlite_on_conflict=use_sqlite_on_conflict, ) cache1["a"] = 3 From d0b4f7a7d3d4df062d684fec6017dbced8c2f708 Mon Sep 17 00:00:00 2001 From: kevinkarchacryl Date: Mon, 16 Dec 2024 16:03:11 -0500 Subject: [PATCH 08/21] feat(cli): added cli option for ingestion source (#11980) --- docs/cli.md | 13 +++ docs/how/delete-metadata.md | 10 +- .../src/datahub/cli/ingest_cli.py | 110 ++++++++++++++++++ 3 files changed, 131 insertions(+), 2 deletions(-) diff --git a/docs/cli.md b/docs/cli.md index c633b7f4a38ad3..1c38077d0d12ef 100644 --- a/docs/cli.md +++ b/docs/cli.md @@ -115,6 +115,19 @@ datahub ingest -c ./examples/recipes/example_to_datahub_rest.dhub.yaml --dry-run datahub ingest -c ./examples/recipes/example_to_datahub_rest.dhub.yaml -n ``` +#### ingest --list-source-runs + +The `--list-source-runs` option of the `ingest` command lists the previous runs, displaying their run ID, source name, +start time, status, and source URN. This command allows you to filter results using the --urn option for URN-based +filtering or the --source option to filter by source name (partial or complete matches are supported). + +```shell +# List all ingestion runs +datahub ingest --list-source-runs +# Filter runs by a source name containing "demo" +datahub ingest --list-source-runs --source "demo" +``` + #### ingest --preview The `--preview` option of the `ingest` command performs all of the ingestion steps, but limits the processing to only the first 10 workunits produced by the source. diff --git a/docs/how/delete-metadata.md b/docs/how/delete-metadata.md index f720a66ce57652..e36940bf398356 100644 --- a/docs/how/delete-metadata.md +++ b/docs/how/delete-metadata.md @@ -4,7 +4,7 @@ To follow this guide, you'll need the [DataHub CLI](../cli.md). ::: -There are a two ways to delete metadata from DataHub: +There are two ways to delete metadata from DataHub: 1. Delete metadata attached to entities by providing a specific urn or filters that identify a set of urns (delete CLI). 2. Delete metadata created by a single ingestion run (rollback). @@ -233,7 +233,13 @@ To view the ids of the most recent set of ingestion batches, execute datahub ingest list-runs ``` -That will print out a table of all the runs. 
Once you have an idea of which run you want to roll back, run +That will print out a table of all the runs. To see run statuses or to filter runs by URN/source run + +```shell +datahub ingest list-source-runs +``` + +Once you have an idea of which run you want to roll back, run ```shell datahub ingest show --run-id diff --git a/metadata-ingestion/src/datahub/cli/ingest_cli.py b/metadata-ingestion/src/datahub/cli/ingest_cli.py index 51f095751f7dd9..fcab07a1c2aaf6 100644 --- a/metadata-ingestion/src/datahub/cli/ingest_cli.py +++ b/metadata-ingestion/src/datahub/cli/ingest_cli.py @@ -27,6 +27,7 @@ logger = logging.getLogger(__name__) +INGEST_SRC_TABLE_COLUMNS = ["runId", "source", "startTime", "status", "URN"] RUNS_TABLE_COLUMNS = ["runId", "rows", "created at"] RUN_TABLE_COLUMNS = ["urn", "aspect name", "created at"] @@ -437,6 +438,115 @@ def mcps(path: str) -> None: sys.exit(ret) +@ingest.command() +@click.argument("page_offset", type=int, default=0) +@click.argument("page_size", type=int, default=100) +@click.option("--urn", type=str, default=None, help="Filter by ingestion source URN.") +@click.option( + "--source", type=str, default=None, help="Filter by ingestion source name." +) +@upgrade.check_upgrade +@telemetry.with_telemetry() +def list_source_runs(page_offset: int, page_size: int, urn: str, source: str) -> None: + """List ingestion source runs with their details, optionally filtered by URN or source.""" + + query = """ + query listIngestionRuns($input: ListIngestionSourcesInput!) { + listIngestionSources(input: $input) { + ingestionSources { + urn + name + executions { + executionRequests { + id + result { + startTimeMs + status + } + } + } + } + } + } + """ + + # filter by urn and/or source using CONTAINS + filters = [] + if urn: + filters.append({"field": "urn", "values": [urn], "condition": "CONTAIN"}) + if source: + filters.append({"field": "name", "values": [source], "condition": "CONTAIN"}) + + variables = { + "input": { + "start": page_offset, + "count": page_size, + "filters": filters, + } + } + + client = get_default_graph() + session = client._session + gms_host = client.config.server + + url = f"{gms_host}/api/graphql" + try: + response = session.post(url, json={"query": query, "variables": variables}) + response.raise_for_status() + except Exception as e: + click.echo(f"Error fetching data: {str(e)}") + return + + try: + data = response.json() + except ValueError: + click.echo("Failed to parse JSON response from server.") + return + + if not data: + click.echo("No response received from the server.") + return + + # when urn or source filter does not match, exit gracefully + if ( + not isinstance(data.get("data"), dict) + or "listIngestionSources" not in data["data"] + ): + click.echo("No matching ingestion sources found. 
Please check your filters.") + return + + ingestion_sources = data["data"]["listIngestionSources"]["ingestionSources"] + if not ingestion_sources: + click.echo("No ingestion sources or executions found.") + return + + rows = [] + for ingestion_source in ingestion_sources: + urn = ingestion_source.get("urn", "N/A") + name = ingestion_source.get("name", "N/A") + + executions = ingestion_source.get("executions", {}).get("executionRequests", []) + for execution in executions: + execution_id = execution.get("id", "N/A") + start_time = execution.get("result", {}).get("startTimeMs", "N/A") + start_time = ( + datetime.fromtimestamp(start_time / 1000).strftime("%Y-%m-%d %H:%M:%S") + if start_time != "N/A" + else "N/A" + ) + status = execution.get("result", {}).get("status", "N/A") + + rows.append([execution_id, name, start_time, status, urn]) + + click.echo( + tabulate( + rows, + headers=INGEST_SRC_TABLE_COLUMNS, + tablefmt="grid", + ) + ) + + @ingest.command() @click.argument("page_offset", type=int, default=0) @click.argument("page_size", type=int, default=100) From 67cdbb079a617261a04611f13b5aa35802aed016 Mon Sep 17 00:00:00 2001 From: Tamas Nemeth Date: Tue, 17 Dec 2024 09:36:14 +0100 Subject: [PATCH 09/21] fix(patch): Add Finegrained Lineage patch support for DatajobInputOutput (#4749) (#12146) --- .../DataJobInputOutputPatchBuilder.java | 100 +++++++ .../builder/UpstreamLineagePatchBuilder.java | 2 +- .../FineGrainedLineageTemplateHelper.java | 282 ++++++++++++++++++ .../aspect/patch/template/TemplateUtil.java | 2 +- .../datajob/DataJobInputOutputTemplate.java | 19 ++ .../dataset/UpstreamLineageTemplate.java | 277 +---------------- .../DataJobInputOutputTemplateTest.java | 255 ++++++++++++++++ .../template/UpstreamLineageTemplateTest.java | 36 +++ 8 files changed, 698 insertions(+), 275 deletions(-) create mode 100644 entity-registry/src/main/java/com/linkedin/metadata/aspect/patch/template/FineGrainedLineageTemplateHelper.java create mode 100644 entity-registry/src/test/java/com/linkedin/metadata/aspect/patch/template/DataJobInputOutputTemplateTest.java diff --git a/entity-registry/src/main/java/com/linkedin/metadata/aspect/patch/builder/DataJobInputOutputPatchBuilder.java b/entity-registry/src/main/java/com/linkedin/metadata/aspect/patch/builder/DataJobInputOutputPatchBuilder.java index 6fffb17521ddb7..14fc92a1bf3c86 100644 --- a/entity-registry/src/main/java/com/linkedin/metadata/aspect/patch/builder/DataJobInputOutputPatchBuilder.java +++ b/entity-registry/src/main/java/com/linkedin/metadata/aspect/patch/builder/DataJobInputOutputPatchBuilder.java @@ -15,6 +15,8 @@ import com.linkedin.metadata.aspect.patch.PatchOperationType; import com.linkedin.metadata.graph.LineageDirection; import javax.annotation.Nonnull; +import javax.annotation.Nullable; +import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.tuple.ImmutableTriple; public class DataJobInputOutputPatchBuilder @@ -24,6 +26,7 @@ public class DataJobInputOutputPatchBuilder private static final String OUTPUT_DATASET_EDGES_PATH_START = "/outputDatasetEdges/"; private static final String INPUT_DATASET_FIELDS_PATH_START = "/inputDatasetFields/"; private static final String OUTPUT_DATASET_FIELDS_PATH_START = "/outputDatasetFields/"; + private static final String FINE_GRAINED_PATH_START = "/fineGrainedLineages/"; // Simplified with just Urn public DataJobInputOutputPatchBuilder addInputDatajobEdge(@Nonnull DataJobUrn dataJobUrn) { @@ -136,6 +139,103 @@ public DataJobInputOutputPatchBuilder addEdge( return this; } + /** 
+ * Adds a field as a fine grained upstream + * + * @param upstreamSchemaField a schema field to be marked as upstream, format: + * urn:li:schemaField(DATASET_URN, COLUMN NAME) + * @param confidenceScore optional, confidence score for the lineage edge. Defaults to 1.0 for + * full confidence + * @param transformationOperation string operation type that describes the transformation + * operation happening in the lineage edge + * @param downstreamSchemaField the downstream schema field this upstream is derived from, format: + * urn:li:schemaField(DATASET_URN, COLUMN NAME) + * @param queryUrn query urn the relationship is derived from + * @return this builder + */ + public DataJobInputOutputPatchBuilder addFineGrainedUpstreamField( + @Nonnull Urn upstreamSchemaField, + @Nullable Float confidenceScore, + @Nonnull String transformationOperation, + @Nonnull Urn downstreamSchemaField, + @Nullable Urn queryUrn) { + Float finalConfidenceScore = getConfidenceScoreOrDefault(confidenceScore); + String finalQueryUrn; + if (queryUrn == null || StringUtils.isBlank(queryUrn.toString())) { + finalQueryUrn = "NONE"; + } else { + finalQueryUrn = queryUrn.toString(); + } + + ObjectNode fineGrainedLineageNode = instance.objectNode(); + fineGrainedLineageNode.put("confidenceScore", instance.numberNode(finalConfidenceScore)); + pathValues.add( + ImmutableTriple.of( + PatchOperationType.ADD.getValue(), + FINE_GRAINED_PATH_START + + transformationOperation + + "/" + + encodeValueUrn(downstreamSchemaField) + + "/" + + finalQueryUrn + + "/" + + encodeValueUrn(upstreamSchemaField), + fineGrainedLineageNode)); + + return this; + } + + private Float getConfidenceScoreOrDefault(@Nullable Float confidenceScore) { + float finalConfidenceScore; + if (confidenceScore != null && confidenceScore > 0 && confidenceScore <= 1.0f) { + finalConfidenceScore = confidenceScore; + } else { + finalConfidenceScore = 1.0f; + } + + return finalConfidenceScore; + } + + /** + * Removes a field as a fine grained upstream + * + * @param upstreamSchemaField a schema field to be marked as upstream, format: + * urn:li:schemaField(DATASET_URN, COLUMN NAME) + * @param transformationOperation string operation type that describes the transformation + * operation happening in the lineage edge + * @param downstreamSchemaField the downstream schema field this upstream is derived from, format: + * urn:li:schemaField(DATASET_URN, COLUMN NAME) + * @param queryUrn query urn the relationship is derived from + * @return this builder + */ + public DataJobInputOutputPatchBuilder removeFineGrainedUpstreamField( + @Nonnull Urn upstreamSchemaField, + @Nonnull String transformationOperation, + @Nonnull Urn downstreamSchemaField, + @Nullable Urn queryUrn) { + + String finalQueryUrn; + if (queryUrn == null || StringUtils.isBlank(queryUrn.toString())) { + finalQueryUrn = "NONE"; + } else { + finalQueryUrn = queryUrn.toString(); + } + pathValues.add( + ImmutableTriple.of( + PatchOperationType.REMOVE.getValue(), + FINE_GRAINED_PATH_START + + transformationOperation + + "/" + + encodeValueUrn(downstreamSchemaField) + + "/" + + finalQueryUrn + + "/" + + encodeValueUrn(upstreamSchemaField), + null)); + + return this; + } + public DataJobInputOutputPatchBuilder removeEdge( @Nonnull Edge edge, @Nonnull LineageDirection direction) { String path = getEdgePath(edge, direction); diff --git a/entity-registry/src/main/java/com/linkedin/metadata/aspect/patch/builder/UpstreamLineagePatchBuilder.java 
b/entity-registry/src/main/java/com/linkedin/metadata/aspect/patch/builder/UpstreamLineagePatchBuilder.java index 08182761aeb03f..d0a46a35d51820 100644 --- a/entity-registry/src/main/java/com/linkedin/metadata/aspect/patch/builder/UpstreamLineagePatchBuilder.java +++ b/entity-registry/src/main/java/com/linkedin/metadata/aspect/patch/builder/UpstreamLineagePatchBuilder.java @@ -142,7 +142,7 @@ public UpstreamLineagePatchBuilder removeFineGrainedUpstreamField( FINE_GRAINED_PATH_START + transformationOperation + "/" - + downstreamSchemaField + + encodeValueUrn(downstreamSchemaField) + "/" + finalQueryUrn + "/" diff --git a/entity-registry/src/main/java/com/linkedin/metadata/aspect/patch/template/FineGrainedLineageTemplateHelper.java b/entity-registry/src/main/java/com/linkedin/metadata/aspect/patch/template/FineGrainedLineageTemplateHelper.java new file mode 100644 index 00000000000000..1f6a58c52ba248 --- /dev/null +++ b/entity-registry/src/main/java/com/linkedin/metadata/aspect/patch/template/FineGrainedLineageTemplateHelper.java @@ -0,0 +1,282 @@ +package com.linkedin.metadata.aspect.patch.template; + +import static com.fasterxml.jackson.databind.node.JsonNodeFactory.*; +import static com.linkedin.metadata.Constants.*; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.node.ArrayNode; +import com.fasterxml.jackson.databind.node.ObjectNode; +import com.google.common.collect.Streams; +import com.linkedin.common.urn.Urn; +import com.linkedin.common.urn.UrnUtils; +import java.util.concurrent.atomic.AtomicReference; +import java.util.stream.Collectors; +import javax.annotation.Nullable; +import org.codehaus.plexus.util.StringUtils; + +public class FineGrainedLineageTemplateHelper { + + private static final String FINE_GRAINED_UPSTREAM_TYPE = "upstreamType"; + private static final String FINE_GRAINED_UPSTREAMS = "upstreams"; + private static final String FINE_GRAINED_DOWNSTREAM_TYPE = "downstreamType"; + private static final String FINE_GRAINED_DOWNSTREAMS = "downstreams"; + private static final String FINE_GRAINED_TRANSFORMATION_OPERATION = "transformOperation"; + private static final String FINE_GRAINED_CONFIDENCE_SCORE = "confidenceScore"; + private static final String FINE_GRAINED_QUERY_ID = "query"; + + // Template support + private static final String NONE_TRANSFORMATION_TYPE = "NONE"; + private static final Float DEFAULT_CONFIDENCE_SCORE = 1.0f; + private static final String DEFAULT_QUERY_ID = "NONE"; + + /** + * Combines fine grained lineage array into a map using upstream and downstream types as keys, + * defaulting when not present. Due to this construction, patches will look like: path: + * /fineGrainedLineages/TRANSFORMATION_OPERATION/DOWNSTREAM_FIELD_URN/QUERY_ID/UPSTREAM_FIELD_URN, + * op: ADD/REMOVE, value: float (confidenceScore) Due to the way FineGrainedLineage was designed + * it doesn't necessarily have a consistent key we can reference, so this specialized method + * mimics the arrayFieldToMap of the super class with the specialization that it does not put the + * full value of the aspect at the end of the key, just the particular array. This prevents + * unintended overwrites through improper MCP construction that is technically allowed by the + * schema when combining under fields that form the natural key. 
+ * + * @param fineGrainedLineages the fine grained lineage array node + * @return the modified {@link JsonNode} with array fields transformed to maps + */ + public static JsonNode combineAndTransformFineGrainedLineages( + @Nullable JsonNode fineGrainedLineages) { + ObjectNode mapNode = instance.objectNode(); + if (!(fineGrainedLineages instanceof ArrayNode) || fineGrainedLineages.isEmpty()) { + return mapNode; + } + JsonNode lineageCopy = fineGrainedLineages.deepCopy(); + + lineageCopy + .elements() + .forEachRemaining( + node -> { + JsonNode nodeClone = node.deepCopy(); + String transformationOperation = + nodeClone.has(FINE_GRAINED_TRANSFORMATION_OPERATION) + ? nodeClone.get(FINE_GRAINED_TRANSFORMATION_OPERATION).asText() + : NONE_TRANSFORMATION_TYPE; + + if (!mapNode.has(transformationOperation)) { + mapNode.set(transformationOperation, instance.objectNode()); + } + ObjectNode transformationOperationNode = + (ObjectNode) mapNode.get(transformationOperation); + + ArrayNode downstreams = + nodeClone.has(FINE_GRAINED_DOWNSTREAMS) + ? (ArrayNode) nodeClone.get(FINE_GRAINED_DOWNSTREAMS) + : null; + + if (downstreams == null || downstreams.size() != 1) { + throw new UnsupportedOperationException( + "Patching not supported on fine grained lineages with not" + + " exactly one downstream. Current fine grained lineage implementation is downstream derived and " + + "patches are keyed on the root of this derivation."); + } + + Float confidenceScore = + nodeClone.has(FINE_GRAINED_CONFIDENCE_SCORE) + ? nodeClone.get(FINE_GRAINED_CONFIDENCE_SCORE).floatValue() + : DEFAULT_CONFIDENCE_SCORE; + + String upstreamType = + nodeClone.has(FINE_GRAINED_UPSTREAM_TYPE) + ? nodeClone.get(FINE_GRAINED_UPSTREAM_TYPE).asText() + : null; + String downstreamType = + nodeClone.has(FINE_GRAINED_DOWNSTREAM_TYPE) + ? nodeClone.get(FINE_GRAINED_DOWNSTREAM_TYPE).asText() + : null; + ArrayNode upstreams = + nodeClone.has(FINE_GRAINED_UPSTREAMS) + ? (ArrayNode) nodeClone.get(FINE_GRAINED_UPSTREAMS) + : null; + + String queryId = + nodeClone.has(FINE_GRAINED_QUERY_ID) + ? nodeClone.get(FINE_GRAINED_QUERY_ID).asText() + : DEFAULT_QUERY_ID; + + if (upstreamType == null) { + // Determine default type + Urn upstreamUrn = + upstreams != null ? 
UrnUtils.getUrn(upstreams.get(0).asText()) : null; + if (upstreamUrn != null + && DATASET_ENTITY_NAME.equals(upstreamUrn.getEntityType())) { + upstreamType = FINE_GRAINED_LINEAGE_DATASET_TYPE; + } else { + upstreamType = FINE_GRAINED_LINEAGE_FIELD_SET_TYPE; + } + } + + if (downstreamType == null) { + // Always use FIELD type, only support patches for single field downstream + downstreamType = FINE_GRAINED_LINEAGE_FIELD_TYPE; + } + + String downstreamRoot = downstreams.get(0).asText(); + if (!transformationOperationNode.has(downstreamRoot)) { + transformationOperationNode.set(downstreamRoot, instance.objectNode()); + } + ObjectNode downstreamRootNode = + (ObjectNode) transformationOperationNode.get(downstreamRoot); + if (!downstreamRootNode.has(queryId)) { + downstreamRootNode.set(queryId, instance.objectNode()); + } + ObjectNode queryNode = (ObjectNode) downstreamRootNode.get(queryId); + if (upstreams != null) { + addUrnsToParent( + queryNode, upstreams, confidenceScore, upstreamType, downstreamType); + } + }); + return mapNode; + } + + private static void addUrnsToParent( + JsonNode parentNode, + ArrayNode urnsList, + Float confidenceScore, + String upstreamType, + String downstreamType) { + // Will overwrite repeat urns with different confidence scores with the most recently seen + ((ObjectNode) parentNode) + .setAll( + Streams.stream(urnsList.elements()) + .map(JsonNode::asText) + .distinct() + .collect( + Collectors.toMap( + urn -> urn, + urn -> + mapToLineageValueNode(confidenceScore, upstreamType, downstreamType)))); + } + + private static JsonNode mapToLineageValueNode( + Float confidenceScore, String upstreamType, String downstreamType) { + ObjectNode objectNode = instance.objectNode(); + objectNode.set(FINE_GRAINED_CONFIDENCE_SCORE, instance.numberNode(confidenceScore)); + objectNode.set(FINE_GRAINED_UPSTREAM_TYPE, instance.textNode(upstreamType)); + objectNode.set(FINE_GRAINED_DOWNSTREAM_TYPE, instance.textNode(downstreamType)); + return objectNode; + } + + /** + * Takes the transformed fine grained lineages map from pre-processing and reconstructs an array + * of FineGrainedLineages Avoids producing side effects by copying nodes, use resulting node and + * not the original + * + * @param transformedFineGrainedLineages the transformed fine grained lineage map + * @return the modified {@link JsonNode} formatted consistent with the original schema + */ + public static ArrayNode reconstructFineGrainedLineages(JsonNode transformedFineGrainedLineages) { + if (transformedFineGrainedLineages instanceof ArrayNode) { + // We already have an ArrayNode, no need to transform. 
This happens during `replace` + // operations + return (ArrayNode) transformedFineGrainedLineages; + } + ObjectNode mapNode = (ObjectNode) transformedFineGrainedLineages; + ArrayNode fineGrainedLineages = instance.arrayNode(); + + mapNode + .fieldNames() + .forEachRemaining( + transformationOperation -> { + final ObjectNode transformationOperationNode = + (ObjectNode) mapNode.get(transformationOperation); + transformationOperationNode + .fieldNames() + .forEachRemaining( + downstreamName -> { + final ObjectNode downstreamNode = + (ObjectNode) transformationOperationNode.get(downstreamName); + downstreamNode + .fieldNames() + .forEachRemaining( + queryId -> + buildFineGrainedLineage( + downstreamName, + downstreamNode, + queryId, + transformationOperation, + fineGrainedLineages)); + }); + }); + + return fineGrainedLineages; + } + + private static void buildFineGrainedLineage( + final String downstreamName, + final ObjectNode downstreamNode, + final String queryId, + final String transformationOperation, + final ArrayNode fineGrainedLineages) { + final ObjectNode fineGrainedLineage = instance.objectNode(); + final ObjectNode queryNode = (ObjectNode) downstreamNode.get(queryId); + if (queryNode.isEmpty()) { + // Short circuit if no upstreams left + return; + } + ArrayNode downstream = instance.arrayNode(); + downstream.add(instance.textNode(downstreamName)); + // Set defaults, if found in sub nodes override, for confidenceScore take lowest + AtomicReference minimumConfidenceScore = new AtomicReference<>(DEFAULT_CONFIDENCE_SCORE); + AtomicReference upstreamType = + new AtomicReference<>(FINE_GRAINED_LINEAGE_FIELD_SET_TYPE); + AtomicReference downstreamType = new AtomicReference<>(FINE_GRAINED_LINEAGE_FIELD_TYPE); + ArrayNode upstreams = instance.arrayNode(); + queryNode + .fieldNames() + .forEachRemaining( + upstream -> + processUpstream( + queryNode, + upstream, + minimumConfidenceScore, + upstreamType, + downstreamType, + upstreams)); + fineGrainedLineage.set(FINE_GRAINED_DOWNSTREAMS, downstream); + fineGrainedLineage.set(FINE_GRAINED_UPSTREAMS, upstreams); + if (StringUtils.isNotBlank(queryId) && !DEFAULT_QUERY_ID.equals(queryId)) { + fineGrainedLineage.set(FINE_GRAINED_QUERY_ID, instance.textNode(queryId)); + } + fineGrainedLineage.set(FINE_GRAINED_UPSTREAM_TYPE, instance.textNode(upstreamType.get())); + fineGrainedLineage.set(FINE_GRAINED_DOWNSTREAM_TYPE, instance.textNode(downstreamType.get())); + fineGrainedLineage.set( + FINE_GRAINED_CONFIDENCE_SCORE, instance.numberNode(minimumConfidenceScore.get())); + fineGrainedLineage.set( + FINE_GRAINED_TRANSFORMATION_OPERATION, instance.textNode(transformationOperation)); + fineGrainedLineages.add(fineGrainedLineage); + } + + private static void processUpstream( + final ObjectNode queryNode, + final String upstream, + final AtomicReference minimumConfidenceScore, + final AtomicReference upstreamType, + final AtomicReference downstreamType, + final ArrayNode upstreams) { + final ObjectNode upstreamNode = (ObjectNode) queryNode.get(upstream); + if (upstreamNode.has(FINE_GRAINED_CONFIDENCE_SCORE)) { + Float scoreValue = upstreamNode.get(FINE_GRAINED_CONFIDENCE_SCORE).floatValue(); + if (scoreValue <= minimumConfidenceScore.get()) { + minimumConfidenceScore.set(scoreValue); + } + } + // Set types to last encountered, should never change, but this at least tries to support + // other types being specified. 
+ if (upstreamNode.has(FINE_GRAINED_UPSTREAM_TYPE)) { + upstreamType.set(upstreamNode.get(FINE_GRAINED_UPSTREAM_TYPE).asText()); + } + if (upstreamNode.has(FINE_GRAINED_DOWNSTREAM_TYPE)) { + downstreamType.set(upstreamNode.get(FINE_GRAINED_DOWNSTREAM_TYPE).asText()); + } + upstreams.add(instance.textNode(upstream)); + } +} diff --git a/entity-registry/src/main/java/com/linkedin/metadata/aspect/patch/template/TemplateUtil.java b/entity-registry/src/main/java/com/linkedin/metadata/aspect/patch/template/TemplateUtil.java index 2423e37e6d5419..23879ad1c2e353 100644 --- a/entity-registry/src/main/java/com/linkedin/metadata/aspect/patch/template/TemplateUtil.java +++ b/entity-registry/src/main/java/com/linkedin/metadata/aspect/patch/template/TemplateUtil.java @@ -84,7 +84,7 @@ public static JsonNode populateTopLevelKeys(JsonNode transformedNode, JsonPatch // Skip first as it will always be blank due to path starting with / for (int i = 1; i < endIdx; i++) { String decodedKey = decodeValue(keys[i]); - if (parent.get(keys[i]) == null) { + if (parent.get(decodedKey) == null) { ((ObjectNode) parent).set(decodedKey, instance.objectNode()); } parent = parent.get(decodedKey); diff --git a/entity-registry/src/main/java/com/linkedin/metadata/aspect/patch/template/datajob/DataJobInputOutputTemplate.java b/entity-registry/src/main/java/com/linkedin/metadata/aspect/patch/template/datajob/DataJobInputOutputTemplate.java index 3d398d97b50c38..ef26eed2f814f8 100644 --- a/entity-registry/src/main/java/com/linkedin/metadata/aspect/patch/template/datajob/DataJobInputOutputTemplate.java +++ b/entity-registry/src/main/java/com/linkedin/metadata/aspect/patch/template/datajob/DataJobInputOutputTemplate.java @@ -1,6 +1,10 @@ package com.linkedin.metadata.aspect.patch.template.datajob; +import static com.fasterxml.jackson.databind.node.JsonNodeFactory.*; +import static com.linkedin.metadata.Constants.*; + import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.node.ObjectNode; import com.linkedin.common.DataJobUrnArray; import com.linkedin.common.DatasetUrnArray; import com.linkedin.common.EdgeArray; @@ -9,6 +13,7 @@ import com.linkedin.datajob.DataJobInputOutput; import com.linkedin.dataset.FineGrainedLineageArray; import com.linkedin.metadata.aspect.patch.template.ArrayMergingTemplate; +import com.linkedin.metadata.aspect.patch.template.FineGrainedLineageTemplateHelper; import java.util.Collections; import javax.annotation.Nonnull; @@ -23,6 +28,8 @@ public class DataJobInputOutputTemplate implements ArrayMergingTemplate { @@ -27,18 +19,6 @@ public class UpstreamLineageTemplate extends CompoundKeyTemplate { - JsonNode nodeClone = node.deepCopy(); - String transformationOperation = - nodeClone.has(FINE_GRAINED_TRANSFORMATION_OPERATION) - ? nodeClone.get(FINE_GRAINED_TRANSFORMATION_OPERATION).asText() - : NONE_TRANSFORMATION_TYPE; - - if (!mapNode.has(transformationOperation)) { - mapNode.set(transformationOperation, instance.objectNode()); - } - ObjectNode transformationOperationNode = - (ObjectNode) mapNode.get(transformationOperation); - - ArrayNode downstreams = - nodeClone.has(FINE_GRAINED_DOWNSTREAMS) - ? (ArrayNode) nodeClone.get(FINE_GRAINED_DOWNSTREAMS) - : null; - - if (downstreams == null || downstreams.size() != 1) { - throw new UnsupportedOperationException( - "Patching not supported on fine grained lineages with not" - + " exactly one downstream. 
Current fine grained lineage implementation is downstream derived and " - + "patches are keyed on the root of this derivation."); - } - - Float confidenceScore = - nodeClone.has(FINE_GRAINED_CONFIDENCE_SCORE) - ? nodeClone.get(FINE_GRAINED_CONFIDENCE_SCORE).floatValue() - : DEFAULT_CONFIDENCE_SCORE; - - String upstreamType = - nodeClone.has(FINE_GRAINED_UPSTREAM_TYPE) - ? nodeClone.get(FINE_GRAINED_UPSTREAM_TYPE).asText() - : null; - String downstreamType = - nodeClone.has(FINE_GRAINED_DOWNSTREAM_TYPE) - ? nodeClone.get(FINE_GRAINED_DOWNSTREAM_TYPE).asText() - : null; - ArrayNode upstreams = - nodeClone.has(FINE_GRAINED_UPSTREAMS) - ? (ArrayNode) nodeClone.get(FINE_GRAINED_UPSTREAMS) - : null; - - String queryId = - nodeClone.has(FINE_GRAINED_QUERY_ID) - ? nodeClone.get(FINE_GRAINED_QUERY_ID).asText() - : DEFAULT_QUERY_ID; - - if (upstreamType == null) { - // Determine default type - Urn upstreamUrn = - upstreams != null ? UrnUtils.getUrn(upstreams.get(0).asText()) : null; - if (upstreamUrn != null - && DATASET_ENTITY_NAME.equals(upstreamUrn.getEntityType())) { - upstreamType = FINE_GRAINED_LINEAGE_DATASET_TYPE; - } else { - upstreamType = FINE_GRAINED_LINEAGE_FIELD_SET_TYPE; - } - } - - if (downstreamType == null) { - // Always use FIELD type, only support patches for single field downstream - downstreamType = FINE_GRAINED_LINEAGE_FIELD_TYPE; - } - - String downstreamRoot = downstreams.get(0).asText(); - if (!transformationOperationNode.has(downstreamRoot)) { - transformationOperationNode.set(downstreamRoot, instance.objectNode()); - } - ObjectNode downstreamRootNode = - (ObjectNode) transformationOperationNode.get(downstreamRoot); - if (!downstreamRootNode.has(queryId)) { - downstreamRootNode.set(queryId, instance.objectNode()); - } - ObjectNode queryNode = (ObjectNode) downstreamRootNode.get(queryId); - if (upstreams != null) { - addUrnsToParent( - queryNode, upstreams, confidenceScore, upstreamType, downstreamType); - } - }); - return mapNode; - } - - private void addUrnsToParent( - JsonNode parentNode, - ArrayNode urnsList, - Float confidenceScore, - String upstreamType, - String downstreamType) { - // Will overwrite repeat urns with different confidence scores with the most recently seen - ((ObjectNode) parentNode) - .setAll( - Streams.stream(urnsList.elements()) - .map(JsonNode::asText) - .distinct() - .collect( - Collectors.toMap( - urn -> urn, - urn -> - mapToLineageValueNode(confidenceScore, upstreamType, downstreamType)))); - } - - private JsonNode mapToLineageValueNode( - Float confidenceScore, String upstreamType, String downstreamType) { - ObjectNode objectNode = instance.objectNode(); - objectNode.set(FINE_GRAINED_CONFIDENCE_SCORE, instance.numberNode(confidenceScore)); - objectNode.set(FINE_GRAINED_UPSTREAM_TYPE, instance.textNode(upstreamType)); - objectNode.set(FINE_GRAINED_DOWNSTREAM_TYPE, instance.textNode(downstreamType)); - return objectNode; - } - - /** - * Takes the transformed fine grained lineages map from pre-processing and reconstructs an array - * of FineGrainedLineages Avoids producing side effects by copying nodes, use resulting node and - * not the original - * - * @param transformedFineGrainedLineages the transformed fine grained lineage map - * @return the modified {@link JsonNode} formatted consistent with the original schema - */ - private ArrayNode reconstructFineGrainedLineages(JsonNode transformedFineGrainedLineages) { - if (transformedFineGrainedLineages instanceof ArrayNode) { - // We already have an ArrayNode, no need to transform. 
This happens during `replace` - // operations - return (ArrayNode) transformedFineGrainedLineages; - } - ObjectNode mapNode = (ObjectNode) transformedFineGrainedLineages; - ArrayNode fineGrainedLineages = instance.arrayNode(); - - mapNode - .fieldNames() - .forEachRemaining( - transformationOperation -> { - final ObjectNode transformationOperationNode = - (ObjectNode) mapNode.get(transformationOperation); - transformationOperationNode - .fieldNames() - .forEachRemaining( - downstreamName -> { - final ObjectNode downstreamNode = - (ObjectNode) transformationOperationNode.get(downstreamName); - downstreamNode - .fieldNames() - .forEachRemaining( - queryId -> - buildFineGrainedLineage( - downstreamName, - downstreamNode, - queryId, - transformationOperation, - fineGrainedLineages)); - }); - }); - - return fineGrainedLineages; - } - - private void buildFineGrainedLineage( - final String downstreamName, - final ObjectNode downstreamNode, - final String queryId, - final String transformationOperation, - final ArrayNode fineGrainedLineages) { - final ObjectNode fineGrainedLineage = instance.objectNode(); - final ObjectNode queryNode = (ObjectNode) downstreamNode.get(queryId); - if (queryNode.isEmpty()) { - // Short circuit if no upstreams left - return; - } - ArrayNode downstream = instance.arrayNode(); - downstream.add(instance.textNode(downstreamName)); - // Set defaults, if found in sub nodes override, for confidenceScore take lowest - AtomicReference minimumConfidenceScore = new AtomicReference<>(DEFAULT_CONFIDENCE_SCORE); - AtomicReference upstreamType = - new AtomicReference<>(FINE_GRAINED_LINEAGE_FIELD_SET_TYPE); - AtomicReference downstreamType = new AtomicReference<>(FINE_GRAINED_LINEAGE_FIELD_TYPE); - ArrayNode upstreams = instance.arrayNode(); - queryNode - .fieldNames() - .forEachRemaining( - upstream -> - processUpstream( - queryNode, - upstream, - minimumConfidenceScore, - upstreamType, - downstreamType, - upstreams)); - fineGrainedLineage.set(FINE_GRAINED_DOWNSTREAMS, downstream); - fineGrainedLineage.set(FINE_GRAINED_UPSTREAMS, upstreams); - if (StringUtils.isNotBlank(queryId) && !DEFAULT_QUERY_ID.equals(queryId)) { - fineGrainedLineage.set(FINE_GRAINED_QUERY_ID, instance.textNode(queryId)); - } - fineGrainedLineage.set(FINE_GRAINED_UPSTREAM_TYPE, instance.textNode(upstreamType.get())); - fineGrainedLineage.set(FINE_GRAINED_DOWNSTREAM_TYPE, instance.textNode(downstreamType.get())); - fineGrainedLineage.set( - FINE_GRAINED_CONFIDENCE_SCORE, instance.numberNode(minimumConfidenceScore.get())); - fineGrainedLineage.set( - FINE_GRAINED_TRANSFORMATION_OPERATION, instance.textNode(transformationOperation)); - fineGrainedLineages.add(fineGrainedLineage); - } - - private void processUpstream( - final ObjectNode queryNode, - final String upstream, - final AtomicReference minimumConfidenceScore, - final AtomicReference upstreamType, - final AtomicReference downstreamType, - final ArrayNode upstreams) { - final ObjectNode upstreamNode = (ObjectNode) queryNode.get(upstream); - if (upstreamNode.has(FINE_GRAINED_CONFIDENCE_SCORE)) { - Float scoreValue = upstreamNode.get(FINE_GRAINED_CONFIDENCE_SCORE).floatValue(); - if (scoreValue <= minimumConfidenceScore.get()) { - minimumConfidenceScore.set(scoreValue); - } - } - // Set types to last encountered, should never change, but this at least tries to support - // other types being specified. 
- if (upstreamNode.has(FINE_GRAINED_UPSTREAM_TYPE)) { - upstreamType.set(upstreamNode.get(FINE_GRAINED_UPSTREAM_TYPE).asText()); - } - if (upstreamNode.has(FINE_GRAINED_DOWNSTREAM_TYPE)) { - downstreamType.set(upstreamNode.get(FINE_GRAINED_DOWNSTREAM_TYPE).asText()); - } - upstreams.add(instance.textNode(upstream)); - } } diff --git a/entity-registry/src/test/java/com/linkedin/metadata/aspect/patch/template/DataJobInputOutputTemplateTest.java b/entity-registry/src/test/java/com/linkedin/metadata/aspect/patch/template/DataJobInputOutputTemplateTest.java new file mode 100644 index 00000000000000..d2a26221a3bb9f --- /dev/null +++ b/entity-registry/src/test/java/com/linkedin/metadata/aspect/patch/template/DataJobInputOutputTemplateTest.java @@ -0,0 +1,255 @@ +package com.linkedin.metadata.aspect.patch.template; + +import static com.linkedin.metadata.utils.GenericRecordUtils.*; +import static org.testng.Assert.*; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.linkedin.common.UrnArray; +import com.linkedin.common.urn.Urn; +import com.linkedin.common.urn.UrnUtils; +import com.linkedin.data.DataMap; +import com.linkedin.datajob.DataJobInputOutput; +import com.linkedin.dataset.FineGrainedLineage; +import com.linkedin.dataset.FineGrainedLineageDownstreamType; +import com.linkedin.dataset.FineGrainedLineageUpstreamType; +import com.linkedin.metadata.aspect.patch.template.datajob.DataJobInputOutputTemplate; +import jakarta.json.Json; +import jakarta.json.JsonObjectBuilder; +import jakarta.json.JsonPatch; +import jakarta.json.JsonPatchBuilder; +import jakarta.json.JsonValue; +import org.testng.annotations.Test; + +public class DataJobInputOutputTemplateTest { + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + @Test + public void testPatchUpstream() throws Exception { + DataJobInputOutputTemplate dataJobInputOutputTemplate = new DataJobInputOutputTemplate(); + DataJobInputOutput dataJobInputOutput = dataJobInputOutputTemplate.getDefault(); + JsonPatchBuilder jsonPatchBuilder = Json.createPatchBuilder(); + + JsonObjectBuilder fineGrainedLineageNode = Json.createObjectBuilder(); + JsonValue upstreamConfidenceScore = Json.createValue(1.0f); + fineGrainedLineageNode.add("confidenceScore", upstreamConfidenceScore); + jsonPatchBuilder.add( + "/fineGrainedLineages/CREATE/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_1,PROD),c1)//urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_2,PROD),c1)", + fineGrainedLineageNode.build()); + + // Initial population test + DataJobInputOutput result = + dataJobInputOutputTemplate.applyPatch(dataJobInputOutput, jsonPatchBuilder.build()); + // Hack because Jackson parses values to doubles instead of floats + DataMap dataMap = new DataMap(); + dataMap.put("confidenceScore", 1.0); + FineGrainedLineage fineGrainedLineage = new FineGrainedLineage(dataMap); + UrnArray urns = new UrnArray(); + Urn urn1 = + UrnUtils.getUrn( + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_1,PROD),c1)"); + urns.add(urn1); + UrnArray upstreams = new UrnArray(); + Urn upstreamUrn = + UrnUtils.getUrn( + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_2,PROD),c1)"); + upstreams.add(upstreamUrn); + fineGrainedLineage.setDownstreams(urns); + fineGrainedLineage.setUpstreams(upstreams); + fineGrainedLineage.setTransformOperation("CREATE"); + 
fineGrainedLineage.setUpstreamType(FineGrainedLineageUpstreamType.FIELD_SET); + fineGrainedLineage.setDownstreamType(FineGrainedLineageDownstreamType.FIELD); + assertEquals(result.getFineGrainedLineages().get(0), fineGrainedLineage); + + // Test non-overwrite upstreams and correct confidence score and types w/ overwrite + JsonObjectBuilder finegrainedLineageNode2 = Json.createObjectBuilder(); + finegrainedLineageNode2.add( + "upstreamType", Json.createValue(FineGrainedLineageUpstreamType.FIELD_SET.name())); + finegrainedLineageNode2.add("confidenceScore", upstreamConfidenceScore); + finegrainedLineageNode2.add( + "downstreamType", Json.createValue(FineGrainedLineageDownstreamType.FIELD.name())); + + JsonPatchBuilder patchOperations2 = Json.createPatchBuilder(); + patchOperations2.add( + "/fineGrainedLineages/CREATE/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_2,PROD),c2)/urn:li:query:someQuery/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_1,PROD),c2)", + finegrainedLineageNode2.build()); + + JsonValue upstreamConfidenceScore2 = Json.createValue(0.1f); + JsonObjectBuilder finegrainedLineageNode3 = Json.createObjectBuilder(); + finegrainedLineageNode3.add( + "upstreamType", Json.createValue(FineGrainedLineageUpstreamType.DATASET.name())); + finegrainedLineageNode3.add("confidenceScore", upstreamConfidenceScore2); + finegrainedLineageNode3.add( + "downstreamType", Json.createValue(FineGrainedLineageDownstreamType.FIELD_SET.name())); + + patchOperations2.add( + "/fineGrainedLineages/CREATE/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_2,PROD),c2)/urn:li:query:someQuery/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_1,PROD),c2)", + finegrainedLineageNode3.build()); + + JsonPatch jsonPatch2 = patchOperations2.build(); + + DataJobInputOutput result2 = dataJobInputOutputTemplate.applyPatch(result, jsonPatch2); + // Hack because Jackson parses values to doubles instead of floats + DataMap dataMap2 = new DataMap(); + dataMap2.put("confidenceScore", 0.1); + FineGrainedLineage fineGrainedLineage2 = new FineGrainedLineage(dataMap2); + UrnArray urns2 = new UrnArray(); + Urn urn2 = + UrnUtils.getUrn( + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_1,PROD),c2)"); + urns2.add(urn2); + Urn downstreamUrn2 = + UrnUtils.getUrn( + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_2,PROD),c2)"); + UrnArray downstreams2 = new UrnArray(); + downstreams2.add(downstreamUrn2); + fineGrainedLineage2.setUpstreams(urns2); + fineGrainedLineage2.setDownstreams(downstreams2); + fineGrainedLineage2.setTransformOperation("CREATE"); + fineGrainedLineage2.setUpstreamType(FineGrainedLineageUpstreamType.DATASET); + fineGrainedLineage2.setDownstreamType(FineGrainedLineageDownstreamType.FIELD_SET); + fineGrainedLineage2.setQuery(UrnUtils.getUrn("urn:li:query:someQuery")); + assertEquals(result2.getFineGrainedLineages().get(1), fineGrainedLineage2); + + // Check different queries + JsonObjectBuilder finegrainedLineageNode4 = Json.createObjectBuilder(); + finegrainedLineageNode4.add( + "upstreamType", Json.createValue(FineGrainedLineageUpstreamType.FIELD_SET.name())); + finegrainedLineageNode4.add("confidenceScore", upstreamConfidenceScore); + finegrainedLineageNode4.add( + "downstreamType", Json.createValue(FineGrainedLineageDownstreamType.FIELD.name())); + + JsonPatchBuilder patchOperations3 = Json.createPatchBuilder(); + 
patchOperations3.add( + "/fineGrainedLineages/CREATE/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_2,PROD),c2)/urn:li:query:anotherQuery/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_1,PROD),c2)", + finegrainedLineageNode4.build()); + + JsonPatch jsonPatch3 = patchOperations3.build(); + DataJobInputOutput result3 = dataJobInputOutputTemplate.applyPatch(result2, jsonPatch3); + // Hack because Jackson parses values to doubles instead of floats + DataMap dataMap3 = new DataMap(); + dataMap3.put("confidenceScore", 1.0); + FineGrainedLineage fineGrainedLineage3 = new FineGrainedLineage(dataMap3); + UrnArray urns3 = new UrnArray(); + Urn urn3 = + UrnUtils.getUrn( + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_2,PROD),c2)"); + urns3.add(urn3); + + Urn upstreamUrn3 = + UrnUtils.getUrn( + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_1,PROD),c2)"); + UrnArray upstreamUrns3 = new UrnArray(); + upstreamUrns3.add(upstreamUrn3); + fineGrainedLineage3.setDownstreams(urns3); + fineGrainedLineage3.setUpstreams(upstreamUrns3); + fineGrainedLineage3.setTransformOperation("CREATE"); + fineGrainedLineage3.setUpstreamType(FineGrainedLineageUpstreamType.FIELD_SET); + fineGrainedLineage3.setDownstreamType(FineGrainedLineageDownstreamType.FIELD); + fineGrainedLineage3.setQuery(UrnUtils.getUrn("urn:li:query:anotherQuery")); + // Splits into two for different types + assertEquals(result3.getFineGrainedLineages().get(2), fineGrainedLineage3); + + // Check different transform types + JsonObjectBuilder finegrainedLineageNode5 = Json.createObjectBuilder(); + finegrainedLineageNode5.add( + "upstreamType", Json.createValue(FineGrainedLineageUpstreamType.FIELD_SET.name())); + finegrainedLineageNode5.add("confidenceScore", upstreamConfidenceScore); + finegrainedLineageNode5.add( + "downstreamType", Json.createValue(FineGrainedLineageDownstreamType.FIELD.name())); + + JsonPatchBuilder patchOperations4 = Json.createPatchBuilder(); + patchOperations4.add( + "/fineGrainedLineages/TRANSFORM/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_2,PROD),c2)/urn:li:query:anotherQuery/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_1,PROD),c2)", + finegrainedLineageNode5.build()); + JsonPatch jsonPatch4 = patchOperations4.build(); + + DataJobInputOutput result4 = dataJobInputOutputTemplate.applyPatch(result3, jsonPatch4); + // Hack because Jackson parses values to doubles instead of floats + DataMap dataMap4 = new DataMap(); + dataMap4.put("confidenceScore", 1.0); + FineGrainedLineage fineGrainedLineage4 = new FineGrainedLineage(dataMap4); + fineGrainedLineage4.setUpstreams(upstreamUrns3); + fineGrainedLineage4.setDownstreams(urns3); + fineGrainedLineage4.setTransformOperation("TRANSFORM"); + fineGrainedLineage4.setUpstreamType(FineGrainedLineageUpstreamType.FIELD_SET); + fineGrainedLineage4.setDownstreamType(FineGrainedLineageDownstreamType.FIELD); + fineGrainedLineage4.setQuery(UrnUtils.getUrn("urn:li:query:anotherQuery")); + // New entry in array because of new transformation type + assertEquals(result4.getFineGrainedLineages().get(3), fineGrainedLineage4); + + // Remove + JsonPatchBuilder removeOperations = Json.createPatchBuilder(); + removeOperations.remove( + 
"/fineGrainedLineages/CREATE/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_1,PROD),c1)/NONE/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_2,PROD),c1)"); + removeOperations.remove( + "/fineGrainedLineages/CREATE/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_2,PROD),c2)/urn:li:query:someQuery/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_1,PROD),c2)"); + removeOperations.remove( + "/fineGrainedLineages/CREATE/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_2,PROD),c2)/urn:li:query:anotherQuery/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_1,PROD),c2)"); + removeOperations.remove( + "/fineGrainedLineages/TRANSFORM/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_2,PROD),c2)/urn:li:query:anotherQuery/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_1,PROD),c2)"); + + JsonPatch removePatch = removeOperations.build(); + DataJobInputOutput finalResult = dataJobInputOutputTemplate.applyPatch(result4, removePatch); + assertEquals(finalResult, dataJobInputOutputTemplate.getDefault()); + } + + @Test + public void testPatchWithFieldWithForwardSlash() throws JsonProcessingException { + + String downstreamUrn = + "/fineGrainedLineages/CREATE/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_1,PROD),c1)"; + String unescapedUpstreamUrn = + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_2,PROD),slash/column)"; + String escapedUpstreamUrn = + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_2,PROD),slash~1column)"; + String lineagePath = downstreamUrn + "//" + escapedUpstreamUrn; + + DataJobInputOutputTemplate dataJobInputOutputTemplate = new DataJobInputOutputTemplate(); + DataJobInputOutput dataJobInputOutput = dataJobInputOutputTemplate.getDefault(); + JsonPatchBuilder jsonPatchBuilder = Json.createPatchBuilder(); + + JsonObjectBuilder fineGrainedLineageNode = Json.createObjectBuilder(); + JsonValue upstreamConfidenceScore = Json.createValue(1.0f); + fineGrainedLineageNode.add("confidenceScore", upstreamConfidenceScore); + + jsonPatchBuilder.add(lineagePath, fineGrainedLineageNode.build()); + + // Initial population test + DataJobInputOutput result = + dataJobInputOutputTemplate.applyPatch(dataJobInputOutput, jsonPatchBuilder.build()); + + assertEquals( + result.getFineGrainedLineages().get(0).getUpstreams().get(0).toString(), + unescapedUpstreamUrn); + } + + @Test + public void testPatchWithFieldWithTilde() throws JsonProcessingException { + + String downstreamUrn = + "/fineGrainedLineages/CREATE/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_1,PROD),c1)"; + String unescapedUpstreamUrn = + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_2,PROD),tilde~column)"; + String escapedUpstreamUrn = + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_2,PROD),tilde~0column)"; + String lineagePath = downstreamUrn + "//" + escapedUpstreamUrn; + + DataJobInputOutputTemplate dataJobInputOutputTemplate = new DataJobInputOutputTemplate(); + DataJobInputOutput dataJobInputOutput = dataJobInputOutputTemplate.getDefault(); + JsonPatchBuilder jsonPatchBuilder = Json.createPatchBuilder(); + + JsonObjectBuilder fineGrainedLineageNode = Json.createObjectBuilder(); + JsonValue 
upstreamConfidenceScore = Json.createValue(1.0f); + fineGrainedLineageNode.add("confidenceScore", upstreamConfidenceScore); + + jsonPatchBuilder.add(lineagePath, fineGrainedLineageNode.build()); + + // Initial population test + DataJobInputOutput result = + dataJobInputOutputTemplate.applyPatch(dataJobInputOutput, jsonPatchBuilder.build()); + assertEquals( + result.getFineGrainedLineages().get(0).getUpstreams().get(0).toString(), + unescapedUpstreamUrn); + } +} diff --git a/entity-registry/src/test/java/com/linkedin/metadata/aspect/patch/template/UpstreamLineageTemplateTest.java b/entity-registry/src/test/java/com/linkedin/metadata/aspect/patch/template/UpstreamLineageTemplateTest.java index f934dd8961ca37..ab0e7f960251c4 100644 --- a/entity-registry/src/test/java/com/linkedin/metadata/aspect/patch/template/UpstreamLineageTemplateTest.java +++ b/entity-registry/src/test/java/com/linkedin/metadata/aspect/patch/template/UpstreamLineageTemplateTest.java @@ -221,6 +221,7 @@ public void testPatchUpstream() throws Exception { JsonPatch removePatch = removeOperations.build(); UpstreamLineage finalResult = upstreamLineageTemplate.applyPatch(result4, removePatch); + assertEquals(finalResult, upstreamLineageTemplate.getDefault()); } @@ -337,4 +338,39 @@ public void testPatchWithFieldWithTilde() throws JsonProcessingException { result.getFineGrainedLineages().get(0).getUpstreams().get(0).toString(), unescapedUpstreamUrn); } + + @Test + public void testPatchRemoveWithFields() throws JsonProcessingException { + + String downstreamUrn = + "/fineGrainedLineages/CREATE/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:s3,~1tmp~1test.parquet,PROD),c1)"; + String upstreamUrn = + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_2,PROD),c1)"; + String upstreamUrn2 = + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_2,PROD),c2)"; + + String lineagePath1 = downstreamUrn + "/NONE/" + upstreamUrn; + String lineagePath2 = downstreamUrn + "/NONE/" + upstreamUrn2; + + UpstreamLineageTemplate upstreamLineageTemplate = new UpstreamLineageTemplate(); + UpstreamLineage upstreamLineage = upstreamLineageTemplate.getDefault(); + JsonPatchBuilder jsonPatchBuilder = Json.createPatchBuilder(); + + JsonObjectBuilder fineGrainedLineageNode = Json.createObjectBuilder(); + JsonValue upstreamConfidenceScore = Json.createValue(1.0f); + fineGrainedLineageNode.add("confidenceScore", upstreamConfidenceScore); + + jsonPatchBuilder.add(lineagePath1, fineGrainedLineageNode.build()); + jsonPatchBuilder.add(lineagePath2, fineGrainedLineageNode.build()); + + // Initial population test + UpstreamLineage result = + upstreamLineageTemplate.applyPatch(upstreamLineage, jsonPatchBuilder.build()); + assertEquals( + result.getFineGrainedLineages().get(0).getUpstreams().get(0).toString(), upstreamUrn); + assertEquals( + result.getFineGrainedLineages().get(0).getUpstreams().get(1).toString(), upstreamUrn2); + + assertEquals(result.getFineGrainedLineages().get(0).getUpstreams().size(), 2); + } } From 8f9659fadf8f0fcc51470cd77561a03bbe7baa9b Mon Sep 17 00:00:00 2001 From: Austin SeungJun Park <110667795+eagle-25@users.noreply.github.com> Date: Tue, 17 Dec 2024 17:57:03 +0900 Subject: [PATCH 10/21] fix(ingest/s3): incorrectly parsing path in s3_uri (#12135) --- metadata-ingestion/src/datahub/ingestion/source/s3/source.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/s3/source.py 
b/metadata-ingestion/src/datahub/ingestion/source/s3/source.py index 1863663f98bb24..3ddf47b70cdf80 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/s3/source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/s3/source.py @@ -9,6 +9,7 @@ from itertools import groupby from pathlib import PurePath from typing import Any, Dict, Iterable, List, Optional, Tuple +from urllib.parse import urlparse import smart_open.compression as so_compression from more_itertools import peekable @@ -993,9 +994,7 @@ def s3_browser(self, path_spec: PathSpec, sample_size: int) -> Iterable[BrowsePa folders = [] for dir in dirs_to_process: logger.info(f"Getting files from folder: {dir}") - prefix_to_process = dir.rstrip("\\").lstrip( - self.create_s3_path(bucket_name, "/") - ) + prefix_to_process = urlparse(dir).path.lstrip("/") folders.extend( self.get_folder_info( From d2359e259aa0f09506bfe68893abbda92d30601d Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Tue, 17 Dec 2024 03:58:47 -0500 Subject: [PATCH 11/21] feat(ingest/datahub): report progress on db ingestion (#12117) --- .../datahub/ingestion/source/datahub/datahub_source.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_source.py b/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_source.py index 63cea45f75864b..cb72441344088c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_source.py @@ -1,5 +1,5 @@ import logging -from datetime import datetime, timezone +from datetime import datetime, timedelta, timezone from functools import partial from typing import Dict, Iterable, List, Optional @@ -26,6 +26,7 @@ StatefulIngestionSourceBase, ) from datahub.metadata.schema_classes import ChangeTypeClass +from datahub.utilities.progress_timer import ProgressTimer logger = logging.getLogger(__name__) @@ -105,11 +106,17 @@ def _get_database_workunits( self, from_createdon: datetime, reader: DataHubDatabaseReader ) -> Iterable[MetadataWorkUnit]: logger.info(f"Fetching database aspects starting from {from_createdon}") + progress = ProgressTimer(report_every=timedelta(seconds=60)) mcps = reader.get_aspects(from_createdon, self.report.stop_time) for i, (mcp, createdon) in enumerate(mcps): if not self.urn_pattern.allowed(str(mcp.entityUrn)): continue + if progress.should_report(): + logger.info( + f"Ingested {i} database aspects so far, currently at {createdon}" + ) + yield mcp.as_workunit() self.report.num_database_aspects_ingested += 1 From ff385edbb1b6f0bb6de5f55cb6b30d8db9d1f13c Mon Sep 17 00:00:00 2001 From: Andrew Sikowitz Date: Tue, 17 Dec 2024 03:49:47 -0800 Subject: [PATCH 12/21] build(ingest/sqlglot): Bump pin to support snowflake CREATE ... WITH TAG (#12003) --- metadata-ingestion/setup.py | 2 +- .../src/datahub/testing/compare_metadata_json.py | 2 +- .../sql_parsing/aggregator_goldens/test_table_rename.json | 2 +- .../unit/sql_parsing/aggregator_goldens/test_table_swap.json | 4 ++-- .../aggregator_goldens/test_table_swap_with_temp.json | 2 +- .../goldens/test_bigquery_information_schema_query.json | 4 ++-- 6 files changed, 8 insertions(+), 8 deletions(-) diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index 415871d30175f8..31db711592eb14 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -101,7 +101,7 @@ # We heavily monkeypatch sqlglot. 
# Prior to the patching, we originally maintained an acryl-sqlglot fork: # https://github.com/tobymao/sqlglot/compare/main...hsheth2:sqlglot:main?expand=1 - "sqlglot[rs]==25.26.0", + "sqlglot[rs]==25.32.1", "patchy==2.8.0", } diff --git a/metadata-ingestion/src/datahub/testing/compare_metadata_json.py b/metadata-ingestion/src/datahub/testing/compare_metadata_json.py index bedc5bc8fcd5e5..9dbadd4804997d 100644 --- a/metadata-ingestion/src/datahub/testing/compare_metadata_json.py +++ b/metadata-ingestion/src/datahub/testing/compare_metadata_json.py @@ -117,7 +117,7 @@ def diff_metadata_json( ignore_paths: Sequence[str] = (), ignore_order: bool = True, ) -> Union[DeepDiff, MCPDiff]: - ignore_paths = (*ignore_paths, *default_exclude_paths, r"root\[\d+].delta_info") + ignore_paths = [*ignore_paths, *default_exclude_paths, r"root\[\d+].delta_info"] try: if ignore_order: golden_map = get_aspects_by_urn(golden) diff --git a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_rename.json b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_rename.json index 2d32e1328fbb4f..fd8475090f009e 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_rename.json +++ b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_rename.json @@ -185,7 +185,7 @@ "aspect": { "json": { "statement": { - "value": "ALTER TABLE dev.public.foo_staging RENAME TO foo", + "value": "ALTER TABLE dev.public.foo_staging RENAME TO foo /* Datahub generated query text-- */", "language": "SQL" }, "source": "SYSTEM", diff --git a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_swap.json b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_swap.json index af0fca485777ff..d9d46a4b14a146 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_swap.json +++ b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_swap.json @@ -185,7 +185,7 @@ "aspect": { "json": { "statement": { - "value": "ALTER TABLE dev.public.person_info_swap SWAP WITH dev.public.person_info", + "value": "ALTER TABLE dev.public.person_info_swap SWAP WITH dev.public.person_info /* Datahub generated query text-- */", "language": "SQL" }, "source": "SYSTEM", @@ -438,7 +438,7 @@ "aspect": { "json": { "statement": { - "value": "ALTER TABLE dev.public.person_info SWAP WITH dev.public.person_info_swap", + "value": "ALTER TABLE dev.public.person_info SWAP WITH dev.public.person_info_swap /* Datahub generated query text-- */", "language": "SQL" }, "source": "SYSTEM", diff --git a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_swap_with_temp.json b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_swap_with_temp.json index ceaaf8f6887c7c..b4eaf76a149337 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_swap_with_temp.json +++ b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_swap_with_temp.json @@ -175,7 +175,7 @@ "aspect": { "json": { "statement": { - "value": "CREATE TABLE person_info_swap CLONE person_info;\n\nCREATE TABLE person_info_incremental AS\nSELECT\n *\nFROM person_info_dep;\n\nINSERT INTO person_info_swap\nSELECT\n *\nFROM person_info_incremental;\n\nALTER TABLE dev.public.person_info_swap SWAP WITH dev.public.person_info", + "value": "CREATE TABLE person_info_swap CLONE person_info;\n\nCREATE TABLE person_info_incremental AS\nSELECT\n *\nFROM person_info_dep;\n\nINSERT INTO 
person_info_swap\nSELECT\n *\nFROM person_info_incremental;\n\nALTER TABLE dev.public.person_info_swap SWAP WITH dev.public.person_info /* Datahub generated query text-- */", "language": "SQL" }, "source": "SYSTEM", diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_information_schema_query.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_information_schema_query.json index f5f573f3d51136..9621b7d1c265b4 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_information_schema_query.json +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_information_schema_query.json @@ -1,7 +1,7 @@ { "query_type": "SELECT", "query_type_props": {}, - "query_fingerprint": "c721ce16410601b36e5f32bd9c5c28488500a93e617363739faebfe71496f163", + "query_fingerprint": "a204522c98a01568d8575a98a715de98985aeef0e822feb8450153f71891d6c6", "in_tables": [ "urn:li:dataset:(urn:li:dataPlatform:bigquery,acryl-staging-2.smoke_test_db_4.INFORMATION_SCHEMA.COLUMNS,PROD)", "urn:li:dataset:(urn:li:dataPlatform:bigquery,acryl-staging-2.smoke_test_db_4.INFORMATION_SCHEMA.COLUMN_FIELD_PATHS,PROD)" @@ -178,6 +178,6 @@ ], "debug_info": { "confidence": 0.2, - "generalized_statement": "SELECT c.table_catalog AS table_catalog, c.table_schema AS table_schema, c.table_name AS table_name, c.column_name AS column_name, c.ordinal_position AS ordinal_position, cfp.field_path AS field_path, c.is_nullable AS is_nullable, CASE WHEN CONTAINS_SUBSTR(cfp.field_path, ?) THEN NULL ELSE c.data_type END AS data_type, description AS comment, c.is_hidden AS is_hidden, c.is_partitioning_column AS is_partitioning_column, c.clustering_ordinal_position AS clustering_ordinal_position FROM `acryl-staging-2`.`smoke_test_db_4`.INFORMATION_SCHEMA.COLUMNS AS c JOIN `acryl-staging-2`.`smoke_test_db_4`.INFORMATION_SCHEMA.COLUMN_FIELD_PATHS AS cfp ON cfp.table_name = c.table_name AND cfp.column_name = c.column_name ORDER BY table_catalog, table_schema, table_name, ordinal_position ASC, data_type DESC" + "generalized_statement": "SELECT c.table_catalog AS table_catalog, c.table_schema AS table_schema, c.table_name AS table_name, c.column_name AS column_name, c.ordinal_position AS ordinal_position, cfp.field_path AS field_path, c.is_nullable AS is_nullable, CASE WHEN CONTAINS_SUBSTR(cfp.field_path, ?) 
THEN NULL ELSE c.data_type END AS data_type, description AS comment, c.is_hidden AS is_hidden, c.is_partitioning_column AS is_partitioning_column, c.clustering_ordinal_position AS clustering_ordinal_position FROM `acryl-staging-2`.`smoke_test_db_4`.`INFORMATION_SCHEMA.COLUMNS` AS c JOIN `acryl-staging-2`.`smoke_test_db_4`.`INFORMATION_SCHEMA.COLUMN_FIELD_PATHS` AS cfp ON cfp.table_name = c.table_name AND cfp.column_name = c.column_name ORDER BY table_catalog, table_schema, table_name, ordinal_position ASC, data_type DESC" } } \ No newline at end of file From 42cad3d5267386ad207740eb991b7a4a95c4f3e2 Mon Sep 17 00:00:00 2001 From: deepgarg-visa <149145061+deepgarg-visa@users.noreply.github.com> Date: Tue, 17 Dec 2024 21:53:10 +0530 Subject: [PATCH 13/21] fix(frontend): fix typo datahub-frontend logback.xml (#12134) --- datahub-frontend/conf/logback.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datahub-frontend/conf/logback.xml b/datahub-frontend/conf/logback.xml index 78da231b4a71c5..de37c56cba38a7 100644 --- a/datahub-frontend/conf/logback.xml +++ b/datahub-frontend/conf/logback.xml @@ -61,7 +61,7 @@ - + From d5ab001a97543535dbf82d8ff036a4092083111e Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Tue, 17 Dec 2024 13:54:37 -0500 Subject: [PATCH 14/21] feat(ingest/git): add subdir support to GitReference (#12131) Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Co-authored-by: harshal.sheth@acryl.io --- .../src/datahub/configuration/git.py | 8 ++++- .../tests/integration/git/test_git_clone.py | 35 ++++++++++++++++--- 2 files changed, 38 insertions(+), 5 deletions(-) diff --git a/metadata-ingestion/src/datahub/configuration/git.py b/metadata-ingestion/src/datahub/configuration/git.py index d237cd9ddd306c..e7e9bfd43adca5 100644 --- a/metadata-ingestion/src/datahub/configuration/git.py +++ b/metadata-ingestion/src/datahub/configuration/git.py @@ -24,7 +24,11 @@ class GitReference(ConfigModel): "main", description="Branch on which your files live by default. Typically main or master. This can also be a commit hash.", ) - + url_subdir: Optional[str] = Field( + default=None, + description="Prefix to prepend when generating URLs for files - useful when files are in a subdirectory. " + "Only affects URL generation, not git operations.", + ) url_template: Optional[str] = Field( None, description=f"Template for generating a URL to a file in the repo e.g. '{_GITHUB_URL_TEMPLATE}'. We can infer this for GitHub and GitLab repos, and it is otherwise required." @@ -68,6 +72,8 @@ def infer_url_template(cls, url_template: Optional[str], values: dict) -> str: def get_url_for_file_path(self, file_path: str) -> str: assert self.url_template + if self.url_subdir: + file_path = f"{self.url_subdir}/{file_path}" return self.url_template.format( repo_url=self.repo, branch=self.branch, file_path=file_path ) diff --git a/metadata-ingestion/tests/integration/git/test_git_clone.py b/metadata-ingestion/tests/integration/git/test_git_clone.py index 60cf20fefcbdd1..01e075930998a4 100644 --- a/metadata-ingestion/tests/integration/git/test_git_clone.py +++ b/metadata-ingestion/tests/integration/git/test_git_clone.py @@ -1,4 +1,5 @@ import os +import pathlib import pytest from pydantic import SecretStr @@ -12,7 +13,7 @@ LOOKML_TEST_SSH_KEY = os.environ.get("DATAHUB_LOOKML_GIT_TEST_SSH_KEY") -def test_base_url_guessing(): +def test_base_url_guessing() -> None: # Basic GitHub repo. 
config = GitInfo(repo="https://github.com/datahub-project/datahub", branch="master") assert config.repo_ssh_locator == "git@github.com:datahub-project/datahub.git" @@ -70,7 +71,7 @@ def test_base_url_guessing(): ) -def test_github_branch(): +def test_github_branch() -> None: config = GitInfo( repo="owner/repo", ) @@ -83,11 +84,37 @@ def test_github_branch(): assert config.branch_for_clone == "main" +def test_url_subdir() -> None: + git_ref = GitReference(repo="https://github.com/org/repo", url_subdir="dbt") + assert ( + git_ref.get_url_for_file_path("model.sql") + == "https://github.com/org/repo/blob/main/dbt/model.sql" + ) + + git_ref = GitReference(repo="https://gitlab.com/org/repo", url_subdir="dbt") + assert ( + git_ref.get_url_for_file_path("model.sql") + == "https://gitlab.com/org/repo/-/blob/main/dbt/model.sql" + ) + + git_ref = GitReference(repo="https://github.com/org/repo", url_subdir="") + assert ( + git_ref.get_url_for_file_path("model.sql") + == "https://github.com/org/repo/blob/main/model.sql" + ) + + git_ref = GitReference(repo="https://github.com/org/repo", url_subdir="dbt/models") + assert ( + git_ref.get_url_for_file_path("model.sql") + == "https://github.com/org/repo/blob/main/dbt/models/model.sql" + ) + + def test_sanitize_repo_url() -> None: assert_doctest(datahub.ingestion.source.git.git_import) -def test_git_clone_public(tmp_path): +def test_git_clone_public(tmp_path: pathlib.Path) -> None: git_clone = GitClone(str(tmp_path)) checkout_dir = git_clone.clone( ssh_key=None, @@ -107,7 +134,7 @@ def test_git_clone_public(tmp_path): LOOKML_TEST_SSH_KEY is None, reason="DATAHUB_LOOKML_GIT_TEST_SSH_KEY env variable is not configured", ) -def test_git_clone_private(tmp_path): +def test_git_clone_private(tmp_path: pathlib.Path) -> None: git_clone = GitClone(str(tmp_path)) secret_key = SecretStr(LOOKML_TEST_SSH_KEY) if LOOKML_TEST_SSH_KEY else None From ef1c1df8d0c03772f4132e5e37de67ed956cdbea Mon Sep 17 00:00:00 2001 From: Chris Collins Date: Tue, 17 Dec 2024 15:56:16 -0500 Subject: [PATCH 15/21] fix(ui) Fix nesting logic in properties tab (#12151) --- .../__tests__/useStructuredProperties.test.ts | 87 +++++++++++++++++++ .../Properties/useStructuredProperties.tsx | 6 +- 2 files changed, 90 insertions(+), 3 deletions(-) create mode 100644 datahub-web-react/src/app/entity/shared/tabs/Properties/__tests__/useStructuredProperties.test.ts diff --git a/datahub-web-react/src/app/entity/shared/tabs/Properties/__tests__/useStructuredProperties.test.ts b/datahub-web-react/src/app/entity/shared/tabs/Properties/__tests__/useStructuredProperties.test.ts new file mode 100644 index 00000000000000..ff7c6e51a04a00 --- /dev/null +++ b/datahub-web-react/src/app/entity/shared/tabs/Properties/__tests__/useStructuredProperties.test.ts @@ -0,0 +1,87 @@ +import { identifyAndAddParentRows } from '../useStructuredProperties'; + +describe('identifyAndAddParentRows', () => { + it('should not return parent rows when there are none', () => { + const propertyRows = [ + { displayName: 'test1', qualifiedName: 'test1' }, + { displayName: 'test2', qualifiedName: 'test2' }, + ]; + expect(identifyAndAddParentRows(propertyRows)).toMatchObject([]); + }); + + it('should not return parent rows when another row starts with the same letters but is a different token', () => { + const propertyRows = [ + { displayName: 'test1', qualifiedName: 'testing.one' }, + { displayName: 'test2', qualifiedName: 'testingAgain.two' }, + ]; + expect(identifyAndAddParentRows(propertyRows)).toMatchObject([]); + }); + + it('should return 
parent rows properly', () => { + const propertyRows = [ + { displayName: 'test1', qualifiedName: 'testing.one' }, + { displayName: 'test2', qualifiedName: 'testing.two' }, + { displayName: 'test3', qualifiedName: 'testing.three' }, + ]; + expect(identifyAndAddParentRows(propertyRows)).toMatchObject([ + { displayName: 'testing', qualifiedName: 'testing', childrenCount: 3 }, + ]); + }); + + it('should return parent rows properly with multiple layers of nesting', () => { + const propertyRows = [ + { displayName: 'test1', qualifiedName: 'testing.one.two.a.1' }, + { displayName: 'test1', qualifiedName: 'testing.one.two.a.2' }, + { displayName: 'test1', qualifiedName: 'testing.one.two.b' }, + { displayName: 'test1', qualifiedName: 'testing.one.three' }, + { displayName: 'test2', qualifiedName: 'testing.two.c.d' }, + { displayName: 'test3', qualifiedName: 'testing.three' }, + { displayName: 'test3', qualifiedName: 'testParent' }, + ]; + expect(identifyAndAddParentRows(propertyRows)).toMatchObject([ + { displayName: 'testing', qualifiedName: 'testing', isParentRow: true, childrenCount: 6 }, + { displayName: 'testing.one', qualifiedName: 'testing.one', isParentRow: true, childrenCount: 4 }, + { displayName: 'testing.one.two', qualifiedName: 'testing.one.two', isParentRow: true, childrenCount: 3 }, + { + displayName: 'testing.one.two.a', + qualifiedName: 'testing.one.two.a', + isParentRow: true, + childrenCount: 2, + }, + ]); + }); + + it('should return parent rows properly with multiple layers of nesting regardless of order', () => { + const propertyRows = [ + { displayName: 'test1', qualifiedName: 'testing.one.two.a.1' }, + { displayName: 'test3', qualifiedName: 'testParent' }, + { displayName: 'test1', qualifiedName: 'testing.one.three' }, + { displayName: 'test2', qualifiedName: 'testing.two.c.d' }, + { displayName: 'test1', qualifiedName: 'testing.one.two.b' }, + { displayName: 'test3', qualifiedName: 'testing.three' }, + { displayName: 'test1', qualifiedName: 'testing.one.two.a.2' }, + ]; + expect(identifyAndAddParentRows(propertyRows)).toMatchObject([ + { displayName: 'testing', qualifiedName: 'testing', isParentRow: true, childrenCount: 6 }, + { displayName: 'testing.one', qualifiedName: 'testing.one', isParentRow: true, childrenCount: 4 }, + { displayName: 'testing.one.two', qualifiedName: 'testing.one.two', isParentRow: true, childrenCount: 3 }, + { + displayName: 'testing.one.two.a', + qualifiedName: 'testing.one.two.a', + isParentRow: true, + childrenCount: 2, + }, + ]); + }); + + it('should return parent rows properly with simpler layers of nesting', () => { + const propertyRows = [ + { displayName: 'test2', qualifiedName: 'testing.two.c.d' }, + { displayName: 'test3', qualifiedName: 'testing.three' }, + { displayName: 'test3', qualifiedName: 'testParent' }, + ]; + expect(identifyAndAddParentRows(propertyRows)).toMatchObject([ + { displayName: 'testing', qualifiedName: 'testing', isParentRow: true, childrenCount: 2 }, + ]); + }); +}); diff --git a/datahub-web-react/src/app/entity/shared/tabs/Properties/useStructuredProperties.tsx b/datahub-web-react/src/app/entity/shared/tabs/Properties/useStructuredProperties.tsx index 18ee6bb18da3d3..60d0aac30eb4ce 100644 --- a/datahub-web-react/src/app/entity/shared/tabs/Properties/useStructuredProperties.tsx +++ b/datahub-web-react/src/app/entity/shared/tabs/Properties/useStructuredProperties.tsx @@ -122,10 +122,10 @@ export function identifyAndAddParentRows(rows?: Array): Array name.startsWith(token)).length; + const currentCount = 
qualifiedNames.filter((name) => name.startsWith(`${token}.`)).length; - // If we're at the beginning of the path and there is no nesting, break - if (index === 0 && currentCount === 1) { + // If there's only one child, don't nest it + if (currentCount === 1) { break; } From 826437612e2526864dc82731111113341863cd5a Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Tue, 17 Dec 2024 23:21:05 -0500 Subject: [PATCH 16/21] fix(ingest/snowflake): improve lineage parse failure logging (#12153) --- .../ingestion/source/snowflake/snowflake_lineage_v2.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py index 93d84d8b246e51..c769c6705ac3f6 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py @@ -414,9 +414,13 @@ def _process_upstream_lineage_row( except Exception as e: self.report.num_upstream_lineage_edge_parsing_failed += 1 upstream_tables = db_row.get("UPSTREAM_TABLES") + downstream_table = db_row.get("DOWNSTREAM_TABLE_NAME") self.structured_reporter.warning( "Failed to parse lineage edge", - context=f"Upstreams: {upstream_tables} Downstreams: {db_row.get('DOWNSTREAM_TABLE_NAME')}", + # Tricky: sometimes the full row data is too large, and so the context + # message gets truncated. By pulling out the upstreams and downstream + # list, we can at least get the important fields if truncation does occur. + context=f"Upstreams: {upstream_tables} Downstream: {downstream_table} Full row: {db_row}", exc=e, ) return None From 5946558c01c0b3f99effe8fd7fd11ba30c892a1f Mon Sep 17 00:00:00 2001 From: Alice-sky <1835063592@qq.com> Date: Wed, 18 Dec 2024 15:21:41 +0800 Subject: [PATCH 17/21] fix(ingest/pulsar): handle Avro schema with missing namespace or name (#12058) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Alice Co-authored-by: Shirshanka Das Co-authored-by: Sergio Gómez Villamor Co-authored-by: Harshal Sheth --- .../src/datahub/ingestion/source/pulsar.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/pulsar.py b/metadata-ingestion/src/datahub/ingestion/source/pulsar.py index 15ee995b2d5fdc..f71949b9eb27f7 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/pulsar.py +++ b/metadata-ingestion/src/datahub/ingestion/source/pulsar.py @@ -89,7 +89,16 @@ def __init__(self, schema): logger.error(f"Invalid JSON schema: {schema_data}. Error: {str(e)}") avro_schema = {} - self.schema_name = avro_schema.get("namespace") + "." + avro_schema.get("name") + self.schema_name = "null" + if avro_schema.get("namespace") and avro_schema.get("name"): + self.schema_name = ( + avro_schema.get("namespace") + "." 
+ avro_schema.get("name") + ) + elif avro_schema.get("namespace"): + self.schema_name = avro_schema.get("namespace") + elif avro_schema.get("name"): + self.schema_name = avro_schema.get("name") + self.schema_description = avro_schema.get("doc") self.schema_type = schema.get("type") self.schema_str = schema.get("data") From 76cfac3700f261dd87d0c494235ea8c1635bd7ec Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Wed, 18 Dec 2024 04:04:51 -0500 Subject: [PATCH 18/21] fix(cli/properties): allow structured properties without a graph instance (#12144) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Sergio Gómez Villamor --- .../structuredproperties.py | 245 +++++++++--------- .../cli/specific/structuredproperties_cli.py | 3 +- .../entities/structuredproperties/__init__.py | 0 .../example_structured_properties_golden.json | 194 ++++++++++++++ .../test_structuredproperties.py | 38 +++ .../tests/unit/serde/test_codegen.py | 7 + 6 files changed, 357 insertions(+), 130 deletions(-) create mode 100644 metadata-ingestion/tests/unit/api/entities/structuredproperties/__init__.py create mode 100644 metadata-ingestion/tests/unit/api/entities/structuredproperties/example_structured_properties_golden.json create mode 100644 metadata-ingestion/tests/unit/api/entities/structuredproperties/test_structuredproperties.py diff --git a/metadata-ingestion/src/datahub/api/entities/structuredproperties/structuredproperties.py b/metadata-ingestion/src/datahub/api/entities/structuredproperties/structuredproperties.py index fd3fe7ca098ecb..e37281dea86e1f 100644 --- a/metadata-ingestion/src/datahub/api/entities/structuredproperties/structuredproperties.py +++ b/metadata-ingestion/src/datahub/api/entities/structuredproperties/structuredproperties.py @@ -9,27 +9,18 @@ from datahub.configuration.common import ConfigModel from datahub.emitter.mcp import MetadataChangeProposalWrapper -from datahub.ingestion.api.global_context import get_graph_context, set_graph_context -from datahub.ingestion.graph.client import DataHubGraph, get_default_graph +from datahub.ingestion.graph.client import DataHubGraph from datahub.metadata.schema_classes import ( PropertyValueClass, StructuredPropertyDefinitionClass, ) -from datahub.utilities.urns.urn import Urn +from datahub.metadata.urns import StructuredPropertyUrn, Urn +from datahub.utilities.urns._urn_base import URN_TYPES logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) -class StructuredPropertiesConfig: - """Configuration class to hold the graph client""" - - @classmethod - def get_graph_required(cls) -> DataHubGraph: - """Get the current graph, falling back to default if none set""" - return get_graph_context() or get_default_graph() - - class AllowedTypes(Enum): STRING = "string" RICH_TEXT = "rich_text" @@ -51,29 +42,28 @@ class AllowedValue(ConfigModel): description: Optional[str] = None -VALID_ENTITY_TYPES_PREFIX_STRING = ", ".join( - [ - f"urn:li:entityType:datahub.{x}" - for x in ["dataset", "dashboard", "dataFlow", "schemaField"] - ] -) -VALID_ENTITY_TYPES_STRING = f"Valid entity type urns are {VALID_ENTITY_TYPES_PREFIX_STRING}, etc... Ensure that the entity type is valid." +VALID_ENTITY_TYPE_URNS = [ + Urn.make_entity_type_urn(entity_type) for entity_type in URN_TYPES.keys() +] +_VALID_ENTITY_TYPES_STRING = f"Valid entity type urns are {', '.join(VALID_ENTITY_TYPE_URNS)}, etc... Ensure that the entity type is valid." 
+ + +def _validate_entity_type_urn(v: str) -> str: + urn = Urn.make_entity_type_urn(v) + if urn not in VALID_ENTITY_TYPE_URNS: + raise ValueError( + f"Input {v} is not a valid entity type urn. {_VALID_ENTITY_TYPES_STRING}" + ) + v = str(urn) + return v class TypeQualifierAllowedTypes(ConfigModel): allowed_types: List[str] - @validator("allowed_types", each_item=True) - def validate_allowed_types(cls, v): - if v: - graph = StructuredPropertiesConfig.get_graph_required() - validated_urn = Urn.make_entity_type_urn(v) - if not graph.exists(validated_urn): - raise ValueError( - f"Input {v} is not a valid entity type urn. {VALID_ENTITY_TYPES_STRING}" - ) - v = str(validated_urn) - return v + _check_allowed_types = validator("allowed_types", each_item=True, allow_reuse=True)( + _validate_entity_type_urn + ) class StructuredProperties(ConfigModel): @@ -90,22 +80,30 @@ class StructuredProperties(ConfigModel): type_qualifier: Optional[TypeQualifierAllowedTypes] = None immutable: Optional[bool] = False - @validator("entity_types", each_item=True) - def validate_entity_types(cls, v): - if v: - graph = StructuredPropertiesConfig.get_graph_required() - validated_urn = Urn.make_entity_type_urn(v) - if not graph.exists(validated_urn): - raise ValueError( - f"Input {v} is not a valid entity type urn. {VALID_ENTITY_TYPES_STRING}" - ) - v = str(validated_urn) + _check_entity_types = validator("entity_types", each_item=True, allow_reuse=True)( + _validate_entity_type_urn + ) + + @validator("type") + def validate_type(cls, v: str) -> str: + # Convert to lowercase if needed + if not v.islower(): + logger.warning( + f"Structured property type should be lowercase. Updated to {v.lower()}" + ) + v = v.lower() + + # Check if type is allowed + if not AllowedTypes.check_allowed_type(v): + raise ValueError( + f"Type {v} is not allowed. Allowed types are {AllowedTypes.values()}" + ) return v @property def fqn(self) -> str: assert self.urn is not None - id = Urn.create_from_string(self.urn).get_entity_id()[0] + id = StructuredPropertyUrn.from_string(self.urn).id if self.qualified_name is not None: # ensure that qualified name and ID match assert ( @@ -122,101 +120,90 @@ def urn_must_be_present(cls, v, values): return v @staticmethod - def create(file: str, graph: Optional[DataHubGraph] = None) -> None: - with set_graph_context(graph): - graph = StructuredPropertiesConfig.get_graph_required() - - with open(file) as fp: - structuredproperties: List[dict] = yaml.safe_load(fp) - for structuredproperty_raw in structuredproperties: - structuredproperty = StructuredProperties.parse_obj( - structuredproperty_raw - ) - - if not structuredproperty.type.islower(): - structuredproperty.type = structuredproperty.type.lower() - logger.warning( - f"Structured property type should be lowercase. Updated to {structuredproperty.type}" - ) - if not AllowedTypes.check_allowed_type(structuredproperty.type): - raise ValueError( - f"Type {structuredproperty.type} is not allowed. 
Allowed types are {AllowedTypes.values()}" - ) - mcp = MetadataChangeProposalWrapper( - entityUrn=structuredproperty.urn, - aspect=StructuredPropertyDefinitionClass( - qualifiedName=structuredproperty.fqn, - valueType=Urn.make_data_type_urn(structuredproperty.type), - displayName=structuredproperty.display_name, - description=structuredproperty.description, - entityTypes=[ - Urn.make_entity_type_urn(entity_type) - for entity_type in structuredproperty.entity_types or [] - ], - cardinality=structuredproperty.cardinality, - immutable=structuredproperty.immutable, - allowedValues=( - [ - PropertyValueClass( - value=v.value, description=v.description - ) - for v in structuredproperty.allowed_values - ] - if structuredproperty.allowed_values - else None - ), - typeQualifier=( - { - "allowedTypes": structuredproperty.type_qualifier.allowed_types - } - if structuredproperty.type_qualifier - else None - ), - ), - ) - graph.emit_mcp(mcp) - - logger.info(f"Created structured property {structuredproperty.urn}") - - @classmethod - def from_datahub(cls, graph: DataHubGraph, urn: str) -> "StructuredProperties": - with set_graph_context(graph): - structured_property: Optional[ - StructuredPropertyDefinitionClass - ] = graph.get_aspect(urn, StructuredPropertyDefinitionClass) - if structured_property is None: - raise Exception( - "StructuredPropertyDefinition aspect is None. Unable to create structured property." - ) - return StructuredProperties( - urn=urn, - qualified_name=structured_property.qualifiedName, - display_name=structured_property.displayName, - type=structured_property.valueType, - description=structured_property.description, - entity_types=structured_property.entityTypes, - cardinality=structured_property.cardinality, - allowed_values=( + def from_yaml(file: str) -> List["StructuredProperties"]: + with open(file) as fp: + structuredproperties: List[dict] = yaml.safe_load(fp) + + result: List[StructuredProperties] = [] + for structuredproperty_raw in structuredproperties: + result.append(StructuredProperties.parse_obj(structuredproperty_raw)) + return result + + def generate_mcps(self) -> List[MetadataChangeProposalWrapper]: + mcp = MetadataChangeProposalWrapper( + entityUrn=self.urn, + aspect=StructuredPropertyDefinitionClass( + qualifiedName=self.fqn, + valueType=Urn.make_data_type_urn(self.type), + displayName=self.display_name, + description=self.description, + entityTypes=[ + Urn.make_entity_type_urn(entity_type) + for entity_type in self.entity_types or [] + ], + cardinality=self.cardinality, + immutable=self.immutable, + allowedValues=( [ - AllowedValue( - value=av.value, - description=av.description, - ) - for av in structured_property.allowedValues or [] + PropertyValueClass(value=v.value, description=v.description) + for v in self.allowed_values ] - if structured_property.allowedValues is not None + if self.allowed_values else None ), - type_qualifier=( - { - "allowed_types": structured_property.typeQualifier.get( - "allowedTypes" - ) - } - if structured_property.typeQualifier + typeQualifier=( + {"allowedTypes": self.type_qualifier.allowed_types} + if self.type_qualifier else None ), + ), + ) + return [mcp] + + @staticmethod + def create(file: str, graph: DataHubGraph) -> None: + # TODO: Deprecate this method. 
+ structuredproperties = StructuredProperties.from_yaml(file) + for structuredproperty in structuredproperties: + for mcp in structuredproperty.generate_mcps(): + graph.emit_mcp(mcp) + + logger.info(f"Created structured property {structuredproperty.urn}") + + @classmethod + def from_datahub(cls, graph: DataHubGraph, urn: str) -> "StructuredProperties": + structured_property: Optional[ + StructuredPropertyDefinitionClass + ] = graph.get_aspect(urn, StructuredPropertyDefinitionClass) + if structured_property is None: + raise Exception( + "StructuredPropertyDefinition aspect is None. Unable to create structured property." ) + return StructuredProperties( + urn=urn, + qualified_name=structured_property.qualifiedName, + display_name=structured_property.displayName, + type=structured_property.valueType, + description=structured_property.description, + entity_types=structured_property.entityTypes, + cardinality=structured_property.cardinality, + allowed_values=( + [ + AllowedValue( + value=av.value, + description=av.description, + ) + for av in structured_property.allowedValues or [] + ] + if structured_property.allowedValues is not None + else None + ), + type_qualifier=( + {"allowed_types": structured_property.typeQualifier.get("allowedTypes")} + if structured_property.typeQualifier + else None + ), + ) def to_yaml( self, diff --git a/metadata-ingestion/src/datahub/cli/specific/structuredproperties_cli.py b/metadata-ingestion/src/datahub/cli/specific/structuredproperties_cli.py index 4162d44b9b0ea8..42285cf13a5ddc 100644 --- a/metadata-ingestion/src/datahub/cli/specific/structuredproperties_cli.py +++ b/metadata-ingestion/src/datahub/cli/specific/structuredproperties_cli.py @@ -31,7 +31,8 @@ def properties() -> None: def upsert(file: Path) -> None: """Upsert structured properties in DataHub.""" - StructuredProperties.create(str(file)) + with get_default_graph() as graph: + StructuredProperties.create(str(file), graph) @properties.command( diff --git a/metadata-ingestion/tests/unit/api/entities/structuredproperties/__init__.py b/metadata-ingestion/tests/unit/api/entities/structuredproperties/__init__.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/metadata-ingestion/tests/unit/api/entities/structuredproperties/example_structured_properties_golden.json b/metadata-ingestion/tests/unit/api/entities/structuredproperties/example_structured_properties_golden.json new file mode 100644 index 00000000000000..29386ece7b0ca1 --- /dev/null +++ b/metadata-ingestion/tests/unit/api/entities/structuredproperties/example_structured_properties_golden.json @@ -0,0 +1,194 @@ +[ +{ + "entityType": "structuredProperty", + "entityUrn": "urn:li:structuredProperty:io.acryl.privacy.retentionTime", + "changeType": "UPSERT", + "aspectName": "propertyDefinition", + "aspect": { + "json": { + "qualifiedName": "io.acryl.privacy.retentionTime", + "displayName": "Retention Time", + "valueType": "urn:li:dataType:datahub.number", + "allowedValues": [ + { + "value": { + "string": "30" + }, + "description": "30 days, usually reserved for datasets that are ephemeral and contain pii" + }, + { + "value": { + "string": "90" + }, + "description": "Use this for datasets that drive monthly reporting but contain pii" + }, + { + "value": { + "string": "365" + }, + "description": "Use this for non-sensitive data that can be retained for longer" + } + ], + "cardinality": "MULTIPLE", + "entityTypes": [ + "urn:li:entityType:datahub.dataset", + "urn:li:entityType:datahub.dataFlow" + ], + "description": "Retention Time is 
used to figure out how long to retain records in a dataset", + "immutable": false + } + } +}, +{ + "entityType": "structuredProperty", + "entityUrn": "urn:li:structuredProperty:io.acryl.dataManagement.replicationSLA", + "changeType": "UPSERT", + "aspectName": "propertyDefinition", + "aspect": { + "json": { + "qualifiedName": "io.acryl.dataManagement.replicationSLA", + "displayName": "Replication SLA", + "valueType": "urn:li:dataType:datahub.number", + "cardinality": "SINGLE", + "entityTypes": [ + "urn:li:entityType:datahub.dataset" + ], + "description": "SLA for how long data can be delayed before replicating to the destination cluster", + "immutable": false + } + } +}, +{ + "entityType": "structuredProperty", + "entityUrn": "urn:li:structuredProperty:io.acryl.dataManagement.deprecationDate", + "changeType": "UPSERT", + "aspectName": "propertyDefinition", + "aspect": { + "json": { + "qualifiedName": "io.acryl.dataManagement.deprecationDate", + "displayName": "Deprecation Date", + "valueType": "urn:li:dataType:datahub.date", + "cardinality": "SINGLE", + "entityTypes": [ + "urn:li:entityType:datahub.dataset", + "urn:li:entityType:datahub.dataFlow", + "urn:li:entityType:datahub.dataJob" + ], + "immutable": false + } + } +}, +{ + "entityType": "structuredProperty", + "entityUrn": "urn:li:structuredProperty:io.acryl.dataManagement.steward", + "changeType": "UPSERT", + "aspectName": "propertyDefinition", + "aspect": { + "json": { + "qualifiedName": "io.acryl.dataManagement.steward", + "displayName": "Steward", + "valueType": "urn:li:dataType:datahub.urn", + "typeQualifier": { + "allowedTypes": [ + "urn:li:entityType:datahub.corpuser", + "urn:li:entityType:datahub.corpGroup" + ] + }, + "cardinality": "SINGLE", + "entityTypes": [ + "urn:li:entityType:datahub.dataset", + "urn:li:entityType:datahub.dataFlow", + "urn:li:entityType:datahub.dataJob" + ], + "immutable": false + } + } +}, +{ + "entityType": "structuredProperty", + "entityUrn": "urn:li:structuredProperty:io.acryl.dataManagement.certifier", + "changeType": "UPSERT", + "aspectName": "propertyDefinition", + "aspect": { + "json": { + "qualifiedName": "io.acryl.dataManagement.certifier", + "displayName": "Person Certifying the asset", + "valueType": "urn:li:dataType:datahub.urn", + "cardinality": "SINGLE", + "entityTypes": [ + "urn:li:entityType:datahub.dataset", + "urn:li:entityType:datahub.schemaField" + ], + "immutable": false + } + } +}, +{ + "entityType": "structuredProperty", + "entityUrn": "urn:li:structuredProperty:io.acryl.dataManagement.team", + "changeType": "UPSERT", + "aspectName": "propertyDefinition", + "aspect": { + "json": { + "qualifiedName": "io.acryl.dataManagement.team", + "displayName": "Management team", + "valueType": "urn:li:dataType:datahub.string", + "cardinality": "SINGLE", + "entityTypes": [ + "urn:li:entityType:datahub.dataset" + ], + "immutable": false + } + } +}, +{ + "entityType": "structuredProperty", + "entityUrn": "urn:li:structuredProperty:projectNames", + "changeType": "UPSERT", + "aspectName": "propertyDefinition", + "aspect": { + "json": { + "qualifiedName": "projectNames", + "displayName": "Project names", + "valueType": "urn:li:dataType:datahub.string", + "allowedValues": [ + { + "value": { + "string": "Tracking" + }, + "description": "test value 1 for project" + }, + { + "value": { + "string": "DataHub" + }, + "description": "test value 2 for project" + } + ], + "cardinality": "MULTIPLE", + "entityTypes": [ + "urn:li:entityType:datahub.dataset" + ], + "immutable": false + } + } +}, +{ + "entityType": 
"structuredProperty", + "entityUrn": "urn:li:structuredProperty:namespace", + "changeType": "UPSERT", + "aspectName": "propertyDefinition", + "aspect": { + "json": { + "qualifiedName": "namespace", + "displayName": "Namespace", + "valueType": "urn:li:dataType:datahub.string", + "cardinality": "SINGLE", + "entityTypes": [ + "urn:li:entityType:datahub.dataset" + ], + "immutable": false + } + } +} +] \ No newline at end of file diff --git a/metadata-ingestion/tests/unit/api/entities/structuredproperties/test_structuredproperties.py b/metadata-ingestion/tests/unit/api/entities/structuredproperties/test_structuredproperties.py new file mode 100644 index 00000000000000..e96b7c1f98437e --- /dev/null +++ b/metadata-ingestion/tests/unit/api/entities/structuredproperties/test_structuredproperties.py @@ -0,0 +1,38 @@ +import pathlib + +import pydantic +import pytest + +from datahub.api.entities.structuredproperties.structuredproperties import ( + StructuredProperties, + TypeQualifierAllowedTypes, +) +from tests.test_helpers.mce_helpers import check_goldens_stream + +RESOURCE_DIR = pathlib.Path(__file__).parent + + +def test_type_validation() -> None: + with pytest.raises(pydantic.ValidationError): + TypeQualifierAllowedTypes(allowed_types=["thisdoesnotexist"]) + + types = TypeQualifierAllowedTypes(allowed_types=["dataset"]) + assert types.allowed_types == ["urn:li:entityType:datahub.dataset"] + + +def test_structuredproperties_load(pytestconfig: pytest.Config) -> None: + example_properties_file = ( + pytestconfig.rootpath + / "examples/structured_properties/structured_properties.yaml" + ) + + properties = StructuredProperties.from_yaml(str(example_properties_file)) + mcps = [] + for property in properties: + mcps.extend(property.generate_mcps()) + + check_goldens_stream( + pytestconfig, + mcps, + golden_path=RESOURCE_DIR / "example_structured_properties_golden.json", + ) diff --git a/metadata-ingestion/tests/unit/serde/test_codegen.py b/metadata-ingestion/tests/unit/serde/test_codegen.py index 37ac35586950e1..98d62d5643ff2d 100644 --- a/metadata-ingestion/tests/unit/serde/test_codegen.py +++ b/metadata-ingestion/tests/unit/serde/test_codegen.py @@ -18,6 +18,7 @@ UpstreamClass, _Aspect, ) +from datahub.utilities.urns._urn_base import URN_TYPES _UPDATE_ENTITY_REGISTRY = os.getenv("UPDATE_ENTITY_REGISTRY", "false").lower() == "true" ENTITY_REGISTRY_PATH = pathlib.Path( @@ -165,3 +166,9 @@ def test_enum_options(): # This is mainly a sanity check to ensure that it doesn't do anything too crazy. 
env_options = get_enum_options(FabricTypeClass) assert "PROD" in env_options + + +def test_urn_types() -> None: + assert len(URN_TYPES) > 10 + for checked_type in ["dataset", "dashboard", "dataFlow", "schemaField"]: + assert checked_type in URN_TYPES From 2285436a62dcee0ab0c4e4104f5c984c9d8a7b96 Mon Sep 17 00:00:00 2001 From: Aseem Bansal Date: Wed, 18 Dec 2024 17:50:38 +0530 Subject: [PATCH 19/21] fix(ingest/gc): more logging, error handling, explicit flag (#12124) --- .../src/datahub/ingestion/api/source.py | 1 + .../datahub/ingestion/api/source_helpers.py | 2 +- .../datahub/ingestion/source/gc/datahub_gc.py | 54 +++++++++---------- .../source/gc/dataprocess_cleanup.py | 52 ++++++++++++------ .../source/gc/soft_deleted_entity_cleanup.py | 5 ++ 5 files changed, 67 insertions(+), 47 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/api/source.py b/metadata-ingestion/src/datahub/ingestion/api/source.py index c80da04e481a9f..c3638635b19aac 100644 --- a/metadata-ingestion/src/datahub/ingestion/api/source.py +++ b/metadata-ingestion/src/datahub/ingestion/api/source.py @@ -184,6 +184,7 @@ def infos(self) -> LossyList[StructuredLogEntry]: @dataclass class SourceReport(Report): + event_not_produced_warn: bool = True events_produced: int = 0 events_produced_per_sec: int = 0 diff --git a/metadata-ingestion/src/datahub/ingestion/api/source_helpers.py b/metadata-ingestion/src/datahub/ingestion/api/source_helpers.py index 0c86e1cf47203f..7791ea2797be34 100644 --- a/metadata-ingestion/src/datahub/ingestion/api/source_helpers.py +++ b/metadata-ingestion/src/datahub/ingestion/api/source_helpers.py @@ -150,7 +150,7 @@ def auto_workunit_reporter(report: "SourceReport", stream: Iterable[T]) -> Itera report.report_workunit(wu) yield wu - if report.events_produced == 0: + if report.event_not_produced_warn and report.events_produced == 0: report.warning( title="No metadata was produced by the source", message="Please check the source configuration, filters, and permissions.", diff --git a/metadata-ingestion/src/datahub/ingestion/source/gc/datahub_gc.py b/metadata-ingestion/src/datahub/ingestion/source/gc/datahub_gc.py index 814f65ecb45cf0..4eecbb4d9d7177 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/gc/datahub_gc.py +++ b/metadata-ingestion/src/datahub/ingestion/source/gc/datahub_gc.py @@ -65,18 +65,18 @@ class DataHubGcSourceConfig(ConfigModel): description="Sleep between truncation monitoring.", ) - dataprocess_cleanup: Optional[DataProcessCleanupConfig] = Field( - default=None, + dataprocess_cleanup: DataProcessCleanupConfig = Field( + default_factory=DataProcessCleanupConfig, description="Configuration for data process cleanup", ) - soft_deleted_entities_cleanup: Optional[SoftDeletedEntitiesCleanupConfig] = Field( - default=None, + soft_deleted_entities_cleanup: SoftDeletedEntitiesCleanupConfig = Field( + default_factory=SoftDeletedEntitiesCleanupConfig, description="Configuration for soft deleted entities cleanup", ) - execution_request_cleanup: Optional[DatahubExecutionRequestCleanupConfig] = Field( - default=None, + execution_request_cleanup: DatahubExecutionRequestCleanupConfig = Field( + default_factory=DatahubExecutionRequestCleanupConfig, description="Configuration for execution request cleanup", ) @@ -108,28 +108,22 @@ def __init__(self, ctx: PipelineContext, config: DataHubGcSourceConfig): self.ctx = ctx self.config = config self.report = DataHubGcSourceReport() + self.report.event_not_produced_warn = False self.graph = ctx.require_graph("The DataHubGc source") - 
self.dataprocess_cleanup: Optional[DataProcessCleanup] = None - self.soft_deleted_entities_cleanup: Optional[SoftDeletedEntitiesCleanup] = None - self.execution_request_cleanup: Optional[DatahubExecutionRequestCleanup] = None - - if self.config.dataprocess_cleanup: - self.dataprocess_cleanup = DataProcessCleanup( - ctx, self.config.dataprocess_cleanup, self.report, self.config.dry_run - ) - if self.config.soft_deleted_entities_cleanup: - self.soft_deleted_entities_cleanup = SoftDeletedEntitiesCleanup( - ctx, - self.config.soft_deleted_entities_cleanup, - self.report, - self.config.dry_run, - ) - if self.config.execution_request_cleanup: - self.execution_request_cleanup = DatahubExecutionRequestCleanup( - config=self.config.execution_request_cleanup, - graph=self.graph, - report=self.report, - ) + self.dataprocess_cleanup = DataProcessCleanup( + ctx, self.config.dataprocess_cleanup, self.report, self.config.dry_run + ) + self.soft_deleted_entities_cleanup = SoftDeletedEntitiesCleanup( + ctx, + self.config.soft_deleted_entities_cleanup, + self.report, + self.config.dry_run, + ) + self.execution_request_cleanup = DatahubExecutionRequestCleanup( + config=self.config.execution_request_cleanup, + graph=self.graph, + report=self.report, + ) @classmethod def create(cls, config_dict, ctx): @@ -153,19 +147,19 @@ def get_workunits_internal( self.truncate_indices() except Exception as e: self.report.failure("While trying to truncate indices ", exc=e) - if self.soft_deleted_entities_cleanup: + if self.config.soft_deleted_entities_cleanup.enabled: try: self.soft_deleted_entities_cleanup.cleanup_soft_deleted_entities() except Exception as e: self.report.failure( "While trying to cleanup soft deleted entities ", exc=e ) - if self.execution_request_cleanup: + if self.config.execution_request_cleanup.enabled: try: self.execution_request_cleanup.run() except Exception as e: self.report.failure("While trying to cleanup execution request ", exc=e) - if self.dataprocess_cleanup: + if self.config.dataprocess_cleanup.enabled: try: yield from self.dataprocess_cleanup.get_workunits_internal() except Exception as e: diff --git a/metadata-ingestion/src/datahub/ingestion/source/gc/dataprocess_cleanup.py b/metadata-ingestion/src/datahub/ingestion/source/gc/dataprocess_cleanup.py index 8aacf13cdb00fb..6d16aaab2d7980 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/gc/dataprocess_cleanup.py +++ b/metadata-ingestion/src/datahub/ingestion/source/gc/dataprocess_cleanup.py @@ -98,6 +98,9 @@ class DataProcessCleanupConfig(ConfigModel): + enabled: bool = Field( + default=True, description="Whether to do data process cleanup." 
+ ) retention_days: Optional[int] = Field( 10, description="Number of days to retain metadata in DataHub", @@ -371,17 +374,26 @@ def get_data_flows(self) -> Iterable[DataFlowEntity]: previous_scroll_id: Optional[str] = None while True: - result = self.ctx.graph.execute_graphql( - DATAFLOW_QUERY, - { - "query": "*", - "scrollId": scroll_id if scroll_id else None, - "batchSize": self.config.batch_size, - }, - ) + result = None + try: + result = self.ctx.graph.execute_graphql( + DATAFLOW_QUERY, + { + "query": "*", + "scrollId": scroll_id if scroll_id else None, + "batchSize": self.config.batch_size, + }, + ) + except Exception as e: + self.report.failure( + f"While trying to get dataflows with {scroll_id}", exc=e + ) + break + scrollAcrossEntities = result.get("scrollAcrossEntities") if not scrollAcrossEntities: raise ValueError("Missing scrollAcrossEntities in response") + logger.info(f"Got {scrollAcrossEntities.get('count')} DataFlow entities") scroll_id = scrollAcrossEntities.get("nextScrollId") for flow in scrollAcrossEntities.get("searchResults"): @@ -398,6 +410,8 @@ def get_data_flows(self) -> Iterable[DataFlowEntity]: previous_scroll_id = scroll_id def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: + if not self.config.enabled: + return [] assert self.ctx.graph dataFlows: Dict[str, DataFlowEntity] = {} @@ -411,14 +425,20 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: deleted_jobs: int = 0 while True: - result = self.ctx.graph.execute_graphql( - DATAJOB_QUERY, - { - "query": "*", - "scrollId": scroll_id if scroll_id else None, - "batchSize": self.config.batch_size, - }, - ) + try: + result = self.ctx.graph.execute_graphql( + DATAJOB_QUERY, + { + "query": "*", + "scrollId": scroll_id if scroll_id else None, + "batchSize": self.config.batch_size, + }, + ) + except Exception as e: + self.report.failure( + f"While trying to get data jobs with {scroll_id}", exc=e + ) + break scrollAcrossEntities = result.get("scrollAcrossEntities") if not scrollAcrossEntities: raise ValueError("Missing scrollAcrossEntities in response") diff --git a/metadata-ingestion/src/datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py b/metadata-ingestion/src/datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py index bb4ab753543b7b..93f004ab675edc 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +++ b/metadata-ingestion/src/datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py @@ -20,6 +20,9 @@ class SoftDeletedEntitiesCleanupConfig(ConfigModel): + enabled: bool = Field( + default=True, description="Whether to do soft deletion cleanup." 
+ ) retention_days: Optional[int] = Field( 10, description="Number of days to retain metadata in DataHub", @@ -156,6 +159,8 @@ def delete_soft_deleted_entity(self, urn: str) -> None: self.delete_entity(urn) def cleanup_soft_deleted_entities(self) -> None: + if not self.config.enabled: + return assert self.ctx.graph start_time = time.time() From 01a2c0c77944759c779ae06dc44198f956ab2da9 Mon Sep 17 00:00:00 2001 From: Mayuri Nehate <33225191+mayurinehate@users.noreply.github.com> Date: Wed, 18 Dec 2024 19:02:44 +0530 Subject: [PATCH 20/21] fix(ingest/kafka): update dependency, tests (#12159) --- metadata-ingestion/setup.py | 2 +- metadata-ingestion/tests/integration/kafka/test_kafka.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index 31db711592eb14..6334b3abbb8a01 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -76,7 +76,7 @@ # now provide prebuilt wheels for most platforms, including M1 Macs and # Linux aarch64 (e.g. Docker's linux/arm64). Installing confluent_kafka # from source remains a pain. - "confluent_kafka>=1.9.0", + "confluent_kafka[schemaregistry]>=1.9.0", # We currently require both Avro libraries. The codegen uses avro-python3 (above) # schema parsers at runtime for generating and reading JSON into Python objects. # At the same time, we use Kafka's AvroSerializer, which internally relies on diff --git a/metadata-ingestion/tests/integration/kafka/test_kafka.py b/metadata-ingestion/tests/integration/kafka/test_kafka.py index 0d9a714625e96b..648c4b26b20a76 100644 --- a/metadata-ingestion/tests/integration/kafka/test_kafka.py +++ b/metadata-ingestion/tests/integration/kafka/test_kafka.py @@ -102,7 +102,7 @@ def test_kafka_test_connection(mock_kafka_service, config_dict, is_success): test_connection_helpers.assert_capability_report( capability_report=report.capability_report, failure_capabilities={ - SourceCapability.SCHEMA_METADATA: "Failed to establish a new connection" + SourceCapability.SCHEMA_METADATA: "[Errno 111] Connection refused" }, ) From 8c724dbf47dd76a4aefec0a93267e08ddeda7e58 Mon Sep 17 00:00:00 2001 From: david-leifker <114954101+david-leifker@users.noreply.github.com> Date: Wed, 18 Dec 2024 12:45:38 -0600 Subject: [PATCH 21/21] feat(api): authorization extended for soft-delete and suspend (#12158) --- datahub-frontend/app/auth/AuthModule.java | 2 + .../upgrade/config/SystemUpdateConfig.java | 2 + .../restorebackup/RestoreStorageStep.java | 2 +- .../upgrade/system/AbstractMCLStep.java | 3 +- .../bootstrapmcps/BootstrapMCPUtil.java | 4 +- ...ateSchemaFieldsFromSchemaMetadataStep.java | 10 +- ...chemaFieldsFromSchemaMetadataStepTest.java | 3 +- .../aspect/CachingAspectRetriever.java | 36 +++- .../metadata/aspect/GraphRetriever.java | 23 +++ .../metadata/entity/SearchRetriever.java | 19 ++ .../metadata/aspect/MockAspectRetriever.java | 4 +- .../java/com/linkedin/metadata/Constants.java | 2 + .../ebean/batch/AspectsBatchImplTest.java | 8 +- .../aspect/utils/DefaultAspectsUtil.java | 2 +- .../client/EntityClientAspectRetriever.java | 7 +- .../metadata/client/JavaEntityClient.java | 21 ++- .../client/SystemJavaEntityClient.java | 2 +- .../entity/EntityServiceAspectRetriever.java | 10 +- .../metadata/entity/EntityServiceImpl.java | 67 +++---- .../linkedin/metadata/entity/EntityUtils.java | 2 +- .../cassandra/CassandraRetentionService.java | 2 +- .../entity/ebean/EbeanRetentionService.java | 2 +- .../query/filter/BaseQueryFilterRewriter.java | 2 +- 
.../SearchDocumentTransformer.java | 2 - .../BusinessAttributeUpdateHookService.java | 4 +- .../service/UpdateGraphIndicesService.java | 3 +- .../service/UpdateIndicesService.java | 5 +- .../metadata/AspectIngestionUtils.java | 12 +- .../hooks/IgnoreUnknownMutatorTest.java | 12 +- .../aspect/utils/DefaultAspectsUtilTest.java | 3 +- .../DataProductUnsetSideEffectTest.java | 8 +- .../entity/EbeanEntityServiceTest.java | 36 ++-- .../metadata/entity/EntityServiceTest.java | 118 ++++++------ .../cassandra/CassandraEntityServiceTest.java | 11 +- .../ebean/batch/ChangeItemImplTest.java | 4 +- .../RecommendationsServiceTest.java | 3 +- .../SchemaFieldSideEffectTest.java | 12 +- .../ContainerExpansionRewriterTest.java | 5 +- .../filter/DomainExpansionRewriterTest.java | 9 +- .../request/AggregationQueryBuilderTest.java | 9 +- .../request/SearchRequestHandlerTest.java | 1 + .../SearchDocumentTransformerTest.java | 12 ++ ...ropertyDefinitionDeleteSideEffectTest.java | 12 +- .../ShowPropertyAsBadgeValidatorTest.java | 2 +- .../io/datahubproject/test/DataGenerator.java | 5 +- .../MCLSpringCommonTestConfiguration.java | 3 +- .../hook/BusinessAttributeUpdateHookTest.java | 16 +- .../metadata/context/ActorContext.java | 48 +++++ .../metadata/context/OperationContext.java | 123 ++++++++----- .../metadata/context/RetrieverContext.java | 29 +++ .../exception/ActorAccessException.java | 7 + .../exception/OperationContextException.java | 9 + .../context/TestOperationContexts.java | 139 ++++++-------- .../context/OperationContextTest.java | 3 +- .../token/StatefulTokenService.java | 2 +- .../src/main/resources/application.yaml | 6 +- .../SystemOperationContextFactory.java | 14 +- .../IngestDataPlatformInstancesStep.java | 4 +- .../boot/steps/IngestPoliciesStep.java | 2 +- .../GlobalControllerExceptionHandler.java | 14 +- .../controller/GenericEntitiesController.java | 8 +- .../openapi/operations/test/IdController.java | 54 ++++++ .../openapi/util/MappingUtil.java | 2 +- .../v2/controller/EntityController.java | 4 +- .../v3/controller/EntityController.java | 4 +- ...m.linkedin.entity.entitiesV2.restspec.json | 8 + ...m.linkedin.entity.entitiesV2.snapshot.json | 8 + .../linkedin/entity/client/EntityClient.java | 71 ++++++- .../entity/client/RestliEntityClient.java | 13 +- .../client/SystemRestliEntityClient.java | 2 +- .../resources/entity/AspectResource.java | 2 +- .../resources/entity/EntityV2Resource.java | 10 +- .../resources/restli/RestliConstants.java | 3 + .../resources/restli/RestliUtils.java | 8 + .../resources/entity/AspectResourceTest.java | 2 +- .../tokens/revokable_access_token_test.py | 44 +---- .../tests/tokens/session_access_token_test.py | 173 ++++++++++++++++++ smoke-test/tests/tokens/token_utils.py | 53 ++++++ 78 files changed, 980 insertions(+), 431 deletions(-) create mode 100644 metadata-operation-context/src/main/java/io/datahubproject/metadata/exception/ActorAccessException.java create mode 100644 metadata-operation-context/src/main/java/io/datahubproject/metadata/exception/OperationContextException.java rename metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/{ => config}/GlobalControllerExceptionHandler.java (81%) create mode 100644 metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/operations/test/IdController.java create mode 100644 smoke-test/tests/tokens/session_access_token_test.py create mode 100644 smoke-test/tests/tokens/token_utils.py diff --git a/datahub-frontend/app/auth/AuthModule.java b/datahub-frontend/app/auth/AuthModule.java 
index 7fa99ab3cb2621..b95515684f01fc 100644 --- a/datahub-frontend/app/auth/AuthModule.java +++ b/datahub-frontend/app/auth/AuthModule.java @@ -27,6 +27,7 @@ import io.datahubproject.metadata.context.EntityRegistryContext; import io.datahubproject.metadata.context.OperationContext; import io.datahubproject.metadata.context.OperationContextConfig; +import io.datahubproject.metadata.context.RetrieverContext; import io.datahubproject.metadata.context.SearchContext; import io.datahubproject.metadata.context.ValidationContext; import java.nio.charset.StandardCharsets; @@ -195,6 +196,7 @@ protected OperationContext provideOperationContext( .searchContext(SearchContext.EMPTY) .entityRegistryContext(EntityRegistryContext.builder().build(EmptyEntityRegistry.EMPTY)) .validationContext(ValidationContext.builder().alternateValidation(false).build()) + .retrieverContext(RetrieverContext.EMPTY) .build(systemAuthentication); } diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/SystemUpdateConfig.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/SystemUpdateConfig.java index 661717c6309cfc..fdd84da6044f73 100644 --- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/SystemUpdateConfig.java +++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/SystemUpdateConfig.java @@ -13,6 +13,7 @@ import com.linkedin.gms.factory.kafka.common.TopicConventionFactory; import com.linkedin.gms.factory.kafka.schemaregistry.InternalSchemaRegistryFactory; import com.linkedin.gms.factory.search.BaseElasticSearchComponentsFactory; +import com.linkedin.metadata.aspect.CachingAspectRetriever; import com.linkedin.metadata.config.kafka.KafkaConfiguration; import com.linkedin.metadata.dao.producer.KafkaEventProducer; import com.linkedin.metadata.dao.producer.KafkaHealthChecker; @@ -186,6 +187,7 @@ protected OperationContext javaSystemOperationContext( components.getIndexConvention(), RetrieverContext.builder() .aspectRetriever(entityServiceAspectRetriever) + .cachingAspectRetriever(CachingAspectRetriever.EMPTY) .graphRetriever(systemGraphRetriever) .searchRetriever(searchServiceSearchRetriever) .build(), diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/restorebackup/RestoreStorageStep.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/restorebackup/RestoreStorageStep.java index 4d53b603c1eaff..1e5cd6cdb24174 100644 --- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/restorebackup/RestoreStorageStep.java +++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/restorebackup/RestoreStorageStep.java @@ -180,7 +180,7 @@ private void readerExecutable(ReaderWrapper reader, UpgradeContext context) { try { aspectRecord = EntityUtils.toSystemAspect( - context.opContext().getRetrieverContext().get(), aspect.toEntityAspect()) + context.opContext().getRetrieverContext(), aspect.toEntityAspect()) .get() .getRecordTemplate(); } catch (Exception e) { diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/AbstractMCLStep.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/AbstractMCLStep.java index cd7947ce3c11aa..56feffd211bcd7 100644 --- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/AbstractMCLStep.java +++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/AbstractMCLStep.java @@ -113,8 +113,7 @@ public Function executable() { List, SystemAspect>> futures; futures = EntityUtils.toSystemAspectFromEbeanAspects( - 
opContext.getRetrieverContext().get(), - batch.collect(Collectors.toList())) + opContext.getRetrieverContext(), batch.collect(Collectors.toList())) .stream() .map( systemAspect -> { diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/bootstrapmcps/BootstrapMCPUtil.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/bootstrapmcps/BootstrapMCPUtil.java index 4cc3edff3eb52d..5b807c6c450afb 100644 --- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/bootstrapmcps/BootstrapMCPUtil.java +++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/bootstrapmcps/BootstrapMCPUtil.java @@ -100,8 +100,8 @@ static AspectsBatch generateAspectBatch( .collect(Collectors.toList()); return AspectsBatchImpl.builder() - .mcps(mcps, auditStamp, opContext.getRetrieverContext().get()) - .retrieverContext(opContext.getRetrieverContext().get()) + .mcps(mcps, auditStamp, opContext.getRetrieverContext()) + .retrieverContext(opContext.getRetrieverContext()) .build(); } diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/schemafield/GenerateSchemaFieldsFromSchemaMetadataStep.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/schemafield/GenerateSchemaFieldsFromSchemaMetadataStep.java index 55bc8edbf6a768..de03538907432f 100644 --- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/schemafield/GenerateSchemaFieldsFromSchemaMetadataStep.java +++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/schemafield/GenerateSchemaFieldsFromSchemaMetadataStep.java @@ -168,13 +168,13 @@ public Function executable() { AspectsBatch aspectsBatch = AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items( batch .flatMap( ebeanAspectV2 -> EntityUtils.toSystemAspectFromEbeanAspects( - opContext.getRetrieverContext().get(), + opContext.getRetrieverContext(), Set.of(ebeanAspectV2)) .stream()) .map( @@ -189,11 +189,7 @@ public Function executable() { .auditStamp(systemAspect.getAuditStamp()) .systemMetadata( withAppSource(systemAspect.getSystemMetadata())) - .build( - opContext - .getRetrieverContext() - .get() - .getAspectRetriever())) + .build(opContext.getAspectRetriever())) .collect(Collectors.toList())) .build(); diff --git a/datahub-upgrade/src/test/java/com/linkedin/datahub/upgrade/schemafield/GenerateSchemaFieldsFromSchemaMetadataStepTest.java b/datahub-upgrade/src/test/java/com/linkedin/datahub/upgrade/schemafield/GenerateSchemaFieldsFromSchemaMetadataStepTest.java index 3a2728b4e1d3d6..04b1095e770e0e 100644 --- a/datahub-upgrade/src/test/java/com/linkedin/datahub/upgrade/schemafield/GenerateSchemaFieldsFromSchemaMetadataStepTest.java +++ b/datahub-upgrade/src/test/java/com/linkedin/datahub/upgrade/schemafield/GenerateSchemaFieldsFromSchemaMetadataStepTest.java @@ -22,7 +22,6 @@ import com.linkedin.upgrade.DataHubUpgradeState; import io.datahubproject.metadata.context.OperationContext; import io.datahubproject.metadata.context.RetrieverContext; -import java.util.Optional; import java.util.stream.Stream; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -48,7 +47,7 @@ public void setup() { step = new GenerateSchemaFieldsFromSchemaMetadataStep( mockOpContext, mockEntityService, mockAspectDao, 10, 100, 1000); - when(mockOpContext.getRetrieverContext()).thenReturn(Optional.of(mockRetrieverContext)); + 
when(mockOpContext.getRetrieverContext()).thenReturn(mockRetrieverContext); } /** Test to verify the correct step ID is returned. */ diff --git a/entity-registry/src/main/java/com/linkedin/metadata/aspect/CachingAspectRetriever.java b/entity-registry/src/main/java/com/linkedin/metadata/aspect/CachingAspectRetriever.java index 77e799f752455c..375dd8cf8911e1 100644 --- a/entity-registry/src/main/java/com/linkedin/metadata/aspect/CachingAspectRetriever.java +++ b/entity-registry/src/main/java/com/linkedin/metadata/aspect/CachingAspectRetriever.java @@ -1,4 +1,38 @@ package com.linkedin.metadata.aspect; +import com.linkedin.common.urn.Urn; +import com.linkedin.entity.Aspect; +import com.linkedin.metadata.models.registry.EmptyEntityRegistry; +import com.linkedin.metadata.models.registry.EntityRegistry; +import java.util.Collections; +import java.util.Map; +import java.util.Set; +import javax.annotation.Nonnull; + /** Responses can be cached based on application.yaml caching configuration for the EntityClient */ -public interface CachingAspectRetriever extends AspectRetriever {} +public interface CachingAspectRetriever extends AspectRetriever { + + CachingAspectRetriever EMPTY = new EmptyAspectRetriever(); + + class EmptyAspectRetriever implements CachingAspectRetriever { + @Nonnull + @Override + public Map> getLatestAspectObjects( + Set urns, Set aspectNames) { + return Collections.emptyMap(); + } + + @Nonnull + @Override + public Map> getLatestSystemAspects( + Map> urnAspectNames) { + return Collections.emptyMap(); + } + + @Nonnull + @Override + public EntityRegistry getEntityRegistry() { + return EmptyEntityRegistry.EMPTY; + } + } +} diff --git a/entity-registry/src/main/java/com/linkedin/metadata/aspect/GraphRetriever.java b/entity-registry/src/main/java/com/linkedin/metadata/aspect/GraphRetriever.java index f6858e7da4ba63..30a2c1eb9df8c1 100644 --- a/entity-registry/src/main/java/com/linkedin/metadata/aspect/GraphRetriever.java +++ b/entity-registry/src/main/java/com/linkedin/metadata/aspect/GraphRetriever.java @@ -4,6 +4,7 @@ import com.linkedin.metadata.query.filter.Filter; import com.linkedin.metadata.query.filter.RelationshipFilter; import com.linkedin.metadata.query.filter.SortCriterion; +import java.util.Collections; import java.util.List; import java.util.function.Function; import javax.annotation.Nonnull; @@ -97,4 +98,26 @@ default void consumeRelatedEntities( } } } + + GraphRetriever EMPTY = new EmptyGraphRetriever(); + + class EmptyGraphRetriever implements GraphRetriever { + + @Nonnull + @Override + public RelatedEntitiesScrollResult scrollRelatedEntities( + @Nullable List sourceTypes, + @Nonnull Filter sourceEntityFilter, + @Nullable List destinationTypes, + @Nonnull Filter destinationEntityFilter, + @Nonnull List relationshipTypes, + @Nonnull RelationshipFilter relationshipFilter, + @Nonnull List sortCriterion, + @Nullable String scrollId, + int count, + @Nullable Long startTimeMillis, + @Nullable Long endTimeMillis) { + return new RelatedEntitiesScrollResult(0, 0, null, Collections.emptyList()); + } + } } diff --git a/entity-registry/src/main/java/com/linkedin/metadata/entity/SearchRetriever.java b/entity-registry/src/main/java/com/linkedin/metadata/entity/SearchRetriever.java index eaa106b8d1f638..d4894c97015f8f 100644 --- a/entity-registry/src/main/java/com/linkedin/metadata/entity/SearchRetriever.java +++ b/entity-registry/src/main/java/com/linkedin/metadata/entity/SearchRetriever.java @@ -2,6 +2,7 @@ import com.linkedin.metadata.query.filter.Filter; import 
com.linkedin.metadata.search.ScrollResult; +import com.linkedin.metadata.search.SearchEntityArray; import java.util.List; import javax.annotation.Nonnull; import javax.annotation.Nullable; @@ -21,4 +22,22 @@ ScrollResult scroll( @Nullable Filter filters, @Nullable String scrollId, int count); + + SearchRetriever EMPTY = new EmptySearchRetriever(); + + class EmptySearchRetriever implements SearchRetriever { + + @Override + public ScrollResult scroll( + @Nonnull List entities, + @Nullable Filter filters, + @Nullable String scrollId, + int count) { + ScrollResult empty = new ScrollResult(); + empty.setEntities(new SearchEntityArray()); + empty.setNumEntities(0); + empty.setPageSize(0); + return empty; + } + } } diff --git a/entity-registry/src/testFixtures/java/com/linkedin/test/metadata/aspect/MockAspectRetriever.java b/entity-registry/src/testFixtures/java/com/linkedin/test/metadata/aspect/MockAspectRetriever.java index 65705f15022b6b..98a6d59004a92a 100644 --- a/entity-registry/src/testFixtures/java/com/linkedin/test/metadata/aspect/MockAspectRetriever.java +++ b/entity-registry/src/testFixtures/java/com/linkedin/test/metadata/aspect/MockAspectRetriever.java @@ -5,7 +5,7 @@ import com.linkedin.data.DataMap; import com.linkedin.data.template.RecordTemplate; import com.linkedin.entity.Aspect; -import com.linkedin.metadata.aspect.AspectRetriever; +import com.linkedin.metadata.aspect.CachingAspectRetriever; import com.linkedin.metadata.aspect.SystemAspect; import com.linkedin.metadata.models.registry.EntityRegistry; import com.linkedin.mxe.SystemMetadata; @@ -22,7 +22,7 @@ import javax.annotation.Nonnull; import org.mockito.Mockito; -public class MockAspectRetriever implements AspectRetriever { +public class MockAspectRetriever implements CachingAspectRetriever { private final Map> data; private final Map> systemData = new HashMap<>(); diff --git a/li-utils/src/main/java/com/linkedin/metadata/Constants.java b/li-utils/src/main/java/com/linkedin/metadata/Constants.java index ff6a79108600a3..09f873ebf7bc96 100644 --- a/li-utils/src/main/java/com/linkedin/metadata/Constants.java +++ b/li-utils/src/main/java/com/linkedin/metadata/Constants.java @@ -409,6 +409,8 @@ public class Constants { /** User Status */ public static final String CORP_USER_STATUS_ACTIVE = "ACTIVE"; + public static final String CORP_USER_STATUS_SUSPENDED = "SUSPENDED"; + /** Task Runs */ public static final String DATA_PROCESS_INSTANCE_ENTITY_NAME = "dataProcessInstance"; diff --git a/metadata-io/metadata-io-api/src/test/java/com/linkedin/metadata/entity/ebean/batch/AspectsBatchImplTest.java b/metadata-io/metadata-io-api/src/test/java/com/linkedin/metadata/entity/ebean/batch/AspectsBatchImplTest.java index 9f57d36f800de3..a3099b9ee21ea4 100644 --- a/metadata-io/metadata-io-api/src/test/java/com/linkedin/metadata/entity/ebean/batch/AspectsBatchImplTest.java +++ b/metadata-io/metadata-io-api/src/test/java/com/linkedin/metadata/entity/ebean/batch/AspectsBatchImplTest.java @@ -16,7 +16,7 @@ import com.linkedin.data.schema.annotation.PathSpecBasedSchemaAnnotationVisitor; import com.linkedin.dataset.DatasetProperties; import com.linkedin.events.metadata.ChangeType; -import com.linkedin.metadata.aspect.AspectRetriever; +import com.linkedin.metadata.aspect.CachingAspectRetriever; import com.linkedin.metadata.aspect.GraphRetriever; import com.linkedin.metadata.aspect.batch.MCPItem; import com.linkedin.metadata.aspect.patch.GenericJsonPatch; @@ -56,7 +56,7 @@ public class AspectsBatchImplTest { private EntityRegistry testRegistry; - 
private AspectRetriever mockAspectRetriever; + private CachingAspectRetriever mockAspectRetriever; private RetrieverContext retrieverContext; @BeforeTest @@ -75,12 +75,12 @@ public void beforeTest() throws EntityRegistryException { @BeforeMethod public void setup() { - this.mockAspectRetriever = mock(AspectRetriever.class); + this.mockAspectRetriever = mock(CachingAspectRetriever.class); when(this.mockAspectRetriever.getEntityRegistry()).thenReturn(testRegistry); this.retrieverContext = RetrieverContext.builder() .searchRetriever(mock(SearchRetriever.class)) - .aspectRetriever(mockAspectRetriever) + .cachingAspectRetriever(mockAspectRetriever) .graphRetriever(mock(GraphRetriever.class)) .build(); } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/aspect/utils/DefaultAspectsUtil.java b/metadata-io/src/main/java/com/linkedin/metadata/aspect/utils/DefaultAspectsUtil.java index 99eadd223acd1a..82bc0ae1409c52 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/aspect/utils/DefaultAspectsUtil.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/aspect/utils/DefaultAspectsUtil.java @@ -137,7 +137,7 @@ public static List getAdditionalChanges( getProposalFromAspectForDefault( entry.getKey(), entry.getValue(), entityKeyAspect, templateItem), templateItem.getAuditStamp(), - opContext.getAspectRetrieverOpt().get())) + opContext.getAspectRetriever())) .filter(Objects::nonNull); }) .collect(Collectors.toList()); diff --git a/metadata-io/src/main/java/com/linkedin/metadata/client/EntityClientAspectRetriever.java b/metadata-io/src/main/java/com/linkedin/metadata/client/EntityClientAspectRetriever.java index bba8324d0c5612..669ec751f87c69 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/client/EntityClientAspectRetriever.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/client/EntityClientAspectRetriever.java @@ -35,7 +35,7 @@ public EntityRegistry getEntityRegistry() { @Override public Aspect getLatestAspectObject(@Nonnull Urn urn, @Nonnull String aspectName) { try { - return entityClient.getLatestAspectObject(systemOperationContext, urn, aspectName); + return entityClient.getLatestAspectObject(systemOperationContext, urn, aspectName, false); } catch (RemoteInvocationException | URISyntaxException e) { throw new RuntimeException(e); } @@ -49,7 +49,7 @@ public Map> getLatestAspectObjects( return Map.of(); } else { try { - return entityClient.getLatestAspects(systemOperationContext, urns, aspectNames); + return entityClient.getLatestAspects(systemOperationContext, urns, aspectNames, false); } catch (RemoteInvocationException | URISyntaxException e) { throw new RuntimeException(e); } @@ -70,7 +70,8 @@ public Map> getLatestSystemAspects( urnAspectNames.keySet(), urnAspectNames.values().stream() .flatMap(Collection::stream) - .collect(Collectors.toSet())); + .collect(Collectors.toSet()), + false); } catch (RemoteInvocationException | URISyntaxException e) { throw new RuntimeException(e); } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/client/JavaEntityClient.java b/metadata-io/src/main/java/com/linkedin/metadata/client/JavaEntityClient.java index 29faa3955ea662..3d35f5956b0f4f 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/client/JavaEntityClient.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/client/JavaEntityClient.java @@ -106,11 +106,17 @@ public EntityResponse getV2( @Nonnull OperationContext opContext, @Nonnull String entityName, @Nonnull final Urn urn, - @Nullable final Set aspectNames) + @Nullable final Set 
aspectNames, + @Nullable Boolean alwaysIncludeKeyAspect) throws RemoteInvocationException, URISyntaxException { final Set projectedAspects = aspectNames == null ? opContext.getEntityAspectNames(entityName) : aspectNames; - return entityService.getEntityV2(opContext, entityName, urn, projectedAspects); + return entityService.getEntityV2( + opContext, + entityName, + urn, + projectedAspects, + alwaysIncludeKeyAspect == null || alwaysIncludeKeyAspect); } @Override @@ -126,7 +132,8 @@ public Map batchGetV2( @Nonnull OperationContext opContext, @Nonnull String entityName, @Nonnull Set urns, - @Nullable Set aspectNames) + @Nullable Set aspectNames, + @Nullable Boolean alwaysIncludeKeyAspect) throws RemoteInvocationException, URISyntaxException { final Set projectedAspects = aspectNames == null ? opContext.getEntityAspectNames(entityName) : aspectNames; @@ -139,7 +146,11 @@ public Map batchGetV2( try { responseMap.putAll( entityService.getEntitiesV2( - opContext, entityName, new HashSet<>(batch), projectedAspects)); + opContext, + entityName, + new HashSet<>(batch), + projectedAspects, + alwaysIncludeKeyAspect == null || alwaysIncludeKeyAspect)); } catch (URISyntaxException e) { throw new RuntimeException(e); } @@ -772,7 +783,7 @@ public List batchIngestProposals( .mcps( batch, auditStamp, - opContext.getRetrieverContext().get(), + opContext.getRetrieverContext(), opContext.getValidationContext().isAlternateValidation()) .build(); diff --git a/metadata-io/src/main/java/com/linkedin/metadata/client/SystemJavaEntityClient.java b/metadata-io/src/main/java/com/linkedin/metadata/client/SystemJavaEntityClient.java index eda9b3a880228f..1d2fd422d7f460 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/client/SystemJavaEntityClient.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/client/SystemJavaEntityClient.java @@ -89,6 +89,6 @@ public Map batchGetV2NoCache( @Nonnull Set urns, @Nullable Set aspectNames) throws RemoteInvocationException, URISyntaxException { - return super.batchGetV2(opContext, entityName, urns, aspectNames); + return super.batchGetV2(opContext, entityName, urns, aspectNames, false); } } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityServiceAspectRetriever.java b/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityServiceAspectRetriever.java index 626a1f72f5fb73..50cf8af30d606a 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityServiceAspectRetriever.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityServiceAspectRetriever.java @@ -5,7 +5,7 @@ import com.linkedin.common.urn.Urn; import com.linkedin.entity.Aspect; -import com.linkedin.metadata.aspect.CachingAspectRetriever; +import com.linkedin.metadata.aspect.AspectRetriever; import com.linkedin.metadata.aspect.SystemAspect; import com.linkedin.metadata.models.registry.EntityRegistry; import io.datahubproject.metadata.context.OperationContext; @@ -22,7 +22,7 @@ @Getter @Builder -public class EntityServiceAspectRetriever implements CachingAspectRetriever { +public class EntityServiceAspectRetriever implements AspectRetriever { @Setter private OperationContext systemOperationContext; private final EntityRegistry entityRegistry; @@ -46,7 +46,8 @@ public Map> getLatestAspectObjects( String entityName = urns.stream().findFirst().map(Urn::getEntityType).get(); try { return entityResponseToAspectMap( - entityService.getEntitiesV2(systemOperationContext, entityName, urns, aspectNames)); + entityService.getEntitiesV2( + systemOperationContext, 
entityName, urns, aspectNames, false)); } catch (URISyntaxException e) { throw new RuntimeException(e); } @@ -71,7 +72,8 @@ public Map> getLatestSystemAspects( urnAspectNames.keySet(), urnAspectNames.values().stream() .flatMap(Collection::stream) - .collect(Collectors.toSet())), + .collect(Collectors.toSet()), + false), entityRegistry); } catch (URISyntaxException e) { throw new RuntimeException(e); diff --git a/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityServiceImpl.java b/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityServiceImpl.java index 6de7784bfbc0ec..8ae09111204cab 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityServiceImpl.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityServiceImpl.java @@ -261,8 +261,7 @@ public Map> getLatestAspects( } List systemAspects = - EntityUtils.toSystemAspects( - opContext.getRetrieverContext().get(), batchGetResults.values()); + EntityUtils.toSystemAspects(opContext.getRetrieverContext(), batchGetResults.values()); systemAspects.stream() // for now, don't add the key aspect here we have already added it above @@ -290,8 +289,7 @@ public Map getLatestAspectsForUrn( Map batchGetResults = getLatestAspect(opContext, new HashSet<>(Arrays.asList(urn)), aspectNames, forUpdate); - return EntityUtils.toSystemAspects( - opContext.getRetrieverContext().get(), batchGetResults.values()) + return EntityUtils.toSystemAspects(opContext.getRetrieverContext(), batchGetResults.values()) .stream() .map( systemAspect -> Pair.of(systemAspect.getAspectName(), systemAspect.getRecordTemplate())) @@ -335,7 +333,7 @@ public Pair getAspectVersionPair( final Optional maybeAspect = Optional.ofNullable(aspectDao.getAspect(primaryKey)); return Pair.of( - EntityUtils.toSystemAspect(opContext.getRetrieverContext().get(), maybeAspect.orElse(null)) + EntityUtils.toSystemAspect(opContext.getRetrieverContext(), maybeAspect.orElse(null)) .map(SystemAspect::getRecordTemplate) .orElse(null), version); @@ -721,7 +719,7 @@ public ListResult listLatestAspects( } return new ListResult<>( - EntityUtils.toSystemAspects(opContext.getRetrieverContext().get(), entityAspects).stream() + EntityUtils.toSystemAspects(opContext.getRetrieverContext(), entityAspects).stream() .map(SystemAspect::getRecordTemplate) .collect(Collectors.toList()), aspectMetadataList.getMetadata(), @@ -758,12 +756,12 @@ public List ingestAspects( .recordTemplate(pair.getValue()) .systemMetadata(systemMetadata) .auditStamp(auditStamp) - .build(opContext.getAspectRetrieverOpt().get())) + .build(opContext.getAspectRetriever())) .collect(Collectors.toList()); return ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, @@ -815,13 +813,13 @@ private void processPostCommitMCLSideEffects( log.debug("Considering {} MCLs post commit side effects.", mcls.size()); List batch = mcls.stream() - .map(mcl -> MCLItemImpl.builder().build(mcl, opContext.getAspectRetrieverOpt().get())) + .map(mcl -> MCLItemImpl.builder().build(mcl, opContext.getAspectRetriever())) .collect(Collectors.toList()); Iterable> iterable = () -> Iterators.partition( - AspectsBatch.applyPostMCPSideEffects(batch, opContext.getRetrieverContext().get()) + AspectsBatch.applyPostMCPSideEffects(batch, opContext.getRetrieverContext()) .iterator(), MCP_SIDE_EFFECT_KAFKA_BATCH_SIZE); StreamSupport.stream(iterable.spliterator(), false) @@ -831,7 +829,7 @@ private 
void processPostCommitMCLSideEffects( ingestProposalAsync( AspectsBatchImpl.builder() .items(sideEffects) - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .build()) .count(); log.info("Generated {} MCP SideEffects for async processing", count); @@ -879,8 +877,7 @@ private List ingestAspectsToLocalDB( aspectDao.getLatestAspects(urnAspects, true); final Map> batchAspects = - EntityUtils.toSystemAspects( - opContext.getRetrieverContext().get(), databaseAspects); + EntityUtils.toSystemAspects(opContext.getRetrieverContext(), databaseAspects); // read #2 (potentially) final Map> nextVersions = @@ -903,7 +900,7 @@ private List ingestAspectsToLocalDB( Map> newLatestAspects = EntityUtils.toSystemAspects( - opContext.getRetrieverContext().get(), + opContext.getRetrieverContext(), aspectDao.getLatestAspects(updatedItems.getFirst(), true)); // merge updatedLatestAspects = AspectsBatch.merge(batchAspects, newLatestAspects); @@ -941,7 +938,7 @@ private List ingestAspectsToLocalDB( // do final pre-commit checks with previous aspect value ValidationExceptionCollection exceptions = - AspectsBatch.validatePreCommit(changeMCPs, opContext.getRetrieverContext().get()); + AspectsBatch.validatePreCommit(changeMCPs, opContext.getRetrieverContext()); if (exceptions.hasFatalExceptions()) { // IF this is a client request/API request we fail the `transaction batch` @@ -1143,8 +1140,8 @@ public RecordTemplate ingestAspectIfNotPresent( .recordTemplate(newValue) .systemMetadata(systemMetadata) .auditStamp(auditStamp) - .build(opContext.getAspectRetrieverOpt().get()), - opContext.getRetrieverContext().get()) + .build(opContext.getAspectRetriever()), + opContext.getRetrieverContext()) .build(); List ingested = ingestAspects(opContext, aspectsBatch, true, false); @@ -1169,7 +1166,7 @@ public IngestResult ingestProposal( return ingestProposal( opContext, AspectsBatchImpl.builder() - .mcps(List.of(proposal), auditStamp, opContext.getRetrieverContext().get()) + .mcps(List.of(proposal), auditStamp, opContext.getRetrieverContext()) .build(), async) .stream() @@ -1246,7 +1243,7 @@ private Stream ingestTimeseriesProposal( .recordTemplate( EntityApiUtils.buildKeyAspect( opContext.getEntityRegistry(), item.getUrn())) - .build(opContext.getAspectRetrieverOpt().get())) + .build(opContext.getAspectRetriever())) .collect(Collectors.toList()); ingestProposalSync( @@ -1469,7 +1466,7 @@ public List restoreIndices( List systemAspects = EntityUtils.toSystemAspectFromEbeanAspects( - opContext.getRetrieverContext().get(), batch.collect(Collectors.toList())); + opContext.getRetrieverContext(), batch.collect(Collectors.toList())); RestoreIndicesResult result = restoreIndices(opContext, systemAspects, logger); result.timeSqlQueryMs = timeSqlQueryMs; @@ -1513,7 +1510,7 @@ public List restoreIndices( long startTime = System.currentTimeMillis(); List systemAspects = EntityUtils.toSystemAspects( - opContext.getRetrieverContext().get(), + opContext.getRetrieverContext(), getLatestAspect(opContext, entityBatch.getValue(), aspectNames, false).values()); long timeSqlQueryMs = System.currentTimeMillis() - startTime; @@ -1649,12 +1646,12 @@ private RestoreIndicesResult restoreIndices( .auditStamp(auditStamp) .systemMetadata(latestSystemMetadata) .recordTemplate(EntityApiUtils.buildKeyAspect(opContext.getEntityRegistry(), urn)) - .build(opContext.getAspectRetrieverOpt().get())); + .build(opContext.getAspectRetriever())); Stream defaultAspectsResult = ingestProposalSync( opContext, 
AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(keyAspect) .build()); defaultAspectsCreated += defaultAspectsResult.count(); @@ -1966,7 +1963,7 @@ private void ingestSnapshotUnion( AspectsBatchImpl aspectsBatch = AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items( aspectRecordsToIngest.stream() .map( @@ -1977,7 +1974,7 @@ private void ingestSnapshotUnion( .recordTemplate(pair.getValue()) .auditStamp(auditStamp) .systemMetadata(systemMetadata) - .build(opContext.getAspectRetrieverOpt().get())) + .build(opContext.getAspectRetriever())) .collect(Collectors.toList())) .build(); @@ -2128,7 +2125,7 @@ public RollbackRunResult deleteUrn(@Nonnull OperationContext opContext, Urn urn) } SystemMetadata latestKeySystemMetadata = - EntityUtils.toSystemAspect(opContext.getRetrieverContext().get(), latestKey) + EntityUtils.toSystemAspect(opContext.getRetrieverContext(), latestKey) .map(SystemAspect::getSystemMetadata) .get(); RollbackResult result = @@ -2253,11 +2250,11 @@ private RollbackResult deleteAspectWithoutMCL( .urn(entityUrn) .aspectName(aspectName) .auditStamp(auditStamp) - .build(opContext.getAspectRetrieverOpt().get()); + .build(opContext.getAspectRetriever()); // Delete validation hooks ValidationExceptionCollection exceptions = - AspectsBatch.validateProposed(List.of(deleteItem), opContext.getRetrieverContext().get()); + AspectsBatch.validateProposed(List.of(deleteItem), opContext.getRetrieverContext()); if (!exceptions.isEmpty()) { throw new ValidationException(collectMetrics(exceptions).toString()); } @@ -2271,7 +2268,7 @@ private RollbackResult deleteAspectWithoutMCL( final EntityAspect.EntitySystemAspect latest = (EntityAspect.EntitySystemAspect) EntityUtils.toSystemAspect( - opContext.getRetrieverContext().get(), + opContext.getRetrieverContext(), aspectDao.getLatestAspect(urn, aspectName, false)) .orElse(null); @@ -2299,7 +2296,7 @@ private RollbackResult deleteAspectWithoutMCL( EntityAspect.EntitySystemAspect candidateAspect = (EntityAspect.EntitySystemAspect) EntityUtils.toSystemAspect( - opContext.getRetrieverContext().get(), + opContext.getRetrieverContext(), aspectDao.getAspect(urn, aspectName, maxVersion)) .orElse(null); SystemMetadata previousSysMetadata = @@ -2325,13 +2322,9 @@ private RollbackResult deleteAspectWithoutMCL( .urn(UrnUtils.getUrn(toDelete.getUrn())) .aspectName(toDelete.getAspect()) .auditStamp(auditStamp) - .build( - opContext - .getRetrieverContext() - .get() - .getAspectRetriever())) + .build(opContext.getAspectRetriever())) .collect(Collectors.toList()), - opContext.getRetrieverContext().get()); + opContext.getRetrieverContext()); if (!preCommitExceptions.isEmpty()) { throw new ValidationException(collectMetrics(preCommitExceptions).toString()); } @@ -2509,7 +2502,7 @@ private Map getEnvelopedAspects( final Map dbEntries = aspectDao.batchGet(dbKeys, false); List envelopedAspects = - EntityUtils.toSystemAspects(opContext.getRetrieverContext().get(), dbEntries.values()); + EntityUtils.toSystemAspects(opContext.getRetrieverContext(), dbEntries.values()); return envelopedAspects.stream() .collect( diff --git a/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityUtils.java b/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityUtils.java index 3c4109970e9d0b..da48a2b76d6d56 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityUtils.java 
+++ b/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityUtils.java @@ -72,7 +72,7 @@ public static void ingestChangeProposals( entityService.ingestProposal( opContext, AspectsBatchImpl.builder() - .mcps(changes, getAuditStamp(actor), opContext.getRetrieverContext().get()) + .mcps(changes, getAuditStamp(actor), opContext.getRetrieverContext()) .build(), async); } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/entity/cassandra/CassandraRetentionService.java b/metadata-io/src/main/java/com/linkedin/metadata/entity/cassandra/CassandraRetentionService.java index ccc1910ba5cdbd..c595e3e07b8342 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/entity/cassandra/CassandraRetentionService.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/entity/cassandra/CassandraRetentionService.java @@ -64,7 +64,7 @@ protected AspectsBatch buildAspectsBatch( List mcps, @Nonnull AuditStamp auditStamp) { return AspectsBatchImpl.builder() - .mcps(mcps, auditStamp, opContext.getRetrieverContext().get()) + .mcps(mcps, auditStamp, opContext.getRetrieverContext()) .build(); } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/entity/ebean/EbeanRetentionService.java b/metadata-io/src/main/java/com/linkedin/metadata/entity/ebean/EbeanRetentionService.java index 49fa555e006f61..74d0d8b0964de0 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/entity/ebean/EbeanRetentionService.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/entity/ebean/EbeanRetentionService.java @@ -59,7 +59,7 @@ protected AspectsBatch buildAspectsBatch( List mcps, @Nonnull AuditStamp auditStamp) { return AspectsBatchImpl.builder() - .mcps(mcps, auditStamp, opContext.getRetrieverContext().get()) + .mcps(mcps, auditStamp, opContext.getRetrieverContext()) .build(); } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/filter/BaseQueryFilterRewriter.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/filter/BaseQueryFilterRewriter.java index 367705d369c7ce..6c5c6243d33620 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/filter/BaseQueryFilterRewriter.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/filter/BaseQueryFilterRewriter.java @@ -143,7 +143,7 @@ private static QueryBuilder expandTerms( if (!queryUrns.isEmpty()) { scrollGraph( - opContext.getRetrieverContext().get().getGraphRetriever(), + opContext.getRetrieverContext().getGraphRetriever(), queryUrns, relationshipTypes, relationshipDirection, diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/transformer/SearchDocumentTransformer.java b/metadata-io/src/main/java/com/linkedin/metadata/search/transformer/SearchDocumentTransformer.java index 4bb8e0630de480..b4ad847cb7afc2 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/transformer/SearchDocumentTransformer.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/transformer/SearchDocumentTransformer.java @@ -437,8 +437,6 @@ private void setStructuredPropertiesSearchValue( Map> definitions = opContext - .getRetrieverContext() - .get() .getAspectRetriever() .getLatestAspectObjects( propertyMap.keySet(), Set.of(STRUCTURED_PROPERTY_DEFINITION_ASPECT_NAME)); diff --git a/metadata-io/src/main/java/com/linkedin/metadata/service/BusinessAttributeUpdateHookService.java b/metadata-io/src/main/java/com/linkedin/metadata/service/BusinessAttributeUpdateHookService.java index ad2825ead3d0da..4a692e95346222 
100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/service/BusinessAttributeUpdateHookService.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/service/BusinessAttributeUpdateHookService.java @@ -112,7 +112,7 @@ private void fetchRelatedEntities( @Nullable String scrollId, int consumedEntityCount, int batchNumber) { - GraphRetriever graph = opContext.getRetrieverContext().get().getGraphRetriever(); + GraphRetriever graph = opContext.getRetrieverContext().getGraphRetriever(); final ArrayList> futureList = new ArrayList<>(); RelatedEntitiesScrollResult result = graph.scrollRelatedEntities( @@ -165,7 +165,7 @@ private Callable processBatch( return () -> { StopWatch stopWatch = new StopWatch(); stopWatch.start(); - AspectRetriever aspectRetriever = opContext.getAspectRetrieverOpt().get(); + AspectRetriever aspectRetriever = opContext.getAspectRetriever(); log.info("Batch {} for BA:{} started", batchNumber, entityKey); ExecutionResult executionResult = new ExecutionResult(); executionResult.setBatchNumber(batchNumber); diff --git a/metadata-io/src/main/java/com/linkedin/metadata/service/UpdateGraphIndicesService.java b/metadata-io/src/main/java/com/linkedin/metadata/service/UpdateGraphIndicesService.java index efe073fc00dfdc..4b09bc00efb61a 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/service/UpdateGraphIndicesService.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/service/UpdateGraphIndicesService.java @@ -94,8 +94,7 @@ public UpdateGraphIndicesService( public void handleChangeEvent( @Nonnull OperationContext opContext, @Nonnull final MetadataChangeLog event) { try { - MCLItemImpl mclItem = - MCLItemImpl.builder().build(event, opContext.getAspectRetrieverOpt().get()); + MCLItemImpl mclItem = MCLItemImpl.builder().build(event, opContext.getAspectRetriever()); if (UPDATE_CHANGE_TYPES.contains(event.getChangeType())) { handleUpdateChangeEvent(opContext, mclItem); diff --git a/metadata-io/src/main/java/com/linkedin/metadata/service/UpdateIndicesService.java b/metadata-io/src/main/java/com/linkedin/metadata/service/UpdateIndicesService.java index 187ef3e8c62290..c5fc9ebdac9fa6 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/service/UpdateIndicesService.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/service/UpdateIndicesService.java @@ -121,11 +121,10 @@ public UpdateIndicesService( public void handleChangeEvent( @Nonnull OperationContext opContext, @Nonnull final MetadataChangeLog event) { try { - MCLItemImpl batch = - MCLItemImpl.builder().build(event, opContext.getAspectRetrieverOpt().get()); + MCLItemImpl batch = MCLItemImpl.builder().build(event, opContext.getAspectRetriever()); Stream sideEffects = - AspectsBatch.applyMCLSideEffects(List.of(batch), opContext.getRetrieverContext().get()); + AspectsBatch.applyMCLSideEffects(List.of(batch), opContext.getRetrieverContext()); for (MCLItem mclItem : Stream.concat(Stream.of(batch), sideEffects).collect(Collectors.toList())) { diff --git a/metadata-io/src/test/java/com/linkedin/metadata/AspectIngestionUtils.java b/metadata-io/src/test/java/com/linkedin/metadata/AspectIngestionUtils.java index 12b12cf105196e..fa6ab7932001b6 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/AspectIngestionUtils.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/AspectIngestionUtils.java @@ -46,12 +46,12 @@ public static Map ingestCorpUserKeyAspects( .recordTemplate(aspect) .auditStamp(AspectGenerationUtils.createAuditStamp()) 
.systemMetadata(AspectGenerationUtils.createSystemMetadata()) - .build(opContext.getAspectRetrieverOpt().get())); + .build(opContext.getAspectRetriever())); } entityService.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, @@ -83,12 +83,12 @@ public static Map ingestCorpUserInfoAspects( .recordTemplate(aspect) .auditStamp(AspectGenerationUtils.createAuditStamp()) .systemMetadata(AspectGenerationUtils.createSystemMetadata()) - .build(opContext.getAspectRetrieverOpt().get())); + .build(opContext.getAspectRetriever())); } entityService.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, @@ -121,12 +121,12 @@ public static Map ingestChartInfoAspects( .recordTemplate(aspect) .auditStamp(AspectGenerationUtils.createAuditStamp()) .systemMetadata(AspectGenerationUtils.createSystemMetadata()) - .build(opContext.getAspectRetrieverOpt().get())); + .build(opContext.getAspectRetriever())); } entityService.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, diff --git a/metadata-io/src/test/java/com/linkedin/metadata/aspect/hooks/IgnoreUnknownMutatorTest.java b/metadata-io/src/test/java/com/linkedin/metadata/aspect/hooks/IgnoreUnknownMutatorTest.java index 11a3153abcaeed..19be1eb14667d8 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/aspect/hooks/IgnoreUnknownMutatorTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/aspect/hooks/IgnoreUnknownMutatorTest.java @@ -16,7 +16,8 @@ import com.linkedin.data.template.StringMap; import com.linkedin.dataset.DatasetProperties; import com.linkedin.events.metadata.ChangeType; -import com.linkedin.metadata.aspect.AspectRetriever; +import com.linkedin.metadata.aspect.CachingAspectRetriever; +import com.linkedin.metadata.aspect.GraphRetriever; import com.linkedin.metadata.aspect.batch.MCPItem; import com.linkedin.metadata.aspect.plugins.config.AspectPluginConfig; import com.linkedin.metadata.entity.SearchRetriever; @@ -28,7 +29,6 @@ import com.linkedin.mxe.SystemMetadata; import com.linkedin.test.metadata.aspect.TestEntityRegistry; import io.datahubproject.metadata.context.RetrieverContext; -import io.datahubproject.test.metadata.context.TestOperationContexts; import java.net.URISyntaxException; import java.nio.charset.StandardCharsets; import java.util.List; @@ -53,17 +53,17 @@ public class IgnoreUnknownMutatorTest { private static final Urn TEST_DATASET_URN = UrnUtils.getUrn( "urn:li:dataset:(urn:li:dataPlatform:postgres,calm-pagoda-323403.jaffle_shop.customers,PROD)"); - private AspectRetriever mockAspectRetriever; + private CachingAspectRetriever mockAspectRetriever; private RetrieverContext retrieverContext; @BeforeMethod public void setup() { - mockAspectRetriever = mock(AspectRetriever.class); + mockAspectRetriever = mock(CachingAspectRetriever.class); retrieverContext = RetrieverContext.builder() .searchRetriever(mock(SearchRetriever.class)) - .aspectRetriever(mockAspectRetriever) - .graphRetriever(TestOperationContexts.emptyGraphRetriever) + .cachingAspectRetriever(mockAspectRetriever) + .graphRetriever(GraphRetriever.EMPTY) .build(); } diff --git 
a/metadata-io/src/test/java/com/linkedin/metadata/aspect/utils/DefaultAspectsUtilTest.java b/metadata-io/src/test/java/com/linkedin/metadata/aspect/utils/DefaultAspectsUtilTest.java index 04aff4edf456d9..e7ed2671131592 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/aspect/utils/DefaultAspectsUtilTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/aspect/utils/DefaultAspectsUtilTest.java @@ -56,8 +56,7 @@ public void testAdditionalChanges() { DefaultAspectsUtil.getAdditionalChanges( opContext, AspectsBatchImpl.builder() - .mcps( - List.of(proposal1), new AuditStamp(), opContext.getRetrieverContext().get()) + .mcps(List.of(proposal1), new AuditStamp(), opContext.getRetrieverContext()) .build() .getMCPItems(), entityServiceImpl, diff --git a/metadata-io/src/test/java/com/linkedin/metadata/dataproducts/sideeffects/DataProductUnsetSideEffectTest.java b/metadata-io/src/test/java/com/linkedin/metadata/dataproducts/sideeffects/DataProductUnsetSideEffectTest.java index 976b165fea53df..215e1e2431efa0 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/dataproducts/sideeffects/DataProductUnsetSideEffectTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/dataproducts/sideeffects/DataProductUnsetSideEffectTest.java @@ -15,7 +15,7 @@ import com.linkedin.dataproduct.DataProductAssociationArray; import com.linkedin.dataproduct.DataProductProperties; import com.linkedin.events.metadata.ChangeType; -import com.linkedin.metadata.aspect.AspectRetriever; +import com.linkedin.metadata.aspect.CachingAspectRetriever; import com.linkedin.metadata.aspect.GraphRetriever; import com.linkedin.metadata.aspect.SystemAspect; import com.linkedin.metadata.aspect.batch.MCPItem; @@ -75,12 +75,12 @@ public class DataProductUnsetSideEffectTest { .build())) .build(); - private AspectRetriever mockAspectRetriever; + private CachingAspectRetriever mockAspectRetriever; private RetrieverContext retrieverContext; @BeforeMethod public void setup() { - mockAspectRetriever = mock(AspectRetriever.class); + mockAspectRetriever = mock(CachingAspectRetriever.class); when(mockAspectRetriever.getEntityRegistry()).thenReturn(TEST_REGISTRY); GraphRetriever graphRetriever = mock(GraphRetriever.class); RelatedEntities relatedEntities = @@ -139,7 +139,7 @@ public void setup() { retrieverContext = RetrieverContext.builder() .searchRetriever(mock(SearchRetriever.class)) - .aspectRetriever(mockAspectRetriever) + .cachingAspectRetriever(mockAspectRetriever) .graphRetriever(graphRetriever) .build(); } diff --git a/metadata-io/src/test/java/com/linkedin/metadata/entity/EbeanEntityServiceTest.java b/metadata-io/src/test/java/com/linkedin/metadata/entity/EbeanEntityServiceTest.java index 0386031cbcad86..88f84ee94c8ee7 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/entity/EbeanEntityServiceTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/entity/EbeanEntityServiceTest.java @@ -19,6 +19,7 @@ import com.linkedin.metadata.AspectGenerationUtils; import com.linkedin.metadata.Constants; import com.linkedin.metadata.EbeanTestUtils; +import com.linkedin.metadata.aspect.GraphRetriever; import com.linkedin.metadata.config.EbeanConfiguration; import com.linkedin.metadata.config.PreProcessHooks; import com.linkedin.metadata.entity.ebean.EbeanAspectDao; @@ -98,12 +99,15 @@ public void setupTest() { .entityService(_entityServiceImpl) .entityRegistry(_testEntityRegistry) .build()) - .graphRetriever(TestOperationContexts.emptyGraphRetriever) - 
.searchRetriever(TestOperationContexts.emptySearchRetriever) + .cachingAspectRetriever( + TestOperationContexts.emptyActiveUsersAspectRetriever( + () -> _testEntityRegistry)) + .graphRetriever(GraphRetriever.EMPTY) + .searchRetriever(SearchRetriever.EMPTY) .build(), null, opContext -> - ((EntityServiceAspectRetriever) opContext.getAspectRetrieverOpt().get()) + ((EntityServiceAspectRetriever) opContext.getAspectRetriever()) .setSystemOperationContext(opContext), null); } @@ -152,25 +156,25 @@ public void testIngestListLatestAspects() throws AssertionError { .recordTemplate(writeAspect1) .systemMetadata(metadata1) .auditStamp(TEST_AUDIT_STAMP) - .build(TestOperationContexts.emptyAspectRetriever(null)), + .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null)), ChangeItemImpl.builder() .urn(entityUrn2) .aspectName(aspectName) .recordTemplate(writeAspect2) .systemMetadata(metadata1) .auditStamp(TEST_AUDIT_STAMP) - .build(TestOperationContexts.emptyAspectRetriever(null)), + .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null)), ChangeItemImpl.builder() .urn(entityUrn3) .aspectName(aspectName) .recordTemplate(writeAspect3) .systemMetadata(metadata1) .auditStamp(TEST_AUDIT_STAMP) - .build(TestOperationContexts.emptyAspectRetriever(null))); + .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null))); _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, @@ -230,25 +234,25 @@ public void testIngestListUrns() throws AssertionError { .recordTemplate(writeAspect1) .systemMetadata(metadata1) .auditStamp(TEST_AUDIT_STAMP) - .build(TestOperationContexts.emptyAspectRetriever(null)), + .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null)), ChangeItemImpl.builder() .urn(entityUrn2) .aspectName(aspectName) .recordTemplate(writeAspect2) .systemMetadata(metadata1) .auditStamp(TEST_AUDIT_STAMP) - .build(TestOperationContexts.emptyAspectRetriever(null)), + .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null)), ChangeItemImpl.builder() .urn(entityUrn3) .aspectName(aspectName) .recordTemplate(writeAspect3) .systemMetadata(metadata1) .auditStamp(TEST_AUDIT_STAMP) - .build(TestOperationContexts.emptyAspectRetriever(null))); + .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null))); _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, @@ -310,11 +314,11 @@ public void testSystemMetadataDuplicateKey() throws Exception { .recordTemplate(new Status().setRemoved(true)) .systemMetadata(systemMetadata) .auditStamp(TEST_AUDIT_STAMP) - .build(TestOperationContexts.emptyAspectRetriever(null)); + .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null)); _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(List.of(item)) .build(), false, @@ -356,7 +360,7 @@ public void testSystemMetadataDuplicateKey() throws Exception { _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items( List.of( ChangeItemImpl.builder() @@ -365,7 +369,7 @@ public void 
testSystemMetadataDuplicateKey() throws Exception { .recordTemplate(new Status().setRemoved(false)) .systemMetadata(systemMetadata) .auditStamp(TEST_AUDIT_STAMP) - .build(TestOperationContexts.emptyAspectRetriever(null)))) + .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null)))) .build(), false, true); @@ -600,7 +604,7 @@ public void run() { auditStamp.setTime(System.currentTimeMillis()); AspectsBatchImpl batch = AspectsBatchImpl.builder() - .mcps(mcps, auditStamp, operationContext.getRetrieverContext().get()) + .mcps(mcps, auditStamp, operationContext.getRetrieverContext()) .build(); entityService.ingestProposal(operationContext, batch, false); } diff --git a/metadata-io/src/test/java/com/linkedin/metadata/entity/EntityServiceTest.java b/metadata-io/src/test/java/com/linkedin/metadata/entity/EntityServiceTest.java index 2d59632e6f3c6d..c00632e5cf5424 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/entity/EntityServiceTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/entity/EntityServiceTest.java @@ -945,32 +945,32 @@ public void testRollbackAspect() throws AssertionError { .recordTemplate(writeAspect1) .systemMetadata(metadata1) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get()), + .build(opContext.getAspectRetriever()), ChangeItemImpl.builder() .urn(entityUrn2) .aspectName(aspectName) .recordTemplate(writeAspect2) .auditStamp(TEST_AUDIT_STAMP) .systemMetadata(metadata1) - .build(opContext.getAspectRetrieverOpt().get()), + .build(opContext.getAspectRetriever()), ChangeItemImpl.builder() .urn(entityUrn3) .aspectName(aspectName) .recordTemplate(writeAspect3) .auditStamp(TEST_AUDIT_STAMP) .systemMetadata(metadata1) - .build(opContext.getAspectRetrieverOpt().get()), + .build(opContext.getAspectRetriever()), ChangeItemImpl.builder() .urn(entityUrn1) .aspectName(aspectName) .recordTemplate(writeAspect1Overwrite) .systemMetadata(metadata2) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get())); + .build(opContext.getAspectRetriever())); _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, @@ -1037,25 +1037,25 @@ public void testRollbackKey() throws AssertionError { .recordTemplate(writeAspect1) .systemMetadata(metadata1) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get()), + .build(opContext.getAspectRetriever()), ChangeItemImpl.builder() .urn(entityUrn1) .aspectName(keyAspectName) .recordTemplate(writeKey1) .systemMetadata(metadata1) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get()), + .build(opContext.getAspectRetriever()), ChangeItemImpl.builder() .urn(entityUrn1) .aspectName(aspectName) .recordTemplate(writeAspect1Overwrite) .systemMetadata(metadata2) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get())); + .build(opContext.getAspectRetriever())); _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, @@ -1130,39 +1130,39 @@ public void testRollbackUrn() throws AssertionError { .recordTemplate(writeAspect1) .systemMetadata(metadata1) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get()), + .build(opContext.getAspectRetriever()), ChangeItemImpl.builder() .urn(entityUrn1) 
.aspectName(keyAspectName) .recordTemplate(writeKey1) .auditStamp(TEST_AUDIT_STAMP) .systemMetadata(metadata1) - .build(opContext.getAspectRetrieverOpt().get()), + .build(opContext.getAspectRetriever()), ChangeItemImpl.builder() .urn(entityUrn2) .aspectName(aspectName) .recordTemplate(writeAspect2) .auditStamp(TEST_AUDIT_STAMP) .systemMetadata(metadata1) - .build(opContext.getAspectRetrieverOpt().get()), + .build(opContext.getAspectRetriever()), ChangeItemImpl.builder() .urn(entityUrn3) .aspectName(aspectName) .recordTemplate(writeAspect3) .systemMetadata(metadata1) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get()), + .build(opContext.getAspectRetriever()), ChangeItemImpl.builder() .urn(entityUrn1) .aspectName(aspectName) .recordTemplate(writeAspect1Overwrite) .systemMetadata(metadata2) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get())); + .build(opContext.getAspectRetriever())); _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, @@ -1208,11 +1208,11 @@ public void testIngestGetLatestAspect() throws AssertionError { .recordTemplate(writeAspect1) .auditStamp(TEST_AUDIT_STAMP) .systemMetadata(metadata1) - .build(opContext.getAspectRetrieverOpt().get())); + .build(opContext.getAspectRetriever())); _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, @@ -1264,11 +1264,11 @@ public void testIngestGetLatestAspect() throws AssertionError { .recordTemplate(writeAspect2) .auditStamp(TEST_AUDIT_STAMP) .systemMetadata(metadata2) - .build(opContext.getAspectRetrieverOpt().get())); + .build(opContext.getAspectRetriever())); _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, @@ -1320,11 +1320,11 @@ public void testIngestGetLatestEnvelopedAspect() throws Exception { .recordTemplate(writeAspect1) .auditStamp(TEST_AUDIT_STAMP) .systemMetadata(metadata1) - .build(opContext.getAspectRetrieverOpt().get())); + .build(opContext.getAspectRetriever())); _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, @@ -1347,11 +1347,11 @@ public void testIngestGetLatestEnvelopedAspect() throws Exception { .recordTemplate(writeAspect2) .systemMetadata(metadata2) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get())); + .build(opContext.getAspectRetriever())); _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, @@ -1416,11 +1416,11 @@ public void testIngestSameAspect() throws AssertionError { .recordTemplate(writeAspect1) .systemMetadata(metadata1) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get())); + .build(opContext.getAspectRetriever())); _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + 
.retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, @@ -1472,11 +1472,11 @@ public void testIngestSameAspect() throws AssertionError { .recordTemplate(writeAspect2) .systemMetadata(metadata2) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get())); + .build(opContext.getAspectRetriever())); _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, @@ -1534,46 +1534,46 @@ public void testRetention() throws AssertionError { .recordTemplate(writeAspect1) .systemMetadata(AspectGenerationUtils.createSystemMetadata()) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get()), + .build(opContext.getAspectRetriever()), ChangeItemImpl.builder() .urn(entityUrn) .aspectName(aspectName) .recordTemplate(writeAspect1a) .systemMetadata(AspectGenerationUtils.createSystemMetadata()) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get()), + .build(opContext.getAspectRetriever()), ChangeItemImpl.builder() .urn(entityUrn) .aspectName(aspectName) .recordTemplate(writeAspect1b) .systemMetadata(AspectGenerationUtils.createSystemMetadata()) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get()), + .build(opContext.getAspectRetriever()), ChangeItemImpl.builder() .urn(entityUrn) .aspectName(aspectName2) .recordTemplate(writeAspect2) .systemMetadata(AspectGenerationUtils.createSystemMetadata()) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get()), + .build(opContext.getAspectRetriever()), ChangeItemImpl.builder() .urn(entityUrn) .aspectName(aspectName2) .recordTemplate(writeAspect2a) .systemMetadata(AspectGenerationUtils.createSystemMetadata()) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get()), + .build(opContext.getAspectRetriever()), ChangeItemImpl.builder() .urn(entityUrn) .aspectName(aspectName2) .recordTemplate(writeAspect2b) .systemMetadata(AspectGenerationUtils.createSystemMetadata()) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get())); + .build(opContext.getAspectRetriever())); _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, @@ -1610,18 +1610,18 @@ public void testRetention() throws AssertionError { .recordTemplate(writeAspect1c) .systemMetadata(AspectGenerationUtils.createSystemMetadata()) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get()), + .build(opContext.getAspectRetriever()), ChangeItemImpl.builder() .urn(entityUrn) .aspectName(aspectName2) .recordTemplate(writeAspect2c) .systemMetadata(AspectGenerationUtils.createSystemMetadata()) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get())); + .build(opContext.getAspectRetriever())); _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, @@ -1982,8 +1982,7 @@ public void testStructuredPropertyIngestProposal() throws Exception { stream .map( entityAspect -> - EntityUtils.toSystemAspect( - opContext.getRetrieverContext().get(), entityAspect) + EntityUtils.toSystemAspect(opContext.getRetrieverContext(), entityAspect) .get() 
.getAspect(StructuredPropertyDefinition.class)) .collect(Collectors.toSet()); @@ -1995,7 +1994,10 @@ public void testStructuredPropertyIngestProposal() throws Exception { SystemEntityClient mockSystemEntityClient = Mockito.mock(SystemEntityClient.class); Mockito.when( mockSystemEntityClient.getLatestAspectObject( - any(OperationContext.class), eq(firstPropertyUrn), eq("propertyDefinition"))) + any(OperationContext.class), + eq(firstPropertyUrn), + eq("propertyDefinition"), + anyBoolean())) .thenReturn(new com.linkedin.entity.Aspect(structuredPropertyDefinition.data())); // Add a value for that property @@ -2062,8 +2064,7 @@ public void testStructuredPropertyIngestProposal() throws Exception { stream .map( entityAspect -> - EntityUtils.toSystemAspect( - opContext.getRetrieverContext().get(), entityAspect) + EntityUtils.toSystemAspect(opContext.getRetrieverContext(), entityAspect) .get() .getAspect(StructuredPropertyDefinition.class)) .collect(Collectors.toSet()); @@ -2074,7 +2075,10 @@ public void testStructuredPropertyIngestProposal() throws Exception { Mockito.when( mockSystemEntityClient.getLatestAspectObject( - any(OperationContext.class), eq(secondPropertyUrn), eq("propertyDefinition"))) + any(OperationContext.class), + eq(secondPropertyUrn), + eq("propertyDefinition"), + anyBoolean())) .thenReturn(new com.linkedin.entity.Aspect(secondDefinition.data())); // Get existing value for first structured property @@ -2209,7 +2213,7 @@ public void testBatchDuplicate() throws Exception { .recordTemplate(new Status().setRemoved(true)) .systemMetadata(systemMetadata.copy()) .auditStamp(TEST_AUDIT_STAMP) - .build(TestOperationContexts.emptyAspectRetriever(null)); + .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null)); ChangeItemImpl item2 = ChangeItemImpl.builder() .urn(entityUrn) @@ -2217,11 +2221,11 @@ public void testBatchDuplicate() throws Exception { .recordTemplate(new Status().setRemoved(false)) .systemMetadata(systemMetadata.copy()) .auditStamp(TEST_AUDIT_STAMP) - .build(TestOperationContexts.emptyAspectRetriever(null)); + .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null)); _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(List.of(item1, item2)) .build(), false, @@ -2269,7 +2273,7 @@ public void testBatchPatchWithTrailingNoOp() throws Exception { .setTags(new TagAssociationArray(new TagAssociation().setTag(tag1)))) .systemMetadata(systemMetadata.copy()) .auditStamp(TEST_AUDIT_STAMP) - .build(TestOperationContexts.emptyAspectRetriever(null)); + .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null)); PatchItemImpl patchAdd2 = PatchItemImpl.builder() @@ -2311,7 +2315,7 @@ public void testBatchPatchWithTrailingNoOp() throws Exception { _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(List.of(initialAspectTag1)) .build(), false, @@ -2320,7 +2324,7 @@ public void testBatchPatchWithTrailingNoOp() throws Exception { _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(List.of(patchAdd2, patchRemoveNonExistent)) .build(), false, @@ -2368,7 +2372,7 @@ public void testBatchPatchAdd() throws Exception { .setTags(new TagAssociationArray(new 
TagAssociation().setTag(tag1)))) .systemMetadata(systemMetadata.copy()) .auditStamp(TEST_AUDIT_STAMP) - .build(TestOperationContexts.emptyAspectRetriever(null)); + .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null)); PatchItemImpl patchAdd3 = PatchItemImpl.builder() @@ -2428,7 +2432,7 @@ public void testBatchPatchAdd() throws Exception { _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(List.of(initialAspectTag1)) .build(), false, @@ -2437,7 +2441,7 @@ public void testBatchPatchAdd() throws Exception { _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(List.of(patchAdd3, patchAdd2, patchAdd1)) .build(), false, @@ -2491,7 +2495,7 @@ public void testBatchPatchAddDuplicate() throws Exception { .recordTemplate(new GlobalTags().setTags(new TagAssociationArray(initialTags))) .systemMetadata(systemMetadata.copy()) .auditStamp(TEST_AUDIT_STAMP) - .build(TestOperationContexts.emptyAspectRetriever(null)); + .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null)); PatchItemImpl patchAdd2 = PatchItemImpl.builder() @@ -2516,7 +2520,7 @@ public void testBatchPatchAddDuplicate() throws Exception { _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(List.of(initialAspectTag1)) .build(), false, @@ -2525,7 +2529,7 @@ public void testBatchPatchAddDuplicate() throws Exception { _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(List.of(patchAdd2, patchAdd2)) // duplicate .build(), false, @@ -2581,7 +2585,7 @@ public void testPatchRemoveNonExistent() throws Exception { _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(List.of(patchRemove)) .build(), false, @@ -2638,7 +2642,7 @@ public void testPatchAddNonExistent() throws Exception { _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(List.of(patchAdd)) .build(), false, diff --git a/metadata-io/src/test/java/com/linkedin/metadata/entity/cassandra/CassandraEntityServiceTest.java b/metadata-io/src/test/java/com/linkedin/metadata/entity/cassandra/CassandraEntityServiceTest.java index 550f55e6bfd0b9..b4fbfecc9d60d3 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/entity/cassandra/CassandraEntityServiceTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/entity/cassandra/CassandraEntityServiceTest.java @@ -10,11 +10,13 @@ import com.linkedin.metadata.AspectGenerationUtils; import com.linkedin.metadata.AspectIngestionUtils; import com.linkedin.metadata.CassandraTestUtils; +import com.linkedin.metadata.aspect.GraphRetriever; import com.linkedin.metadata.config.PreProcessHooks; import com.linkedin.metadata.entity.EntityServiceAspectRetriever; import com.linkedin.metadata.entity.EntityServiceImpl; import com.linkedin.metadata.entity.EntityServiceTest; import 
com.linkedin.metadata.entity.ListResult; +import com.linkedin.metadata.entity.SearchRetriever; import com.linkedin.metadata.event.EventProducer; import com.linkedin.metadata.key.CorpUserKey; import com.linkedin.metadata.models.registry.EntityRegistryException; @@ -93,12 +95,15 @@ private void configureComponents() { .entityService(_entityServiceImpl) .entityRegistry(_testEntityRegistry) .build()) - .graphRetriever(TestOperationContexts.emptyGraphRetriever) - .searchRetriever(TestOperationContexts.emptySearchRetriever) + .cachingAspectRetriever( + TestOperationContexts.emptyActiveUsersAspectRetriever( + () -> _testEntityRegistry)) + .graphRetriever(GraphRetriever.EMPTY) + .searchRetriever(SearchRetriever.EMPTY) .build(), null, opContext -> - ((EntityServiceAspectRetriever) opContext.getAspectRetrieverOpt().get()) + ((EntityServiceAspectRetriever) opContext.getAspectRetriever()) .setSystemOperationContext(opContext), null); } diff --git a/metadata-io/src/test/java/com/linkedin/metadata/entity/ebean/batch/ChangeItemImplTest.java b/metadata-io/src/test/java/com/linkedin/metadata/entity/ebean/batch/ChangeItemImplTest.java index 3f6b301e72aa5a..0a867ae3c8f2e0 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/entity/ebean/batch/ChangeItemImplTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/entity/ebean/batch/ChangeItemImplTest.java @@ -26,7 +26,7 @@ public void testBatchDuplicate() throws Exception { .recordTemplate(new Status().setRemoved(true)) .systemMetadata(systemMetadata.copy()) .auditStamp(TEST_AUDIT_STAMP) - .build(TestOperationContexts.emptyAspectRetriever(null)); + .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null)); ChangeItemImpl item2 = ChangeItemImpl.builder() .urn(entityUrn) @@ -34,7 +34,7 @@ public void testBatchDuplicate() throws Exception { .recordTemplate(new Status().setRemoved(false)) .systemMetadata(systemMetadata.copy()) .auditStamp(TEST_AUDIT_STAMP) - .build(TestOperationContexts.emptyAspectRetriever(null)); + .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null)); assertFalse(item1.isDatabaseDuplicateOf(item2)); } diff --git a/metadata-io/src/test/java/com/linkedin/metadata/recommendation/RecommendationsServiceTest.java b/metadata-io/src/test/java/com/linkedin/metadata/recommendation/RecommendationsServiceTest.java index ca42f0327c86db..8f68f119cb0b7d 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/recommendation/RecommendationsServiceTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/recommendation/RecommendationsServiceTest.java @@ -11,6 +11,7 @@ import com.linkedin.metadata.recommendation.ranker.SimpleRecommendationRanker; import io.datahubproject.test.metadata.context.TestOperationContexts; import java.net.URISyntaxException; +import java.nio.file.AccessDeniedException; import java.util.List; import java.util.stream.Collectors; import org.testng.annotations.Test; @@ -74,7 +75,7 @@ private List getContentFromUrns(List urns) { } @Test - public void testService() throws URISyntaxException { + public void testService() throws URISyntaxException, AccessDeniedException { // Test non-eligible and empty RecommendationsService service = new RecommendationsService(ImmutableList.of(nonEligibleSource, emptySource), ranker); diff --git a/metadata-io/src/test/java/com/linkedin/metadata/schemafields/sideeffects/SchemaFieldSideEffectTest.java b/metadata-io/src/test/java/com/linkedin/metadata/schemafields/sideeffects/SchemaFieldSideEffectTest.java index 1661f5f02ee593..fa895cb4540117 100644 --- 
a/metadata-io/src/test/java/com/linkedin/metadata/schemafields/sideeffects/SchemaFieldSideEffectTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/schemafields/sideeffects/SchemaFieldSideEffectTest.java @@ -21,7 +21,8 @@ import com.linkedin.data.ByteString; import com.linkedin.entity.Aspect; import com.linkedin.events.metadata.ChangeType; -import com.linkedin.metadata.aspect.AspectRetriever; +import com.linkedin.metadata.aspect.CachingAspectRetriever; +import com.linkedin.metadata.aspect.GraphRetriever; import com.linkedin.metadata.aspect.batch.MCLItem; import com.linkedin.metadata.aspect.batch.MCPItem; import com.linkedin.metadata.aspect.plugins.config.AspectPluginConfig; @@ -46,7 +47,6 @@ import com.linkedin.test.metadata.aspect.TestEntityRegistry; import com.linkedin.test.metadata.aspect.batch.TestMCP; import io.datahubproject.metadata.context.RetrieverContext; -import io.datahubproject.test.metadata.context.TestOperationContexts; import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.List; @@ -87,18 +87,18 @@ public class SchemaFieldSideEffectTest { .build())) .build(); - private AspectRetriever mockAspectRetriever; + private CachingAspectRetriever mockAspectRetriever; private RetrieverContext retrieverContext; @BeforeMethod public void setup() { - mockAspectRetriever = mock(AspectRetriever.class); + mockAspectRetriever = mock(CachingAspectRetriever.class); when(mockAspectRetriever.getEntityRegistry()).thenReturn(TEST_REGISTRY); retrieverContext = RetrieverContext.builder() .searchRetriever(mock(SearchRetriever.class)) - .aspectRetriever(mockAspectRetriever) - .graphRetriever(TestOperationContexts.emptyGraphRetriever) + .cachingAspectRetriever(mockAspectRetriever) + .graphRetriever(GraphRetriever.EMPTY) .build(); } diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/query/filter/ContainerExpansionRewriterTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/query/filter/ContainerExpansionRewriterTest.java index fd768424e13c19..1825b65a18ab19 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/query/filter/ContainerExpansionRewriterTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/query/filter/ContainerExpansionRewriterTest.java @@ -20,6 +20,7 @@ import com.linkedin.metadata.aspect.models.graph.RelatedEntities; import com.linkedin.metadata.aspect.models.graph.RelatedEntitiesScrollResult; import com.linkedin.metadata.config.search.QueryFilterRewriterConfiguration; +import com.linkedin.metadata.entity.SearchRetriever; import com.linkedin.metadata.models.registry.EntityRegistry; import com.linkedin.metadata.query.SearchFlags; import com.linkedin.metadata.query.filter.Condition; @@ -71,8 +72,10 @@ public void init() { () -> io.datahubproject.metadata.context.RetrieverContext.builder() .aspectRetriever(mockAspectRetriever) + .cachingAspectRetriever( + TestOperationContexts.emptyActiveUsersAspectRetriever(() -> entityRegistry)) .graphRetriever(mockGraphRetriever) - .searchRetriever(TestOperationContexts.emptySearchRetriever) + .searchRetriever(SearchRetriever.EMPTY) .build(), null, null, diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/query/filter/DomainExpansionRewriterTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/query/filter/DomainExpansionRewriterTest.java index 8741e24b1bca50..de375271ed6602 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/query/filter/DomainExpansionRewriterTest.java +++ 
b/metadata-io/src/test/java/com/linkedin/metadata/search/query/filter/DomainExpansionRewriterTest.java @@ -13,13 +13,14 @@ import static org.mockito.Mockito.when; import static org.testng.Assert.assertEquals; -import com.linkedin.metadata.aspect.AspectRetriever; +import com.linkedin.metadata.aspect.CachingAspectRetriever; import com.linkedin.metadata.aspect.GraphRetriever; import com.linkedin.metadata.aspect.RetrieverContext; import com.linkedin.metadata.aspect.models.graph.Edge; import com.linkedin.metadata.aspect.models.graph.RelatedEntities; import com.linkedin.metadata.aspect.models.graph.RelatedEntitiesScrollResult; import com.linkedin.metadata.config.search.QueryFilterRewriterConfiguration; +import com.linkedin.metadata.entity.SearchRetriever; import com.linkedin.metadata.models.registry.EntityRegistry; import com.linkedin.metadata.query.SearchFlags; import com.linkedin.metadata.query.filter.Condition; @@ -54,7 +55,7 @@ public class DomainExpansionRewriterTest @BeforeMethod public void init() { EntityRegistry entityRegistry = new TestEntityRegistry(); - AspectRetriever mockAspectRetriever = mock(AspectRetriever.class); + CachingAspectRetriever mockAspectRetriever = mock(CachingAspectRetriever.class); when(mockAspectRetriever.getEntityRegistry()).thenReturn(entityRegistry); mockGraphRetriever = spy(GraphRetriever.class); @@ -71,8 +72,10 @@ public void init() { () -> io.datahubproject.metadata.context.RetrieverContext.builder() .aspectRetriever(mockAspectRetriever) + .cachingAspectRetriever( + TestOperationContexts.emptyActiveUsersAspectRetriever(() -> entityRegistry)) .graphRetriever(mockGraphRetriever) - .searchRetriever(TestOperationContexts.emptySearchRetriever) + .searchRetriever(SearchRetriever.EMPTY) .build(), null, null, diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/query/request/AggregationQueryBuilderTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/query/request/AggregationQueryBuilderTest.java index c68997e25bcff7..d6f5f9c3eedbe7 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/query/request/AggregationQueryBuilderTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/query/request/AggregationQueryBuilderTest.java @@ -18,6 +18,7 @@ import com.linkedin.data.template.StringArray; import com.linkedin.entity.Aspect; import com.linkedin.metadata.aspect.AspectRetriever; +import com.linkedin.metadata.aspect.CachingAspectRetriever; import com.linkedin.metadata.config.search.SearchConfiguration; import com.linkedin.metadata.models.EntitySpec; import com.linkedin.metadata.models.annotation.SearchableAnnotation; @@ -49,8 +50,8 @@ public class AggregationQueryBuilderTest { - private static AspectRetriever aspectRetriever; - private static AspectRetriever aspectRetrieverV1; + private static CachingAspectRetriever aspectRetriever; + private static CachingAspectRetriever aspectRetrieverV1; private static String DEFAULT_FILTER = "_index"; @BeforeClass @@ -61,7 +62,7 @@ public void setup() throws RemoteInvocationException, URISyntaxException { Urn.createFromString("urn:li:structuredProperty:under.scores.and.dots_make_a_mess"); // legacy - aspectRetriever = mock(AspectRetriever.class); + aspectRetriever = mock(CachingAspectRetriever.class); when(aspectRetriever.getEntityRegistry()) .thenReturn(TestOperationContexts.defaultEntityRegistry()); @@ -106,7 +107,7 @@ public void setup() throws RemoteInvocationException, URISyntaxException { new Aspect(structPropUnderscoresAndDotsDefinition.data())))); // V1 - 
aspectRetrieverV1 = mock(AspectRetriever.class); + aspectRetrieverV1 = mock(CachingAspectRetriever.class); when(aspectRetrieverV1.getEntityRegistry()) .thenReturn(TestOperationContexts.defaultEntityRegistry()); diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/query/request/SearchRequestHandlerTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/query/request/SearchRequestHandlerTest.java index 393ca3ca5d4a64..e51511699e345a 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/query/request/SearchRequestHandlerTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/query/request/SearchRequestHandlerTest.java @@ -662,6 +662,7 @@ public void testInvalidStructuredProperty() { TestOperationContexts.systemContextNoSearchAuthorization( RetrieverContext.builder() .aspectRetriever(aspectRetriever) + .cachingAspectRetriever(TestOperationContexts.emptyActiveUsersAspectRetriever(null)) .graphRetriever(mock(GraphRetriever.class)) .searchRetriever(mock(SearchRetriever.class)) .build()); diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/transformer/SearchDocumentTransformerTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/transformer/SearchDocumentTransformerTest.java index 2c5bcd1294fa15..65b73b7425b743 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/transformer/SearchDocumentTransformerTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/transformer/SearchDocumentTransformerTest.java @@ -247,6 +247,9 @@ public void testSetSearchableRefValue() throws URISyntaxException, RemoteInvocat TestOperationContexts.systemContextNoSearchAuthorization( RetrieverContext.builder() .aspectRetriever(aspectRetriever) + .cachingAspectRetriever( + TestOperationContexts.emptyActiveUsersAspectRetriever( + () -> TEST_ENTITY_REGISTRY)) .graphRetriever(mock(GraphRetriever.class)) .searchRetriever(mock(SearchRetriever.class)) .build()); @@ -301,6 +304,9 @@ public void testSetSearchableRefValue_RuntimeException() TestOperationContexts.systemContextNoSearchAuthorization( RetrieverContext.builder() .aspectRetriever(aspectRetriever) + .cachingAspectRetriever( + TestOperationContexts.emptyActiveUsersAspectRetriever( + () -> TEST_ENTITY_REGISTRY)) .graphRetriever(mock(GraphRetriever.class)) .searchRetriever(mock(SearchRetriever.class)) .build()); @@ -337,6 +343,9 @@ public void testSetSearchableRefValue_RuntimeException_URNExist() TestOperationContexts.systemContextNoSearchAuthorization( RetrieverContext.builder() .aspectRetriever(aspectRetriever) + .cachingAspectRetriever( + TestOperationContexts.emptyActiveUsersAspectRetriever( + () -> TEST_ENTITY_REGISTRY)) .graphRetriever(mock(GraphRetriever.class)) .searchRetriever(mock(SearchRetriever.class)) .build()); @@ -369,6 +378,9 @@ void testSetSearchableRefValue_WithInvalidURN() TestOperationContexts.systemContextNoSearchAuthorization( RetrieverContext.builder() .aspectRetriever(aspectRetriever) + .cachingAspectRetriever( + TestOperationContexts.emptyActiveUsersAspectRetriever( + () -> TEST_ENTITY_REGISTRY)) .graphRetriever(mock(GraphRetriever.class)) .searchRetriever(mock(SearchRetriever.class)) .build()); diff --git a/metadata-io/src/test/java/com/linkedin/metadata/structuredproperties/hooks/PropertyDefinitionDeleteSideEffectTest.java b/metadata-io/src/test/java/com/linkedin/metadata/structuredproperties/hooks/PropertyDefinitionDeleteSideEffectTest.java index b1b716c5604816..9a0a82c7f9f49d 100644 --- 
a/metadata-io/src/test/java/com/linkedin/metadata/structuredproperties/hooks/PropertyDefinitionDeleteSideEffectTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/structuredproperties/hooks/PropertyDefinitionDeleteSideEffectTest.java @@ -18,7 +18,8 @@ import com.linkedin.common.urn.UrnUtils; import com.linkedin.entity.Aspect; import com.linkedin.events.metadata.ChangeType; -import com.linkedin.metadata.aspect.AspectRetriever; +import com.linkedin.metadata.aspect.CachingAspectRetriever; +import com.linkedin.metadata.aspect.GraphRetriever; import com.linkedin.metadata.aspect.batch.MCPItem; import com.linkedin.metadata.aspect.batch.PatchMCP; import com.linkedin.metadata.aspect.plugins.config.AspectPluginConfig; @@ -36,7 +37,6 @@ import com.linkedin.test.metadata.aspect.TestEntityRegistry; import com.linkedin.test.metadata.aspect.batch.TestMCL; import io.datahubproject.metadata.context.RetrieverContext; -import io.datahubproject.test.metadata.context.TestOperationContexts; import jakarta.json.Json; import jakarta.json.JsonPatch; import java.util.List; @@ -76,13 +76,13 @@ public class PropertyDefinitionDeleteSideEffectTest { private static final Urn TEST_DATASET_URN = UrnUtils.getUrn( "urn:li:dataset:(urn:li:dataPlatform:postgres,calm-pagoda-323403.jaffle_shop.customers,PROD)"); - private AspectRetriever mockAspectRetriever; + private CachingAspectRetriever mockAspectRetriever; private SearchRetriever mockSearchRetriever; private RetrieverContext retrieverContext; @BeforeMethod public void setup() { - mockAspectRetriever = mock(AspectRetriever.class); + mockAspectRetriever = mock(CachingAspectRetriever.class); when(mockAspectRetriever.getEntityRegistry()).thenReturn(TEST_REGISTRY); when(mockAspectRetriever.getLatestAspectObject( eq(TEST_PROPERTY_URN), eq(STRUCTURED_PROPERTY_DEFINITION_ASPECT_NAME))) @@ -101,8 +101,8 @@ public void setup() { retrieverContext = RetrieverContext.builder() .searchRetriever(mockSearchRetriever) - .aspectRetriever(mockAspectRetriever) - .graphRetriever(TestOperationContexts.emptyGraphRetriever) + .cachingAspectRetriever(mockAspectRetriever) + .graphRetriever(GraphRetriever.EMPTY) .build(); } diff --git a/metadata-io/src/test/java/com/linkedin/metadata/structuredproperties/validators/ShowPropertyAsBadgeValidatorTest.java b/metadata-io/src/test/java/com/linkedin/metadata/structuredproperties/validators/ShowPropertyAsBadgeValidatorTest.java index 2503faa00f6e71..6e8886f495c95a 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/structuredproperties/validators/ShowPropertyAsBadgeValidatorTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/structuredproperties/validators/ShowPropertyAsBadgeValidatorTest.java @@ -58,7 +58,7 @@ public void setup() { mockGraphRetriever = Mockito.mock(GraphRetriever.class); retrieverContext = io.datahubproject.metadata.context.RetrieverContext.builder() - .aspectRetriever(mockAspectRetriever) + .cachingAspectRetriever(mockAspectRetriever) .searchRetriever(mockSearchRetriever) .graphRetriever(mockGraphRetriever) .build(); diff --git a/metadata-io/src/test/java/io/datahubproject/test/DataGenerator.java b/metadata-io/src/test/java/io/datahubproject/test/DataGenerator.java index 3acd2bf3413578..02cd28eb202e94 100644 --- a/metadata-io/src/test/java/io/datahubproject/test/DataGenerator.java +++ b/metadata-io/src/test/java/io/datahubproject/test/DataGenerator.java @@ -171,10 +171,7 @@ public Stream> generateMCPs( DefaultAspectsUtil.getAdditionalChanges( opContext, AspectsBatchImpl.builder() - .mcps( - List.of(mcp), - 
auditStamp, - opContext.getRetrieverContext().get()) + .mcps(List.of(mcp), auditStamp, opContext.getRetrieverContext()) .build() .getMCPItems(), entityService, diff --git a/metadata-jobs/mae-consumer/src/test/java/com/linkedin/metadata/kafka/hook/spring/MCLSpringCommonTestConfiguration.java b/metadata-jobs/mae-consumer/src/test/java/com/linkedin/metadata/kafka/hook/spring/MCLSpringCommonTestConfiguration.java index cf9d73dfa729be..f16c9dbd82e749 100644 --- a/metadata-jobs/mae-consumer/src/test/java/com/linkedin/metadata/kafka/hook/spring/MCLSpringCommonTestConfiguration.java +++ b/metadata-jobs/mae-consumer/src/test/java/com/linkedin/metadata/kafka/hook/spring/MCLSpringCommonTestConfiguration.java @@ -20,7 +20,6 @@ import com.linkedin.metadata.utils.elasticsearch.IndexConvention; import io.datahubproject.metadata.context.OperationContext; import io.datahubproject.metadata.context.OperationContextConfig; -import io.datahubproject.metadata.context.RetrieverContext; import io.datahubproject.metadata.context.ServicesRegistryContext; import io.datahubproject.metadata.context.ValidationContext; import io.datahubproject.test.metadata.context.TestOperationContexts; @@ -95,7 +94,7 @@ public OperationContext operationContext( entityRegistry, mock(ServicesRegistryContext.class), indexConvention, - mock(RetrieverContext.class), + TestOperationContexts.emptyActiveUsersRetrieverContext(() -> entityRegistry), mock(ValidationContext.class)); } diff --git a/metadata-jobs/pe-consumer/src/test/java/com/datahub/event/hook/BusinessAttributeUpdateHookTest.java b/metadata-jobs/pe-consumer/src/test/java/com/datahub/event/hook/BusinessAttributeUpdateHookTest.java index 47740b02d6166c..65ee6b8591f489 100644 --- a/metadata-jobs/pe-consumer/src/test/java/com/datahub/event/hook/BusinessAttributeUpdateHookTest.java +++ b/metadata-jobs/pe-consumer/src/test/java/com/datahub/event/hook/BusinessAttributeUpdateHookTest.java @@ -93,8 +93,6 @@ public void testMCLOnBusinessAttributeUpdate() throws Exception { new RelatedEntity(BUSINESS_ATTRIBUTE_OF, SCHEMA_FIELD_URN.toString()))); when(opContext - .getRetrieverContext() - .get() .getAspectRetriever() .getLatestAspectObjects( eq(Set.of(SCHEMA_FIELD_URN)), eq(Set.of(BUSINESS_ATTRIBUTE_ASPECT)))) @@ -108,7 +106,7 @@ public void testMCLOnBusinessAttributeUpdate() throws Exception { // verify // page 1 - Mockito.verify(opContext.getRetrieverContext().get().getGraphRetriever(), Mockito.times(1)) + Mockito.verify(opContext.getRetrieverContext().getGraphRetriever(), Mockito.times(1)) .scrollRelatedEntities( isNull(), any(Filter.class), @@ -122,7 +120,7 @@ public void testMCLOnBusinessAttributeUpdate() throws Exception { isNull(), isNull()); // page 2 - Mockito.verify(opContext.getRetrieverContext().get().getGraphRetriever(), Mockito.times(1)) + Mockito.verify(opContext.getRetrieverContext().getGraphRetriever(), Mockito.times(1)) .scrollRelatedEntities( isNull(), any(Filter.class), @@ -136,7 +134,7 @@ public void testMCLOnBusinessAttributeUpdate() throws Exception { isNull(), isNull()); - Mockito.verifyNoMoreInteractions(opContext.getRetrieverContext().get().getGraphRetriever()); + Mockito.verifyNoMoreInteractions(opContext.getRetrieverContext().getGraphRetriever()); // 2 pages = 2 ingest proposals Mockito.verify(mockUpdateIndicesService, Mockito.times(2)) @@ -152,8 +150,8 @@ private void testMCLOnInvalidCategory() throws Exception { businessAttributeServiceHook.handleChangeEvent(opContext, platformEvent); // verify - 
Mockito.verifyNoInteractions(opContext.getRetrieverContext().get().getGraphRetriever()); - Mockito.verifyNoInteractions(opContext.getAspectRetrieverOpt().get()); + Mockito.verifyNoInteractions(opContext.getRetrieverContext().getGraphRetriever()); + Mockito.verifyNoInteractions(opContext.getAspectRetriever()); Mockito.verifyNoInteractions(mockUpdateIndicesService); } @@ -226,13 +224,15 @@ private OperationContext mockOperationContextWithGraph(List graph RetrieverContext mockRetrieverContext = mock(RetrieverContext.class); when(mockRetrieverContext.getAspectRetriever()).thenReturn(mock(AspectRetriever.class)); + when(mockRetrieverContext.getCachingAspectRetriever()) + .thenReturn(TestOperationContexts.emptyActiveUsersAspectRetriever(null)); when(mockRetrieverContext.getGraphRetriever()).thenReturn(graphRetriever); OperationContext opContext = TestOperationContexts.systemContextNoSearchAuthorization(mockRetrieverContext); // reset mock for test - reset(opContext.getAspectRetrieverOpt().get()); + reset(opContext.getAspectRetriever()); if (!graphEdges.isEmpty()) { diff --git a/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/ActorContext.java b/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/ActorContext.java index e65bf22991736d..c08b7fad4dee32 100644 --- a/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/ActorContext.java +++ b/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/ActorContext.java @@ -1,12 +1,23 @@ package io.datahubproject.metadata.context; +import static com.linkedin.metadata.Constants.CORP_USER_KEY_ASPECT_NAME; +import static com.linkedin.metadata.Constants.CORP_USER_STATUS_ASPECT_NAME; +import static com.linkedin.metadata.Constants.CORP_USER_STATUS_SUSPENDED; +import static com.linkedin.metadata.Constants.STATUS_ASPECT_NAME; +import static com.linkedin.metadata.Constants.SYSTEM_ACTOR; + import com.datahub.authentication.Authentication; +import com.linkedin.common.Status; import com.linkedin.common.urn.Urn; import com.linkedin.common.urn.UrnUtils; +import com.linkedin.entity.Aspect; +import com.linkedin.identity.CorpUserStatus; +import com.linkedin.metadata.aspect.AspectRetriever; import com.linkedin.metadata.authorization.PoliciesConfig; import com.linkedin.policy.DataHubPolicyInfo; import java.util.Collection; import java.util.Collections; +import java.util.Map; import java.util.Optional; import java.util.Set; import lombok.Builder; @@ -48,6 +59,43 @@ public Urn getActorUrn() { return UrnUtils.getUrn(authentication.getActor().toUrnStr()); } + /** + * Actor is considered active if the user is not hard-deleted, soft-deleted, and is not suspended + * + * @param aspectRetriever aspect retriever - ideally the SystemEntityClient backed one for caching + * @return active status + */ + public boolean isActive(AspectRetriever aspectRetriever) { + // system cannot be disabled + if (SYSTEM_ACTOR.equals(authentication.getActor().toUrnStr())) { + return true; + } + + Urn selfUrn = UrnUtils.getUrn(authentication.getActor().toUrnStr()); + Map> urnAspectMap = + aspectRetriever.getLatestAspectObjects( + Set.of(selfUrn), + Set.of(STATUS_ASPECT_NAME, CORP_USER_STATUS_ASPECT_NAME, CORP_USER_KEY_ASPECT_NAME)); + + Map aspectMap = urnAspectMap.getOrDefault(selfUrn, Map.of()); + + if (!aspectMap.containsKey(CORP_USER_KEY_ASPECT_NAME)) { + // user is hard deleted + return false; + } + + Status status = + Optional.ofNullable(aspectMap.get(STATUS_ASPECT_NAME)) + .map(a -> new Status(a.data())) 
+ .orElse(new Status().setRemoved(false)); + CorpUserStatus corpUserStatus = + Optional.ofNullable(aspectMap.get(CORP_USER_STATUS_ASPECT_NAME)) + .map(a -> new CorpUserStatus(a.data())) + .orElse(new CorpUserStatus().setStatus("")); + + return !status.isRemoved() && !CORP_USER_STATUS_SUSPENDED.equals(corpUserStatus.getStatus()); + } + /** * The current implementation creates a cache entry unique for the set of policies. * diff --git a/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/OperationContext.java b/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/OperationContext.java index 9a058c526647c2..9158129235b39e 100644 --- a/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/OperationContext.java +++ b/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/OperationContext.java @@ -16,6 +16,8 @@ import com.linkedin.metadata.query.SearchFlags; import com.linkedin.metadata.utils.AuditStampUtils; import com.linkedin.metadata.utils.elasticsearch.IndexConvention; +import io.datahubproject.metadata.exception.ActorAccessException; +import io.datahubproject.metadata.exception.OperationContextException; import java.util.Collection; import java.util.Objects; import java.util.Optional; @@ -63,6 +65,24 @@ public static OperationContext asSession( @Nonnull Authorizer authorizer, @Nonnull Authentication sessionAuthentication, boolean allowSystemAuthentication) { + return OperationContext.asSession( + systemOperationContext, + requestContext, + authorizer, + sessionAuthentication, + allowSystemAuthentication, + false); + } + + @Nonnull + public static OperationContext asSession( + OperationContext systemOperationContext, + @Nonnull RequestContext requestContext, + @Nonnull Authorizer authorizer, + @Nonnull Authentication sessionAuthentication, + boolean allowSystemAuthentication, + boolean skipCache) + throws ActorAccessException { return systemOperationContext.toBuilder() .operationContextConfig( // update allowed system authentication @@ -72,7 +92,7 @@ public static OperationContext asSession( .authorizationContext(AuthorizationContext.builder().authorizer(authorizer).build()) .requestContext(requestContext) .validationContext(systemOperationContext.getValidationContext()) - .build(sessionAuthentication); + .build(sessionAuthentication, skipCache); } /** @@ -85,10 +105,14 @@ public static OperationContext asSession( public static OperationContext withSearchFlags( OperationContext opContext, Function flagDefaults) { - return opContext.toBuilder() - // update search flags for the request's session - .searchContext(opContext.getSearchContext().withFlagDefaults(flagDefaults)) - .build(opContext.getSessionActorContext()); + try { + return opContext.toBuilder() + // update search flags for the request's session + .searchContext(opContext.getSearchContext().withFlagDefaults(flagDefaults)) + .build(opContext.getSessionActorContext(), false); + } catch (OperationContextException e) { + throw new RuntimeException(e); + } } /** @@ -101,10 +125,14 @@ public static OperationContext withSearchFlags( public static OperationContext withLineageFlags( OperationContext opContext, Function flagDefaults) { - return opContext.toBuilder() - // update lineage flags for the request's session - .searchContext(opContext.getSearchContext().withLineageFlagDefaults(flagDefaults)) - .build(opContext.getSessionActorContext()); + try { + return opContext.toBuilder() + // update lineage flags for the request's session + 
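The `isActive` check introduced above gates a session on three corpuser aspects. Below is a minimal, self-contained sketch of the same decision rules; the class and record names are hypothetical stand-ins, not the DataHub types.

```java
/**
 * Illustrative stand-in for the activity rules in ActorContext.isActive():
 * an actor is active when the corpUserKey aspect exists (not hard-deleted),
 * the status aspect is not marked removed (not soft-deleted), and the
 * corpUserStatus aspect is not SUSPENDED. All names here are invented.
 */
public class ActorActivitySketch {
  record UserAspects(boolean keyAspectPresent, boolean removed, String corpUserStatus) {}

  static boolean isActive(UserAspects aspects) {
    if (!aspects.keyAspectPresent()) {
      return false; // hard-deleted: the key aspect is gone
    }
    if (aspects.removed()) {
      return false; // soft-deleted: status aspect marked removed
    }
    return !"SUSPENDED".equals(aspects.corpUserStatus()); // suspension check
  }

  public static void main(String[] args) {
    System.out.println(isActive(new UserAspects(true, false, "ACTIVE")));    // true
    System.out.println(isActive(new UserAspects(true, false, "SUSPENDED"))); // false
    System.out.println(isActive(new UserAspects(false, false, "ACTIVE")));   // false
  }
}
```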
.searchContext(opContext.getSearchContext().withLineageFlagDefaults(flagDefaults)) + .build(opContext.getSessionActorContext(), false); + } catch (OperationContextException e) { + throw new RuntimeException(e); + } } /** @@ -155,18 +183,22 @@ public static OperationContext asSystem( ? SearchContext.EMPTY : SearchContext.builder().indexConvention(indexConvention).build(); - return OperationContext.builder() - .operationContextConfig(systemConfig) - .systemActorContext(systemActorContext) - .searchContext(systemSearchContext) - .entityRegistryContext(EntityRegistryContext.builder().build(entityRegistry)) - .servicesRegistryContext(servicesRegistryContext) - // Authorizer.EMPTY doesn't actually apply to system auth - .authorizationContext(AuthorizationContext.builder().authorizer(Authorizer.EMPTY).build()) - .retrieverContext(retrieverContext) - .objectMapperContext(objectMapperContext) - .validationContext(validationContext) - .build(systemAuthentication); + try { + return OperationContext.builder() + .operationContextConfig(systemConfig) + .systemActorContext(systemActorContext) + .searchContext(systemSearchContext) + .entityRegistryContext(EntityRegistryContext.builder().build(entityRegistry)) + .servicesRegistryContext(servicesRegistryContext) + // Authorizer.EMPTY doesn't actually apply to system auth + .authorizationContext(AuthorizationContext.builder().authorizer(Authorizer.EMPTY).build()) + .retrieverContext(retrieverContext) + .objectMapperContext(objectMapperContext) + .validationContext(validationContext) + .build(systemAuthentication, false); + } catch (OperationContextException e) { + throw new RuntimeException(e); + } } @Nonnull private final OperationContextConfig operationContextConfig; @@ -177,7 +209,7 @@ public static OperationContext asSystem( @Nonnull private final EntityRegistryContext entityRegistryContext; @Nullable private final ServicesRegistryContext servicesRegistryContext; @Nullable private final RequestContext requestContext; - @Nullable private final RetrieverContext retrieverContext; + @Nonnull private final RetrieverContext retrieverContext; @Nonnull private final ObjectMapperContext objectMapperContext; @Nonnull private final ValidationContext validationContext; @@ -194,13 +226,15 @@ public OperationContext withLineageFlags( public OperationContext asSession( @Nonnull RequestContext requestContext, @Nonnull Authorizer authorizer, - @Nonnull Authentication sessionAuthentication) { + @Nonnull Authentication sessionAuthentication) + throws ActorAccessException { return OperationContext.asSession( this, requestContext, authorizer, sessionAuthentication, - getOperationContextConfig().isAllowSystemAuthentication()); + getOperationContextConfig().isAllowSystemAuthentication(), + false); } @Nonnull @@ -284,17 +318,9 @@ public AuditStamp getAuditStamp() { return getAuditStamp(null); } - public Optional getRetrieverContext() { - return Optional.ofNullable(retrieverContext); - } - - @Nullable + @Nonnull public AspectRetriever getAspectRetriever() { - return getAspectRetrieverOpt().orElse(null); - } - - public Optional getAspectRetrieverOpt() { - return getRetrieverContext().map(RetrieverContext::getAspectRetriever); + return retrieverContext.getAspectRetriever(); } /** @@ -336,10 +362,7 @@ public String getGlobalContextId() { ? EmptyContext.EMPTY : getServicesRegistryContext()) .add(getRequestContext() == null ? EmptyContext.EMPTY : getRequestContext()) - .add( - getRetrieverContext().isPresent() - ? 
getRetrieverContext().get() - : EmptyContext.EMPTY) + .add(getRetrieverContext()) .add(getObjectMapperContext()) .build() .stream() @@ -364,10 +387,7 @@ public String getSearchContextId() { getServicesRegistryContext() == null ? EmptyContext.EMPTY : getServicesRegistryContext()) - .add( - getRetrieverContext().isPresent() - ? getRetrieverContext().get() - : EmptyContext.EMPTY) + .add(getRetrieverContext()) .build() .stream() .map(ContextInterface::getCacheKeyComponent) @@ -438,6 +458,12 @@ public static class OperationContextBuilder { @Nonnull public OperationContext build(@Nonnull Authentication sessionAuthentication) { + return build(sessionAuthentication, false); + } + + @Nonnull + public OperationContext build( + @Nonnull Authentication sessionAuthentication, boolean skipCache) { final Urn actorUrn = UrnUtils.getUrn(sessionAuthentication.getActor().toUrnStr()); final ActorContext sessionActor = ActorContext.builder() @@ -451,11 +477,20 @@ public OperationContext build(@Nonnull Authentication sessionAuthentication) { .policyInfoSet(this.authorizationContext.getAuthorizer().getActorPolicies(actorUrn)) .groupMembership(this.authorizationContext.getAuthorizer().getActorGroups(actorUrn)) .build(); - return build(sessionActor); + return build(sessionActor, skipCache); } @Nonnull - public OperationContext build(@Nonnull ActorContext sessionActor) { + public OperationContext build(@Nonnull ActorContext sessionActor, boolean skipCache) { + AspectRetriever retriever = + skipCache + ? this.retrieverContext.getAspectRetriever() + : this.retrieverContext.getCachingAspectRetriever(); + + if (!sessionActor.isActive(retriever)) { + throw new ActorAccessException("Actor is not active"); + } + return new OperationContext( this.operationContextConfig, sessionActor, diff --git a/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/RetrieverContext.java b/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/RetrieverContext.java index 9337fbfe3bb003..9afc4138810bb2 100644 --- a/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/RetrieverContext.java +++ b/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/RetrieverContext.java @@ -1,8 +1,10 @@ package io.datahubproject.metadata.context; import com.linkedin.metadata.aspect.AspectRetriever; +import com.linkedin.metadata.aspect.CachingAspectRetriever; import com.linkedin.metadata.aspect.GraphRetriever; import com.linkedin.metadata.entity.SearchRetriever; +import java.util.Objects; import java.util.Optional; import javax.annotation.Nonnull; import lombok.Builder; @@ -15,10 +17,37 @@ public class RetrieverContext @Nonnull private final GraphRetriever graphRetriever; @Nonnull private final AspectRetriever aspectRetriever; + @Nonnull private final CachingAspectRetriever cachingAspectRetriever; @Nonnull private final SearchRetriever searchRetriever; @Override public Optional getCacheKeyComponent() { return Optional.empty(); } + + public static class RetrieverContextBuilder { + public RetrieverContext build() { + if (this.aspectRetriever == null && this.cachingAspectRetriever != null) { + this.aspectRetriever = this.cachingAspectRetriever; + } + + if (this.cachingAspectRetriever == null + && this.aspectRetriever instanceof CachingAspectRetriever) { + this.cachingAspectRetriever = (CachingAspectRetriever) this.aspectRetriever; + } + + return new RetrieverContext( + this.graphRetriever, + Objects.requireNonNull(this.aspectRetriever), + 
Objects.requireNonNull(this.cachingAspectRetriever), + this.searchRetriever); + } + } + + public static final RetrieverContext EMPTY = + RetrieverContext.builder() + .graphRetriever(GraphRetriever.EMPTY) + .searchRetriever(SearchRetriever.EMPTY) + .cachingAspectRetriever(CachingAspectRetriever.EMPTY) + .build(); } diff --git a/metadata-operation-context/src/main/java/io/datahubproject/metadata/exception/ActorAccessException.java b/metadata-operation-context/src/main/java/io/datahubproject/metadata/exception/ActorAccessException.java new file mode 100644 index 00000000000000..bca2594b96430e --- /dev/null +++ b/metadata-operation-context/src/main/java/io/datahubproject/metadata/exception/ActorAccessException.java @@ -0,0 +1,7 @@ +package io.datahubproject.metadata.exception; + +public class ActorAccessException extends OperationContextException { + public ActorAccessException(String string) { + super(string); + } +} diff --git a/metadata-operation-context/src/main/java/io/datahubproject/metadata/exception/OperationContextException.java b/metadata-operation-context/src/main/java/io/datahubproject/metadata/exception/OperationContextException.java new file mode 100644 index 00000000000000..1aac8dc3e60ec9 --- /dev/null +++ b/metadata-operation-context/src/main/java/io/datahubproject/metadata/exception/OperationContextException.java @@ -0,0 +1,9 @@ +package io.datahubproject.metadata.exception; + +public class OperationContextException extends RuntimeException { + public OperationContextException(String message) { + super(message); + } + + public OperationContextException() {} +} diff --git a/metadata-operation-context/src/main/java/io/datahubproject/test/metadata/context/TestOperationContexts.java b/metadata-operation-context/src/main/java/io/datahubproject/test/metadata/context/TestOperationContexts.java index 42de6b7398c616..4abfbb196f067c 100644 --- a/metadata-operation-context/src/main/java/io/datahubproject/test/metadata/context/TestOperationContexts.java +++ b/metadata-operation-context/src/main/java/io/datahubproject/test/metadata/context/TestOperationContexts.java @@ -8,21 +8,17 @@ import com.linkedin.common.urn.Urn; import com.linkedin.data.schema.annotation.PathSpecBasedSchemaAnnotationVisitor; import com.linkedin.entity.Aspect; +import com.linkedin.identity.CorpUserInfo; +import com.linkedin.metadata.Constants; import com.linkedin.metadata.aspect.AspectRetriever; +import com.linkedin.metadata.aspect.CachingAspectRetriever; import com.linkedin.metadata.aspect.GraphRetriever; -import com.linkedin.metadata.aspect.SystemAspect; -import com.linkedin.metadata.aspect.models.graph.RelatedEntitiesScrollResult; import com.linkedin.metadata.entity.SearchRetriever; import com.linkedin.metadata.models.registry.ConfigEntityRegistry; import com.linkedin.metadata.models.registry.EntityRegistry; import com.linkedin.metadata.models.registry.EntityRegistryException; import com.linkedin.metadata.models.registry.MergedEntityRegistry; import com.linkedin.metadata.models.registry.SnapshotEntityRegistry; -import com.linkedin.metadata.query.filter.Filter; -import com.linkedin.metadata.query.filter.RelationshipFilter; -import com.linkedin.metadata.query.filter.SortCriterion; -import com.linkedin.metadata.search.ScrollResult; -import com.linkedin.metadata.search.SearchEntityArray; import com.linkedin.metadata.snapshot.Snapshot; import com.linkedin.metadata.utils.elasticsearch.IndexConvention; import com.linkedin.metadata.utils.elasticsearch.IndexConventionImpl; @@ -32,15 +28,14 @@ import 
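The `RetrieverContextBuilder.build()` override above backfills whichever retriever was omitted: a caching retriever can stand in for the plain one, and a plain retriever that happens to be caching is reused as the caching one. The toy builder below mirrors that backfill logic under simplified, assumed types; it is a sketch, not the DataHub implementation.

```java
import java.util.Objects;

class BackfillBuilderSketch {
  interface AspectRetriever {}
  interface CachingAspectRetriever extends AspectRetriever {}

  static final class Context {
    final AspectRetriever aspectRetriever;
    final CachingAspectRetriever cachingAspectRetriever;

    Context(AspectRetriever a, CachingAspectRetriever c) {
      this.aspectRetriever = Objects.requireNonNull(a);
      this.cachingAspectRetriever = Objects.requireNonNull(c);
    }
  }

  static Context build(AspectRetriever aspect, CachingAspectRetriever caching) {
    if (aspect == null && caching != null) {
      aspect = caching; // a caching retriever satisfies the plain contract
    }
    if (caching == null && aspect instanceof CachingAspectRetriever c) {
      caching = c; // reuse the plain retriever when it is already caching
    }
    return new Context(aspect, caching); // both fields end up non-null or build fails
  }

  public static void main(String[] args) {
    CachingAspectRetriever caching = new CachingAspectRetriever() {};
    Context ctx = build(null, caching);
    System.out.println(ctx.aspectRetriever == ctx.cachingAspectRetriever); // true
  }
}
```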
io.datahubproject.metadata.context.RetrieverContext; import io.datahubproject.metadata.context.ServicesRegistryContext; import io.datahubproject.metadata.context.ValidationContext; -import java.util.List; import java.util.Map; import java.util.Optional; import java.util.Set; import java.util.function.Consumer; import java.util.function.Supplier; +import java.util.stream.Collectors; import javax.annotation.Nonnull; import javax.annotation.Nullable; -import lombok.Builder; /** * Useful for testing. If the defaults are not sufficient, try using the .toBuilder() and replacing @@ -81,26 +76,53 @@ public static EntityRegistry defaultEntityRegistry() { return defaultEntityRegistryInstance; } - public static AspectRetriever emptyAspectRetriever( + public static RetrieverContext emptyActiveUsersRetrieverContext( @Nullable Supplier entityRegistrySupplier) { - return new EmptyAspectRetriever( - () -> - Optional.ofNullable(entityRegistrySupplier) - .map(Supplier::get) - .orElse(defaultEntityRegistry())); - } - public static GraphRetriever emptyGraphRetriever = new EmptyGraphRetriever(); - public static SearchRetriever emptySearchRetriever = new EmptySearchRetriever(); + return RetrieverContext.builder() + .cachingAspectRetriever(emptyActiveUsersAspectRetriever(entityRegistrySupplier)) + .graphRetriever(GraphRetriever.EMPTY) + .searchRetriever(SearchRetriever.EMPTY) + .build(); + } - public static RetrieverContext emptyRetrieverContext( + public static CachingAspectRetriever emptyActiveUsersAspectRetriever( @Nullable Supplier entityRegistrySupplier) { - return RetrieverContext.builder() - .aspectRetriever(emptyAspectRetriever(entityRegistrySupplier)) - .graphRetriever(emptyGraphRetriever) - .searchRetriever(emptySearchRetriever) - .build(); + return new CachingAspectRetriever.EmptyAspectRetriever() { + + @Nonnull + @Override + public Map> getLatestAspectObjects( + Set urns, Set aspectNames) { + if (urns.stream().allMatch(urn -> urn.toString().startsWith("urn:li:corpuser:")) + && aspectNames.contains(Constants.CORP_USER_KEY_ASPECT_NAME)) { + return urns.stream() + .map( + urn -> + Map.entry( + urn, + Map.of( + Constants.CORP_USER_KEY_ASPECT_NAME, + new Aspect( + new CorpUserInfo() + .setActive(true) + .setEmail(urn.getId()) + .setDisplayName(urn.getId()) + .data())))) + .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); + } + return super.getLatestAspectObjects(urns, aspectNames); + } + + @Nonnull + @Override + public EntityRegistry getEntityRegistry() { + return Optional.ofNullable(entityRegistrySupplier) + .map(Supplier::get) + .orElse(defaultEntityRegistry()); + } + }; } public static OperationContext systemContextNoSearchAuthorization( @@ -140,8 +162,10 @@ public static OperationContext systemContextNoSearchAuthorization( RetrieverContext retrieverContext = RetrieverContext.builder() .aspectRetriever(aspectRetriever) - .graphRetriever(emptyGraphRetriever) - .searchRetriever(emptySearchRetriever) + .cachingAspectRetriever( + emptyActiveUsersAspectRetriever(() -> aspectRetriever.getEntityRegistry())) + .graphRetriever(GraphRetriever.EMPTY) + .searchRetriever(SearchRetriever.EMPTY) .build(); return systemContextNoSearchAuthorization( () -> retrieverContext.getAspectRetriever().getEntityRegistry(), @@ -208,7 +232,7 @@ public static OperationContext systemContext( RetrieverContext retrieverContext = Optional.ofNullable(retrieverContextSupplier) .map(Supplier::get) - .orElse(emptyRetrieverContext(entityRegistrySupplier)); + 
.orElse(emptyActiveUsersRetrieverContext(entityRegistrySupplier)); EntityRegistry entityRegistry = Optional.ofNullable(entityRegistrySupplier) @@ -298,66 +322,5 @@ public static OperationContext userContextNoSearchAuthorization( .asSession(requestContext, Authorizer.EMPTY, TEST_USER_AUTH); } - @Builder - public static class EmptyAspectRetriever implements AspectRetriever { - private final Supplier entityRegistrySupplier; - - @Nonnull - @Override - public Map> getLatestAspectObjects( - Set urns, Set aspectNames) { - return Map.of(); - } - - @Nonnull - @Override - public Map> getLatestSystemAspects( - Map> urnAspectNames) { - return Map.of(); - } - - @Nonnull - @Override - public EntityRegistry getEntityRegistry() { - return entityRegistrySupplier.get(); - } - } - - public static class EmptyGraphRetriever implements GraphRetriever { - - @Nonnull - @Override - public RelatedEntitiesScrollResult scrollRelatedEntities( - @Nullable List sourceTypes, - @Nonnull Filter sourceEntityFilter, - @Nullable List destinationTypes, - @Nonnull Filter destinationEntityFilter, - @Nonnull List relationshipTypes, - @Nonnull RelationshipFilter relationshipFilter, - @Nonnull List sortCriterion, - @Nullable String scrollId, - int count, - @Nullable Long startTimeMillis, - @Nullable Long endTimeMillis) { - return new RelatedEntitiesScrollResult(0, 0, null, List.of()); - } - } - - public static class EmptySearchRetriever implements SearchRetriever { - - @Override - public ScrollResult scroll( - @Nonnull List entities, - @Nullable Filter filters, - @Nullable String scrollId, - int count) { - ScrollResult empty = new ScrollResult(); - empty.setEntities(new SearchEntityArray()); - empty.setNumEntities(0); - empty.setPageSize(0); - return empty; - } - } - private TestOperationContexts() {} } diff --git a/metadata-operation-context/src/test/java/io/datahubproject/metadata/context/OperationContextTest.java b/metadata-operation-context/src/test/java/io/datahubproject/metadata/context/OperationContextTest.java index 3e092e20127ee5..f77b244d8f2d86 100644 --- a/metadata-operation-context/src/test/java/io/datahubproject/metadata/context/OperationContextTest.java +++ b/metadata-operation-context/src/test/java/io/datahubproject/metadata/context/OperationContextTest.java @@ -8,6 +8,7 @@ import com.datahub.authentication.Authentication; import com.datahub.plugins.auth.authorization.Authorizer; import com.linkedin.metadata.models.registry.EntityRegistry; +import io.datahubproject.test.metadata.context.TestOperationContexts; import org.testng.annotations.Test; public class OperationContextTest { @@ -25,7 +26,7 @@ public void testSystemPrivilegeEscalation() { mock(EntityRegistry.class), mock(ServicesRegistryContext.class), null, - mock(RetrieverContext.class), + TestOperationContexts.emptyActiveUsersRetrieverContext(null), mock(ValidationContext.class)); OperationContext opContext = diff --git a/metadata-service/auth-impl/src/main/java/com/datahub/authentication/token/StatefulTokenService.java b/metadata-service/auth-impl/src/main/java/com/datahub/authentication/token/StatefulTokenService.java index 6724f35d840adb..a9871f1ed99482 100644 --- a/metadata-service/auth-impl/src/main/java/com/datahub/authentication/token/StatefulTokenService.java +++ b/metadata-service/auth-impl/src/main/java/com/datahub/authentication/token/StatefulTokenService.java @@ -145,7 +145,7 @@ public String generateAccessToken( _entityService.ingestProposal( systemOperationContext, AspectsBatchImpl.builder() - .mcps(List.of(proposal), auditStamp, 
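The `emptyActiveUsersAspectRetriever` helper above synthesizes a `corpUserKey` aspect for any corpuser urn so that test actors pass the new active-user gate. The sketch below models that behavior with plain strings in place of the DataHub aspect classes; it is illustrative only.

```java
import java.util.HashMap;
import java.util.Map;
import java.util.Set;

class ActiveUsersStubSketch {
  // Report a key aspect for every corpuser urn so the activity check passes.
  static Map<String, Set<String>> latestAspects(Set<String> urns, Set<String> aspectNames) {
    Map<String, Set<String>> result = new HashMap<>();
    boolean allUsers = urns.stream().allMatch(u -> u.startsWith("urn:li:corpuser:"));
    if (allUsers && aspectNames.contains("corpUserKey")) {
      for (String urn : urns) {
        result.put(urn, Set.of("corpUserKey")); // pretend the key aspect exists
      }
    }
    return result; // empty map means "nothing found" for non-user urns
  }

  public static void main(String[] args) {
    System.out.println(latestAspects(Set.of("urn:li:corpuser:datahub"), Set.of("corpUserKey")));
  }
}
```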
systemOperationContext.getRetrieverContext().get()) + .mcps(List.of(proposal), auditStamp, systemOperationContext.getRetrieverContext()) .build(), false); diff --git a/metadata-service/configuration/src/main/resources/application.yaml b/metadata-service/configuration/src/main/resources/application.yaml index 9348416606d0a9..75b4c8e8b002f9 100644 --- a/metadata-service/configuration/src/main/resources/application.yaml +++ b/metadata-service/configuration/src/main/resources/application.yaml @@ -522,12 +522,12 @@ cache: entityAspectTTLSeconds: # cache user aspects for 20s corpuser: - corpUserKey: 20 + corpUserKey: 300 # 5 min corpUserInfo: 20 corpUserEditableInfo: 20 - corpUserStatus: 20 + corpUserStatus: 300 # 5 min globalTags: 20 - status: 20 + status: 300 # 5 min corpUserCredentials: 20 corpUserSettings: 20 roleMembership: 20 diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/context/SystemOperationContextFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/context/SystemOperationContextFactory.java index f5235dc3682fce..3e2823591e168c 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/context/SystemOperationContextFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/context/SystemOperationContextFactory.java @@ -45,7 +45,8 @@ protected OperationContext javaSystemOperationContext( @Nonnull final SearchService searchService, @Qualifier("baseElasticSearchComponents") BaseElasticSearchComponentsFactory.BaseElasticSearchComponents components, - @Nonnull final ConfigurationProvider configurationProvider) { + @Nonnull final ConfigurationProvider configurationProvider, + @Qualifier("systemEntityClient") @Nonnull final SystemEntityClient systemEntityClient) { EntityServiceAspectRetriever entityServiceAspectRetriever = EntityServiceAspectRetriever.builder() @@ -53,6 +54,9 @@ protected OperationContext javaSystemOperationContext( .entityService(entityService) .build(); + EntityClientAspectRetriever entityClientAspectRetriever = + EntityClientAspectRetriever.builder().entityClient(systemEntityClient).build(); + SystemGraphRetriever systemGraphRetriever = SystemGraphRetriever.builder().graphService(graphService).build(); @@ -68,6 +72,7 @@ protected OperationContext javaSystemOperationContext( components.getIndexConvention(), RetrieverContext.builder() .aspectRetriever(entityServiceAspectRetriever) + .cachingAspectRetriever(entityClientAspectRetriever) .graphRetriever(systemGraphRetriever) .searchRetriever(searchServiceSearchRetriever) .build(), @@ -76,6 +81,7 @@ protected OperationContext javaSystemOperationContext( configurationProvider.getFeatureFlags().isAlternateMCPValidation()) .build()); + entityClientAspectRetriever.setSystemOperationContext(systemOperationContext); entityServiceAspectRetriever.setSystemOperationContext(systemOperationContext); systemGraphRetriever.setSystemOperationContext(systemOperationContext); searchServiceSearchRetriever.setSystemOperationContext(systemOperationContext); @@ -104,7 +110,7 @@ protected OperationContext restliSystemOperationContext( BaseElasticSearchComponentsFactory.BaseElasticSearchComponents components, @Nonnull final ConfigurationProvider configurationProvider) { - EntityClientAspectRetriever entityServiceAspectRetriever = + EntityClientAspectRetriever entityClientAspectRetriever = EntityClientAspectRetriever.builder().entityClient(systemEntityClient).build(); SystemGraphRetriever systemGraphRetriever = @@ -121,7 +127,7 @@ protected 
OperationContext restliSystemOperationContext( ServicesRegistryContext.builder().restrictedService(restrictedService).build(), components.getIndexConvention(), RetrieverContext.builder() - .aspectRetriever(entityServiceAspectRetriever) + .cachingAspectRetriever(entityClientAspectRetriever) .graphRetriever(systemGraphRetriever) .searchRetriever(searchServiceSearchRetriever) .build(), @@ -130,7 +136,7 @@ protected OperationContext restliSystemOperationContext( configurationProvider.getFeatureFlags().isAlternateMCPValidation()) .build()); - entityServiceAspectRetriever.setSystemOperationContext(systemOperationContext); + entityClientAspectRetriever.setSystemOperationContext(systemOperationContext); systemGraphRetriever.setSystemOperationContext(systemOperationContext); searchServiceSearchRetriever.setSystemOperationContext(systemOperationContext); diff --git a/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/steps/IngestDataPlatformInstancesStep.java b/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/steps/IngestDataPlatformInstancesStep.java index 22ce06a5984ea6..c04dd25ccd4ac9 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/steps/IngestDataPlatformInstancesStep.java +++ b/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/steps/IngestDataPlatformInstancesStep.java @@ -84,14 +84,14 @@ public void execute(@Nonnull OperationContext systemOperationContext) throws Exc .aspectName(DATA_PLATFORM_INSTANCE_ASPECT_NAME) .recordTemplate(dataPlatformInstance.get()) .auditStamp(aspectAuditStamp) - .build(systemOperationContext.getAspectRetrieverOpt().get())); + .build(systemOperationContext.getAspectRetriever())); } } _entityService.ingestAspects( systemOperationContext, AspectsBatchImpl.builder() - .retrieverContext(systemOperationContext.getRetrieverContext().get()) + .retrieverContext(systemOperationContext.getRetrieverContext()) .items(items) .build(), true, diff --git a/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/steps/IngestPoliciesStep.java b/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/steps/IngestPoliciesStep.java index eb6bfe17ac198e..dac2879487469c 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/steps/IngestPoliciesStep.java +++ b/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/steps/IngestPoliciesStep.java @@ -225,7 +225,7 @@ private void ingestPolicy( new AuditStamp() .setActor(Urn.createFromString(Constants.SYSTEM_ACTOR)) .setTime(System.currentTimeMillis()), - systemOperationContext.getRetrieverContext().get()) + systemOperationContext.getRetrieverContext()) .build(), false); } diff --git a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/GlobalControllerExceptionHandler.java b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/config/GlobalControllerExceptionHandler.java similarity index 81% rename from metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/GlobalControllerExceptionHandler.java rename to metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/config/GlobalControllerExceptionHandler.java index ba0a426fa20e89..c756827cad56ba 100644 --- a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/GlobalControllerExceptionHandler.java +++ b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/config/GlobalControllerExceptionHandler.java @@ -1,9 +1,11 @@ -package 
io.datahubproject.openapi; +package io.datahubproject.openapi.config; import com.linkedin.metadata.dao.throttle.APIThrottleException; +import io.datahubproject.metadata.exception.ActorAccessException; import io.datahubproject.openapi.exception.InvalidUrnException; import io.datahubproject.openapi.exception.UnauthorizedException; import java.util.Map; +import javax.annotation.PostConstruct; import lombok.extern.slf4j.Slf4j; import org.springframework.beans.ConversionNotSupportedException; import org.springframework.core.Ordered; @@ -19,6 +21,11 @@ @ControllerAdvice public class GlobalControllerExceptionHandler extends DefaultHandlerExceptionResolver { + @PostConstruct + public void init() { + log.info("GlobalControllerExceptionHandler initialized"); + } + public GlobalControllerExceptionHandler() { setOrder(Ordered.HIGHEST_PRECEDENCE); setWarnLogCategory(getClass().getName()); @@ -52,4 +59,9 @@ public static ResponseEntity> handleUnauthorizedException( UnauthorizedException e) { return new ResponseEntity<>(Map.of("error", e.getMessage()), HttpStatus.FORBIDDEN); } + + @ExceptionHandler(ActorAccessException.class) + public static ResponseEntity> actorAccessException(ActorAccessException e) { + return new ResponseEntity<>(Map.of("error", e.getMessage()), HttpStatus.FORBIDDEN); + } } diff --git a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/controller/GenericEntitiesController.java b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/controller/GenericEntitiesController.java index 579a62c084999a..592d7bba4211fe 100644 --- a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/controller/GenericEntitiesController.java +++ b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/controller/GenericEntitiesController.java @@ -637,7 +637,7 @@ public ResponseEntity createAspect( AspectSpec aspectSpec = lookupAspectSpec(entitySpec, aspectName).get(); ChangeMCP upsert = toUpsertItem( - opContext.getRetrieverContext().get().getAspectRetriever(), + opContext.getRetrieverContext().getAspectRetriever(), urn, aspectSpec, createIfEntityNotExists, @@ -649,7 +649,7 @@ public ResponseEntity createAspect( entityService.ingestProposal( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(List.of(upsert)) .build(), async); @@ -725,7 +725,7 @@ public ResponseEntity patchAspect( .build(); ChangeMCP upsert = toUpsertItem( - opContext.getRetrieverContext().get().getAspectRetriever(), + opContext.getRetrieverContext().getAspectRetriever(), validatedUrn(entityUrn), aspectSpec, currentValue, @@ -736,7 +736,7 @@ public ResponseEntity patchAspect( entityService.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(List.of(upsert)) .build(), true, diff --git a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/operations/test/IdController.java b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/operations/test/IdController.java new file mode 100644 index 00000000000000..99d3879ab9a320 --- /dev/null +++ b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/operations/test/IdController.java @@ -0,0 +1,54 @@ +package io.datahubproject.openapi.operations.test; + +import com.datahub.authentication.Authentication; +import 
com.datahub.authentication.AuthenticationContext; +import com.datahub.authorization.AuthorizerChain; +import io.datahubproject.metadata.context.OperationContext; +import io.datahubproject.metadata.context.RequestContext; +import io.swagger.v3.oas.annotations.Operation; +import io.swagger.v3.oas.annotations.tags.Tag; +import jakarta.servlet.http.HttpServletRequest; +import java.util.List; +import java.util.Map; +import lombok.extern.slf4j.Slf4j; +import org.springframework.http.MediaType; +import org.springframework.http.ResponseEntity; +import org.springframework.web.bind.annotation.GetMapping; +import org.springframework.web.bind.annotation.RequestMapping; +import org.springframework.web.bind.annotation.RequestParam; +import org.springframework.web.bind.annotation.RestController; + +@RestController +@RequestMapping("/operations/identity") +@Slf4j +@Tag(name = "Identity", description = "An API for checking identity") +public class IdController { + private final AuthorizerChain authorizerChain; + private final OperationContext systemOperationContext; + + public IdController(OperationContext systemOperationContext, AuthorizerChain authorizerChain) { + this.systemOperationContext = systemOperationContext; + this.authorizerChain = authorizerChain; + } + + @Tag(name = "User") + @GetMapping(path = "/user/urn", produces = MediaType.APPLICATION_JSON_VALUE) + @Operation(summary = "User id") + public ResponseEntity> getUserId( + HttpServletRequest request, + @RequestParam(value = "skipCache", required = false, defaultValue = "false") + Boolean skipCache) { + Authentication authentication = AuthenticationContext.getAuthentication(); + String actorUrnStr = authentication.getActor().toUrnStr(); + + OperationContext.asSession( + systemOperationContext, + RequestContext.builder().buildOpenapi(actorUrnStr, request, "getUserIdentity", List.of()), + authorizerChain, + authentication, + true, + skipCache); + + return ResponseEntity.ok(Map.of("urn", actorUrnStr)); + } +} diff --git a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/util/MappingUtil.java b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/util/MappingUtil.java index c38f2db0eefbb3..ca425810c87a09 100644 --- a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/util/MappingUtil.java +++ b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/util/MappingUtil.java @@ -491,7 +491,7 @@ public static List> ingestBatchProposal( try { AspectsBatch batch = AspectsBatchImpl.builder() - .mcps(serviceProposals, auditStamp, opContext.getRetrieverContext().get()) + .mcps(serviceProposals, auditStamp, opContext.getRetrieverContext()) .build(); Map> resultMap = diff --git a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v2/controller/EntityController.java b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v2/controller/EntityController.java index 56a7955b9fe871..b1c5709ef01470 100644 --- a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v2/controller/EntityController.java +++ b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v2/controller/EntityController.java @@ -203,7 +203,7 @@ protected AspectsBatch toMCPBatch( objectMapper.writeValueAsString(aspect.getValue().get("systemMetadata")))); } - items.add(builder.build(opContext.getAspectRetrieverOpt().get())); + items.add(builder.build(opContext.getAspectRetriever())); } } } @@ -211,7 +211,7 @@ protected AspectsBatch 
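The new `IdController` above exercises session construction, which now throws `ActorAccessException` for inactive actors, and the handler added earlier maps that exception to HTTP 403. A simplified sketch of that request flow follows; the types are hypothetical stand-ins for the DataHub classes.

```java
import java.util.Map;

class IdentityGateSketch {
  static class ActorAccessException extends RuntimeException {
    ActorAccessException(String m) { super(m); }
  }

  // Stand-in for OperationContext.asSession(...): throws when the actor is inactive.
  static void asSession(boolean actorActive) {
    if (!actorActive) {
      throw new ActorAccessException("Actor is not active");
    }
  }

  static Map<String, Object> handleGetUserUrn(String actorUrn, boolean actorActive) {
    try {
      asSession(actorActive);
      return Map.of("status", 200, "body", Map.of("urn", actorUrn));
    } catch (ActorAccessException e) {
      // mirrors the @ExceptionHandler mapping to HttpStatus.FORBIDDEN
      return Map.of("status", 403, "body", Map.of("error", e.getMessage()));
    }
  }

  public static void main(String[] args) {
    System.out.println(handleGetUserUrn("urn:li:corpuser:datahub", true)); // 200
    System.out.println(handleGetUserUrn("urn:li:corpuser:ghost", false));  // 403
  }
}
```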
toMCPBatch( return AspectsBatchImpl.builder() .items(items) - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .build(); } diff --git a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v3/controller/EntityController.java b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v3/controller/EntityController.java index ce7fd73f99b9e5..af13cd3aab0510 100644 --- a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v3/controller/EntityController.java +++ b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v3/controller/EntityController.java @@ -554,14 +554,14 @@ protected AspectsBatch toMCPBatch( GenericRecordUtils.JSON, aspectSpec)); - items.add(builder.build(opContext.getRetrieverContext().get().getAspectRetriever())); + items.add(builder.build(opContext.getRetrieverContext().getAspectRetriever())); } } } } return AspectsBatchImpl.builder() .items(items) - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .build(); } diff --git a/metadata-service/restli-api/src/main/idl/com.linkedin.entity.entitiesV2.restspec.json b/metadata-service/restli-api/src/main/idl/com.linkedin.entity.entitiesV2.restspec.json index 33cfba0f27802c..27731af9ffaa71 100644 --- a/metadata-service/restli-api/src/main/idl/com.linkedin.entity.entitiesV2.restspec.json +++ b/metadata-service/restli-api/src/main/idl/com.linkedin.entity.entitiesV2.restspec.json @@ -19,6 +19,10 @@ "name" : "aspects", "type" : "{ \"type\" : \"array\", \"items\" : \"string\" }", "optional" : true + }, { + "name" : "alwaysIncludeKeyAspect", + "type" : "boolean", + "optional" : true } ] }, { "method" : "batch_get", @@ -27,6 +31,10 @@ "name" : "aspects", "type" : "{ \"type\" : \"array\", \"items\" : \"string\" }", "optional" : true + }, { + "name" : "alwaysIncludeKeyAspect", + "type" : "boolean", + "optional" : true } ] } ], "entity" : { diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entitiesV2.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entitiesV2.snapshot.json index 9bf7f97b34be18..9c5f41281fcfbe 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entitiesV2.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entitiesV2.snapshot.json @@ -182,6 +182,10 @@ "name" : "aspects", "type" : "{ \"type\" : \"array\", \"items\" : \"string\" }", "optional" : true + }, { + "name" : "alwaysIncludeKeyAspect", + "type" : "boolean", + "optional" : true } ] }, { "method" : "batch_get", @@ -190,6 +194,10 @@ "name" : "aspects", "type" : "{ \"type\" : \"array\", \"items\" : \"string\" }", "optional" : true + }, { + "name" : "alwaysIncludeKeyAspect", + "type" : "boolean", + "optional" : true } ] } ], "entity" : { diff --git a/metadata-service/restli-client-api/src/main/java/com/linkedin/entity/client/EntityClient.java b/metadata-service/restli-client-api/src/main/java/com/linkedin/entity/client/EntityClient.java index cf6e571cb8cbeb..b85f22e781d0b0 100644 --- a/metadata-service/restli-client-api/src/main/java/com/linkedin/entity/client/EntityClient.java +++ b/metadata-service/restli-client-api/src/main/java/com/linkedin/entity/client/EntityClient.java @@ -45,12 +45,34 @@ // Consider renaming this to datahub client. 
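The `EntityClient` interface that follows keeps the old `getV2`/`batchGetV2` signatures as deprecated default methods that delegate with `alwaysIncludeKeyAspect=true`, so legacy callers keep legacy semantics while new call sites opt out explicitly. The toy interface below shows the same compatibility pattern in isolation; names are invented for illustration.

```java
interface FetcherSketch {
  /** Legacy entry point: preserves the old always-include-key-aspect behavior. */
  @Deprecated
  default String fetch(String urn) {
    return fetch(urn, true); // old callers are unchanged
  }

  // New entry point: callers state the key-aspect behavior explicitly.
  String fetch(String urn, boolean alwaysIncludeKeyAspect);

  static void main(String[] args) {
    FetcherSketch f = (urn, always) -> urn + " keyAspect=" + always;
    System.out.println(f.fetch("urn:li:corpuser:datahub"));        // legacy path, true
    System.out.println(f.fetch("urn:li:corpuser:datahub", false)); // new path, explicit
  }
}
```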
public interface EntityClient { + /** + * This version follows the legacy behavior of returning key aspects regardless of whether they + * exist + * + * @param opContext operation context + * @param entityName entity type + * @param urn urn id for the entity + * @param aspectNames set of aspects + * @return requested entity/aspects + */ + @Deprecated @Nullable - EntityResponse getV2( + default EntityResponse getV2( @Nonnull OperationContext opContext, @Nonnull String entityName, @Nonnull final Urn urn, @Nullable final Set aspectNames) + throws RemoteInvocationException, URISyntaxException { + return getV2(opContext, entityName, urn, aspectNames, true); + } + + @Nullable + EntityResponse getV2( + @Nonnull OperationContext opContext, + @Nonnull String entityName, + @Nonnull final Urn urn, + @Nullable final Set aspectNames, + @Nullable Boolean alwaysIncludeKeyAspect) throws RemoteInvocationException, URISyntaxException; @Nonnull @@ -58,12 +80,34 @@ EntityResponse getV2( Entity get(@Nonnull OperationContext opContext, @Nonnull final Urn urn) throws RemoteInvocationException; + /** + * This version follows the legacy behavior of returning key aspects regardless of whether they + * exist + * + * @param opContext operation context + * @param entityName entity type + * @param urns urn ids for the entities + * @param aspectNames set of aspects + * @return requested entity/aspects + */ + @Deprecated @Nonnull - Map batchGetV2( + default Map batchGetV2( @Nonnull OperationContext opContext, @Nonnull String entityName, @Nonnull final Set urns, @Nullable final Set aspectNames) + throws RemoteInvocationException, URISyntaxException { + return batchGetV2(opContext, entityName, urns, aspectNames, true); + } + + @Nonnull + Map batchGetV2( + @Nonnull OperationContext opContext, + @Nonnull String entityName, + @Nonnull final Set urns, + @Nullable final Set aspectNames, + @Nullable Boolean alwaysIncludeKeyAspect) throws RemoteInvocationException, URISyntaxException; @Nonnull @@ -589,27 +633,38 @@ void rollbackIngestion( @Nullable default Aspect getLatestAspectObject( - @Nonnull OperationContext opContext, @Nonnull Urn urn, @Nonnull String aspectName) + @Nonnull OperationContext opContext, + @Nonnull Urn urn, + @Nonnull String aspectName, + @Nullable Boolean alwaysIncludeKeyAspect) throws RemoteInvocationException, URISyntaxException { - return getLatestAspects(opContext, Set.of(urn), Set.of(aspectName)) + return getLatestAspects(opContext, Set.of(urn), Set.of(aspectName), alwaysIncludeKeyAspect) .getOrDefault(urn, Map.of()) .get(aspectName); } @Nonnull default Map> getLatestAspects( - @Nonnull OperationContext opContext, @Nonnull Set urns, @Nonnull Set aspectNames) + @Nonnull OperationContext opContext, + @Nonnull Set urns, + @Nonnull Set aspectNames, + @Nullable Boolean alwaysIncludeKeyAspect) throws RemoteInvocationException, URISyntaxException { String entityName = urns.stream().findFirst().map(Urn::getEntityType).get(); - return entityResponseToAspectMap(batchGetV2(opContext, entityName, urns, aspectNames)); + return entityResponseToAspectMap( + batchGetV2(opContext, entityName, urns, aspectNames, alwaysIncludeKeyAspect)); } @Nonnull default Map> getLatestSystemAspect( - @Nonnull OperationContext opContext, @Nonnull Set urns, @Nonnull Set aspectNames) + @Nonnull OperationContext opContext, + @Nonnull Set urns, + @Nonnull Set aspectNames, + @Nullable Boolean alwaysIncludeKeyAspect) throws RemoteInvocationException, URISyntaxException { String entityName = 
urns.stream().findFirst().map(Urn::getEntityType).get(); return entityResponseToSystemAspectMap( - batchGetV2(opContext, entityName, urns, aspectNames), opContext.getEntityRegistry()); + batchGetV2(opContext, entityName, urns, aspectNames, alwaysIncludeKeyAspect), + opContext.getEntityRegistry()); } } diff --git a/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/RestliEntityClient.java b/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/RestliEntityClient.java index 516902601f08a1..8d4c5e9228a71c 100644 --- a/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/RestliEntityClient.java +++ b/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/RestliEntityClient.java @@ -156,10 +156,15 @@ public EntityResponse getV2( @Nonnull OperationContext opContext, @Nonnull String entityName, @Nonnull final Urn urn, - @Nullable final Set aspectNames) + @Nullable final Set aspectNames, + @Nullable Boolean alwaysIncludeKeyAspect) throws RemoteInvocationException, URISyntaxException { final EntitiesV2GetRequestBuilder requestBuilder = - ENTITIES_V2_REQUEST_BUILDERS.get().aspectsParam(aspectNames).id(urn.toString()); + ENTITIES_V2_REQUEST_BUILDERS + .get() + .aspectsParam(aspectNames) + .id(urn.toString()) + .alwaysIncludeKeyAspectParam(alwaysIncludeKeyAspect); return sendClientRequest(requestBuilder, opContext.getSessionAuthentication()).getEntity(); } @@ -241,7 +246,8 @@ public Map batchGetV2( @Nonnull OperationContext opContext, @Nonnull String entityName, @Nonnull final Set urns, - @Nullable final Set aspectNames) + @Nullable final Set aspectNames, + @Nullable Boolean alwaysIncludeKeyAspect) throws RemoteInvocationException, URISyntaxException { Map responseMap = new HashMap<>(); @@ -260,6 +266,7 @@ public Map batchGetV2( ENTITIES_V2_REQUEST_BUILDERS .batchGet() .aspectsParam(aspectNames) + .alwaysIncludeKeyAspectParam(alwaysIncludeKeyAspect) .ids( batch.stream() .map(Urn::toString) diff --git a/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/SystemRestliEntityClient.java b/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/SystemRestliEntityClient.java index 2637e2d067c6d5..aa17f1951bc912 100644 --- a/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/SystemRestliEntityClient.java +++ b/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/SystemRestliEntityClient.java @@ -59,6 +59,6 @@ public Map batchGetV2NoCache( @Nonnull Set urns, @Nullable Set aspectNames) throws RemoteInvocationException, URISyntaxException { - return super.batchGetV2(opContext, entityName, urns, aspectNames); + return super.batchGetV2(opContext, entityName, urns, aspectNames, false); } } diff --git a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/AspectResource.java b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/AspectResource.java index 6033ead36f10ec..30b187da00e91a 100644 --- a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/AspectResource.java +++ b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/AspectResource.java @@ -309,7 +309,7 @@ private Task ingestProposals( log.debug("Proposals: {}", metadataChangeProposals); try { final AspectsBatch batch = AspectsBatchImpl.builder() - .mcps(metadataChangeProposals, auditStamp, opContext.getRetrieverContext().get(), + .mcps(metadataChangeProposals, 
auditStamp, opContext.getRetrieverContext(), opContext.getValidationContext().isAlternateValidation()) .build(); diff --git a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/EntityV2Resource.java b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/EntityV2Resource.java index 20209ddf44d643..896d81d3cbecc3 100644 --- a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/EntityV2Resource.java +++ b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/EntityV2Resource.java @@ -64,7 +64,8 @@ public class EntityV2Resource extends CollectionResourceTaskTemplate get( - @Nonnull String urnStr, @QueryParam(PARAM_ASPECTS) @Optional @Nullable String[] aspectNames) + @Nonnull String urnStr, @QueryParam(PARAM_ASPECTS) @Optional @Nullable String[] aspectNames, + @QueryParam(PARAM_ALWAYS_INCLUDE_KEY_ASPECT) @Optional @Nullable Boolean alwaysIncludeKeyAspect) throws URISyntaxException { log.debug("GET V2 {}", urnStr); final Urn urn = Urn.createFromString(urnStr); @@ -90,7 +91,7 @@ public Task get( ? opContext.getEntityAspectNames(entityName) : new HashSet<>(Arrays.asList(aspectNames)); try { - return _entityService.getEntityV2(opContext, entityName, urn, projectedAspects); + return _entityService.getEntityV2(opContext, entityName, urn, projectedAspects, alwaysIncludeKeyAspect == null || alwaysIncludeKeyAspect); } catch (Exception e) { throw new RuntimeException( String.format( @@ -106,7 +107,8 @@ public Task get( @WithSpan public Task> batchGet( @Nonnull Set urnStrs, - @QueryParam(PARAM_ASPECTS) @Optional @Nullable String[] aspectNames) + @QueryParam(PARAM_ASPECTS) @Optional @Nullable String[] aspectNames, + @QueryParam(PARAM_ALWAYS_INCLUDE_KEY_ASPECT) @Optional @Nullable Boolean alwaysIncludeKeyAspect) throws URISyntaxException { log.debug("BATCH GET V2 {}", urnStrs.toString()); final Set urns = new HashSet<>(); @@ -138,7 +140,7 @@ public Task> batchGet( ? 
opContext.getEntityAspectNames(entityName) : new HashSet<>(Arrays.asList(aspectNames)); try { - return _entityService.getEntitiesV2(opContext, entityName, urns, projectedAspects); + return _entityService.getEntitiesV2(opContext, entityName, urns, projectedAspects, alwaysIncludeKeyAspect == null || alwaysIncludeKeyAspect); } catch (Exception e) { throw new RuntimeException( String.format( diff --git a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/restli/RestliConstants.java b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/restli/RestliConstants.java index ef79a404c2145e..11df52ad66709e 100644 --- a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/restli/RestliConstants.java +++ b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/restli/RestliConstants.java @@ -1,5 +1,7 @@ package com.linkedin.metadata.resources.restli; +import javax.annotation.Nullable; + public final class RestliConstants { private RestliConstants() {} @@ -21,6 +23,7 @@ private RestliConstants() {} public static final String PARAM_INPUT = "input"; public static final String PARAM_MAX_HOPS = "maxHops"; public static final String PARAM_ASPECTS = "aspects"; + public static final String PARAM_ALWAYS_INCLUDE_KEY_ASPECT = "alwaysIncludeKeyAspect"; public static final String PARAM_FILTER = "filter"; public static final String PARAM_GROUP = "group"; public static final String PARAM_SORT = "sort"; diff --git a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/restli/RestliUtils.java b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/restli/RestliUtils.java index 185874fac1382d..a2092405da3ff6 100644 --- a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/restli/RestliUtils.java +++ b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/restli/RestliUtils.java @@ -8,6 +8,7 @@ import com.linkedin.parseq.Task; import com.linkedin.restli.common.HttpStatus; import com.linkedin.restli.server.RestLiServiceException; +import io.datahubproject.metadata.exception.ActorAccessException; import java.util.Optional; import java.util.function.Supplier; import javax.annotation.Nonnull; @@ -38,6 +39,8 @@ public static Task toTask(@Nonnull Supplier supplier) { if (throwable instanceof IllegalArgumentException || throwable.getCause() instanceof IllegalArgumentException) { finalException = badRequestException(throwable.getMessage()); + } else if (throwable.getCause() instanceof ActorAccessException) { + finalException = forbidden(throwable.getCause().getMessage()); } else if (throwable instanceof APIThrottleException) { finalException = apiThrottled(throwable.getMessage()); } else if (throwable instanceof RestLiServiceException) { @@ -109,4 +112,9 @@ public static RestLiServiceException invalidArgumentsException(@Nullable String public static RestLiServiceException apiThrottled(@Nullable String message) { return new RestLiServiceException(HttpStatus.S_429_TOO_MANY_REQUESTS, message); } + + @Nonnull + public static RestLiServiceException forbidden(@Nullable String message) { + return new RestLiServiceException(HttpStatus.S_403_FORBIDDEN, message); + } } diff --git a/metadata-service/restli-servlet-impl/src/test/java/com/linkedin/metadata/resources/entity/AspectResourceTest.java 
diff --git a/metadata-service/restli-servlet-impl/src/test/java/com/linkedin/metadata/resources/entity/AspectResourceTest.java b/metadata-service/restli-servlet-impl/src/test/java/com/linkedin/metadata/resources/entity/AspectResourceTest.java
index a39401c170a114..037b5b81fd4df0 100644
--- a/metadata-service/restli-servlet-impl/src/test/java/com/linkedin/metadata/resources/entity/AspectResourceTest.java
+++ b/metadata-service/restli-servlet-impl/src/test/java/com/linkedin/metadata/resources/entity/AspectResourceTest.java
@@ -100,7 +100,7 @@ public void testAsyncDefaultAspects() throws URISyntaxException {
             .recordTemplate(mcp.getAspect())
             .auditStamp(new AuditStamp())
             .metadataChangeProposal(mcp)
-            .build(opContext.getAspectRetrieverOpt().get());
+            .build(opContext.getAspectRetriever());
     when(aspectDao.runInTransactionWithRetry(any(), any(), anyInt()))
         .thenReturn(
             List.of(List.of(
diff --git a/smoke-test/tests/tokens/revokable_access_token_test.py b/smoke-test/tests/tokens/revokable_access_token_test.py
index af29437c051e19..006daae39333ed 100644
--- a/smoke-test/tests/tokens/revokable_access_token_test.py
+++ b/smoke-test/tests/tokens/revokable_access_token_test.py
@@ -9,6 +9,8 @@
     wait_for_writes_to_sync,
 )

+from .token_utils import listUsers, removeUser
+
 pytestmark = pytest.mark.no_cypress_suite1

 # Disable telemetry
@@ -490,45 +492,3 @@ def getAccessTokenMetadata(session, token):

     response.raise_for_status()
     return response.json()
-
-
-def removeUser(session, urn):
-    # Remove user
-    json = {
-        "query": """mutation removeUser($urn: String!) {
-            removeUser(urn: $urn)
-        }""",
-        "variables": {"urn": urn},
-    }
-
-    response = session.post(f"{get_frontend_url()}/api/v2/graphql", json=json)
-
-    response.raise_for_status()
-    return response.json()
-
-
-def listUsers(session):
-    input = {
-        "start": "0",
-        "count": "20",
-    }
-
-    # list users
-    json = {
-        "query": """query listUsers($input: ListUsersInput!) {
-            listUsers(input: $input) {
-                start
-                count
-                total
-                users {
-                    username
-                }
-            }
-        }""",
-        "variables": {"input": input},
-    }
-
-    response = session.post(f"{get_frontend_url()}/api/v2/graphql", json=json)
-
-    response.raise_for_status()
-    return response.json()
diff --git a/smoke-test/tests/tokens/session_access_token_test.py b/smoke-test/tests/tokens/session_access_token_test.py
new file mode 100644
index 00000000000000..a16abc44453036
--- /dev/null
+++ b/smoke-test/tests/tokens/session_access_token_test.py
@@ -0,0 +1,173 @@
+import os
+import time
+
+import pytest
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.metadata.schema_classes import AuditStampClass, CorpUserStatusClass
+from requests.exceptions import HTTPError
+
+from tests.utils import (
+    get_admin_credentials,
+    get_frontend_url,
+    login_as,
+    wait_for_writes_to_sync,
+)
+
+from .token_utils import getUserId, listUsers, removeUser
+
+pytestmark = pytest.mark.no_cypress_suite1
+
+# Disable telemetry
+os.environ["DATAHUB_TELEMETRY_ENABLED"] = "false"
+
+(admin_user, admin_pass) = get_admin_credentials()
+user_urn = "urn:li:corpuser:sessionUser"
+
+
+@pytest.fixture(scope="class")
+def custom_user_session():
+    """Fixture to execute setup before and tear down after all tests are run"""
+    admin_session = login_as(admin_user, admin_pass)
+
+    res_data = removeUser(admin_session, user_urn)
+    assert res_data
+    assert "error" not in res_data
+
+    # Test getting the invite token
+    get_invite_token_json = {
+        "query": """query getInviteToken($input: GetInviteTokenInput!) {
+            getInviteToken(input: $input){
+              inviteToken
+            }
+        }""",
+        "variables": {"input": {}},
+    }
+
+    get_invite_token_response = admin_session.post(
+        f"{get_frontend_url()}/api/v2/graphql", json=get_invite_token_json
+    )
+    get_invite_token_response.raise_for_status()
+    get_invite_token_res_data = get_invite_token_response.json()
+
+    assert get_invite_token_res_data
+    assert get_invite_token_res_data["data"]
+    invite_token = get_invite_token_res_data["data"]["getInviteToken"]["inviteToken"]
+    assert invite_token is not None
+    assert "error" not in invite_token
+
+    # Pass the invite token when creating the user
+    sign_up_json = {
+        "fullName": "Test Session User",
+        "email": "sessionUser",
+        "password": "sessionUser",
+        "title": "Data Engineer",
+        "inviteToken": invite_token,
+    }
+
+    sign_up_response = admin_session.post(
+        f"{get_frontend_url()}/signUp", json=sign_up_json
+    )
+    sign_up_response.raise_for_status()
+    assert sign_up_response
+    assert "error" not in sign_up_response
+    # Sleep for eventual consistency
+    wait_for_writes_to_sync()
+
+    # signUp overrides the session cookie with the newly signed-up user, so log
+    # back in as the admin.
+    admin_session.cookies.clear()
+    admin_session = login_as(admin_user, admin_pass)
+
+    # Make sure the created user is there.
+    res_data = listUsers(admin_session)
+    assert res_data["data"]
+    assert res_data["data"]["listUsers"]
+    assert {"username": "sessionUser"} in res_data["data"]["listUsers"]["users"]
+
+    yield login_as(sign_up_json["email"], sign_up_json["password"])
+
+    # Delete created user
+    res_data = removeUser(admin_session, user_urn)
+    assert res_data
+    assert res_data["data"]
+    assert res_data["data"]["removeUser"] is True
+    # Sleep for eventual consistency
+    wait_for_writes_to_sync()
+
+    # Make sure the created user is not there.
+    res_data = listUsers(admin_session)
+    assert res_data["data"]
+    assert res_data["data"]["listUsers"]
+    assert {"username": "sessionUser"} not in res_data["data"]["listUsers"]["users"]
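For orientation before the tests: soft delete is implemented as the `status` aspect, so `graph_client.soft_delete_entity(urn)` effectively writes `removed=True`, and `set_soft_delete_status(urn, delete=False)` writes it back. A rough standalone equivalent using the SDK directly, where the server address is a placeholder:

```python
# Sketch: what soft delete boils down to, emitting a Status aspect via the SDK.
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.ingestion.graph.client import DataHubGraph, DatahubClientConfig
from datahub.metadata.schema_classes import StatusClass

graph = DataHubGraph(DatahubClientConfig(server="http://localhost:8080"))  # assumed address
graph.emit(
    MetadataChangeProposalWrapper(
        entityUrn="urn:li:corpuser:sessionUser",
        aspect=StatusClass(removed=True),  # removed=False undoes the soft delete
    )
)
```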
+
+
+@pytest.mark.dependency()
+def test_soft_delete(graph_client, custom_user_session):
+    # assert initial access
+    assert getUserId(custom_user_session) == {"urn": user_urn}
+
+    graph_client.soft_delete_entity(urn=user_urn)
+    wait_for_writes_to_sync()
+
+    with pytest.raises(HTTPError) as req_info:
+        getUserId(custom_user_session)
+    assert "403 Client Error: Forbidden" in str(req_info.value)
+
+    # undo soft delete
+    graph_client.set_soft_delete_status(urn=user_urn, delete=False)
+    wait_for_writes_to_sync()
+
+
+@pytest.mark.dependency(depends=["test_soft_delete"])
+def test_suspend(graph_client, custom_user_session):
+    # assert initial access
+    assert getUserId(custom_user_session) == {"urn": user_urn}
+
+    graph_client.emit(
+        MetadataChangeProposalWrapper(
+            entityType="corpuser",
+            entityUrn=user_urn,
+            changeType="UPSERT",
+            aspectName="corpUserStatus",
+            aspect=CorpUserStatusClass(
+                status="SUSPENDED",
+                lastModified=AuditStampClass(
+                    time=int(time.time() * 1000.0), actor="urn:li:corpuser:unknown"
+                ),
+            ),
+        )
+    )
+    wait_for_writes_to_sync()
+
+    with pytest.raises(HTTPError) as req_info:
+        getUserId(custom_user_session)
+    assert "403 Client Error: Forbidden" in str(req_info.value)
+
+    # undo suspend
+    graph_client.emit(
+        MetadataChangeProposalWrapper(
+            entityType="corpuser",
+            entityUrn=user_urn,
+            changeType="UPSERT",
+            aspectName="corpUserStatus",
+            aspect=CorpUserStatusClass(
+                status="ACTIVE",
+                lastModified=AuditStampClass(
+                    time=int(time.time() * 1000.0), actor="urn:li:corpuser:unknown"
+                ),
+            ),
+        )
+    )
+    wait_for_writes_to_sync()
+
+
+@pytest.mark.dependency(depends=["test_suspend"])
+def test_hard_delete(graph_client, custom_user_session):
+    # assert initial access
+    assert getUserId(custom_user_session) == {"urn": user_urn}
+
+    graph_client.hard_delete_entity(urn=user_urn)
+    wait_for_writes_to_sync()
+
+    with pytest.raises(HTTPError) as req_info:
+        getUserId(custom_user_session)
+    assert "403 Client Error: Forbidden" in str(req_info.value)
diff --git a/smoke-test/tests/tokens/token_utils.py b/smoke-test/tests/tokens/token_utils.py
new file mode 100644
index 00000000000000..10558e7085de72
--- /dev/null
+++ b/smoke-test/tests/tokens/token_utils.py
@@ -0,0 +1,53 @@
+from tests.utils import get_frontend_url
+
+
+def getUserId(session):
+    # Resolve the urn of the actor behind the current session cookie.
+    response = session.get(
+        f"{get_frontend_url()}/openapi/operations/identity/user/urn",
+        params={"skipCache": "true"},
+    )
+
+    response.raise_for_status()
+    return response.json()
+
+
+def removeUser(session, urn):
+    # Remove user
+    json = {
+        "query": """mutation removeUser($urn: String!) {
+            removeUser(urn: $urn)
+        }""",
+        "variables": {"urn": urn},
+    }
+
+    response = session.post(f"{get_frontend_url()}/api/v2/graphql", json=json)
+
+    response.raise_for_status()
+    return response.json()
+
+
+def listUsers(session):
+    input = {
+        "start": "0",
+        "count": "20",
+    }
+
+    # list users
+    json = {
+        "query": """query listUsers($input: ListUsersInput!) {
+            listUsers(input: $input) {
+                start
+                count
+                total
+                users {
+                    username
+                }
+            }
+        }""",
+        "variables": {"input": input},
+    }
+
+    response = session.post(f"{get_frontend_url()}/api/v2/graphql", json=json)
+
+    response.raise_for_status()
+    return response.json()
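With this refactor, `token_utils.py` is the single home for these session helpers, shared by `revokable_access_token_test.py` and the new session test. A short hypothetical usage, assuming imports resolve from the smoke-test root:

```python
# Hypothetical usage of the shared helpers, run from the smoke-test root.
from tests.tokens.token_utils import getUserId, listUsers, removeUser
from tests.utils import get_admin_credentials, login_as

admin_user, admin_pass = get_admin_credentials()
session = login_as(admin_user, admin_pass)

print(getUserId(session))  # the urn of the actor behind this session
users = listUsers(session)["data"]["listUsers"]["users"]
if {"username": "sessionUser"} in users:
    removeUser(session, "urn:li:corpuser:sessionUser")
```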