-
Notifications
You must be signed in to change notification settings - Fork 3k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat(ingest): set pipeline name in system metadata (#10190)
Co-authored-by: david-leifker <[email protected]>
- Loading branch information
1 parent
0417e68
commit f4be88d
Showing
31 changed files
with
5,053 additions
and
12,894 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
79 changes: 79 additions & 0 deletions
79
metadata-ingestion/src/datahub/ingestion/transformer/auto_helper_transformer.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,79 @@ | ||
from typing import Callable, Iterable, Optional, Union | ||
|
||
from datahub.emitter.mcp import MetadataChangeProposalWrapper | ||
from datahub.ingestion.api.common import ControlRecord, PipelineContext, RecordEnvelope | ||
from datahub.ingestion.api.transform import Transformer | ||
from datahub.ingestion.api.workunit import MetadataWorkUnit | ||
from datahub.metadata.com.linkedin.pegasus2avro.mxe import ( | ||
MetadataChangeEvent, | ||
MetadataChangeProposal, | ||
) | ||
|
||
|
||
class AutoHelperTransformer(Transformer):
    """Converts an auto_* source helper into a transformer.

    Important usage note: this assumes that the auto helper is stateless. The converter
    will be called multiple times, once for each batch of records. If the helper
    attempts to maintain state or perform some cleanup at the end of the stream, it
    will not behave correctly.
    """

    def __init__(
        self,
        converter: Callable[[Iterable[MetadataWorkUnit]], Iterable[MetadataWorkUnit]],
    ):
        """Store the workunit-stream helper that ``transform`` applies to each batch."""
        self.converter = converter

    def transform(
        self, record_envelopes: Iterable[RecordEnvelope]
    ) -> Iterable[RecordEnvelope]:
        """Run the converter over the non-control records of one batch.

        The incoming batch is consumed in full before the converter is invoked.
        Converted records are yielded first, followed by the control records.
        """
        # Split the batch in a single pass; control records bypass the converter.
        normal_records = []
        control_records = []
        for envelope in record_envelopes:
            if isinstance(envelope.record, ControlRecord):
                control_records.append(envelope)
            else:
                normal_records.append(envelope)

        yield from self._from_workunits(
            self.converter(
                self._into_workunits(normal_records),
            )
        )

        # Pass through control records as-is. Note that this isn't fully correct, since it technically
        # reorders the control records relative to the normal records. This is ok since the only control
        # record we have marks the end of the stream.
        yield from control_records

    @classmethod
    def _into_workunits(
        cls,
        stream: Iterable[
            RecordEnvelope[
                Union[
                    MetadataChangeEvent,
                    MetadataChangeProposal,
                    MetadataChangeProposalWrapper,
                ]
            ]
        ],
    ) -> Iterable[MetadataWorkUnit]:
        """Wrap each envelope's record in a MetadataWorkUnit.

        The workunit id is read from the envelope metadata key ``"workunit_id"``
        and is ``None`` when that key is absent.
        """
        for record in stream:
            workunit_id: Optional[str] = record.metadata.get("workunit_id")
            metadata = record.record
            yield MetadataWorkUnit.from_metadata(metadata, id=workunit_id)

    @classmethod
    def _from_workunits(
        cls, stream: Iterable[MetadataWorkUnit]
    ) -> Iterable[RecordEnvelope]:
        """Wrap each workunit's metadata in a RecordEnvelope.

        The envelope metadata contains only the ``"workunit_id"`` key, set to the
        workunit's id.
        """
        for workunit in stream:
            yield RecordEnvelope(
                workunit.metadata,
                {
                    "workunit_id": workunit.id,
                },
            )

    @classmethod
    def create(cls, config_dict: dict, ctx: PipelineContext) -> Transformer:
        """Always raises: this transformer is not constructible from config."""
        raise NotImplementedError(f"{cls.__name__} cannot be created from config")
45 changes: 45 additions & 0 deletions
45
metadata-ingestion/src/datahub/ingestion/transformer/system_metadata_transformer.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
import functools | ||
from typing import Iterable | ||
|
||
from datahub.emitter.mce_builder import get_sys_time | ||
from datahub.ingestion.api.common import PipelineContext, RecordEnvelope | ||
from datahub.ingestion.api.transform import Transformer | ||
from datahub.ingestion.api.workunit import MetadataWorkUnit | ||
from datahub.ingestion.transformer.auto_helper_transformer import AutoHelperTransformer | ||
from datahub.metadata.schema_classes import SystemMetadataClass | ||
|
||
|
||
def auto_system_metadata(
    ctx: PipelineContext,
    stream: Iterable[MetadataWorkUnit],
) -> Iterable[MetadataWorkUnit]:
    """Attach system metadata to each workunit in ``stream`` and yield it.

    Behaviour is driven by two flags read from ``ctx.pipeline_config.flags``:

    * ``set_system_metadata``: when truthy, each workunit's
      ``metadata.systemMetadata`` is replaced with a new ``SystemMetadataClass``
      carrying the current system time and ``ctx.run_id``.
    * ``set_system_metadata_pipeline_name``: when also truthy, the pipeline name
      from ``ctx.pipeline_name`` is recorded on that new system metadata.

    This is a generator, so nothing (including the config check) runs until the
    result is iterated.

    Raises:
        ValueError: if ``ctx.pipeline_config`` is falsy.
    """
    if not ctx.pipeline_config:
        raise ValueError("Pipeline config is required for system metadata")
    set_system_metadata = ctx.pipeline_config.flags.set_system_metadata
    set_pipeline_name = ctx.pipeline_config.flags.set_system_metadata_pipeline_name

    for workunit in stream:
        if set_system_metadata:
            workunit.metadata.systemMetadata = SystemMetadataClass(
                lastObserved=get_sys_time(), runId=ctx.run_id
            )
            # Only touch pipelineName on the object created just above, so this
            # never dereferences a workunit whose systemMetadata is unset.
            if set_pipeline_name:
                workunit.metadata.systemMetadata.pipelineName = ctx.pipeline_name

        yield workunit
|
||
|
||
class SystemMetadataTransformer(Transformer):
    """Transformer that applies ``auto_system_metadata`` to each batch of records.

    It delegates to an ``AutoHelperTransformer`` wrapping ``auto_system_metadata``
    bound to the given pipeline context.
    """

    def __init__(self, ctx: PipelineContext):
        """Bind ``auto_system_metadata`` to ``ctx`` and wrap it as a transformer."""
        self._inner_transformer = AutoHelperTransformer(
            functools.partial(auto_system_metadata, ctx)
        )

    def transform(
        self, record_envelopes: Iterable[RecordEnvelope]
    ) -> Iterable[RecordEnvelope]:
        """Yield the record envelopes produced by the wrapped transformer."""
        yield from self._inner_transformer.transform(record_envelopes)

    @classmethod
    def create(cls, config_dict: dict, ctx: PipelineContext) -> Transformer:
        """Always raises: this transformer is not constructible from config."""
        raise NotImplementedError(f"{cls.__name__} cannot be created from config")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.