diff --git a/backend/dataall/modules/s3_datasets/api/dataset/enums.py b/backend/dataall/modules/s3_datasets/api/dataset/enums.py
new file mode 100644
index 000000000..16aa95907
--- /dev/null
+++ b/backend/dataall/modules/s3_datasets/api/dataset/enums.py
@@ -0,0 +1,9 @@
+from dataall.base.api.constants import GraphQLEnumMapper
+
+
+class MetadataGenerationTargets(GraphQLEnumMapper):
+ """Describes the s3_datasets metadata generation targets
+
+ The members name the kinds of resource metadata can be generated for.
+ The enum is exposed through toGraphQLEnum() as the type of the
+ `targetType` argument of the generateMetadata mutation.
+ """
+
+ Table = 'Table'
+ Folder = 'Folder'
+ S3_Dataset = 'S3_Dataset'
diff --git a/backend/dataall/modules/s3_datasets/api/dataset/input_types.py b/backend/dataall/modules/s3_datasets/api/dataset/input_types.py
index ced7ddf6a..0fb56bd40 100644
--- a/backend/dataall/modules/s3_datasets/api/dataset/input_types.py
+++ b/backend/dataall/modules/s3_datasets/api/dataset/input_types.py
@@ -47,6 +47,7 @@
],
)
+
DatasetPresignedUrlInput = gql.InputType(
name='DatasetPresignedUrlInput',
arguments=[
@@ -58,6 +59,14 @@
CrawlerInput = gql.InputType(name='CrawlerInput', arguments=[gql.Argument(name='prefix', type=gql.String)])
+# Input type carrying table sample data: a list of string `fields` and a list of string `rows`.
+# It is referenced by name as the optional `tableSampleData` argument of the generateMetadata mutation.
+# NOTE(review): the neighbouring CrawlerInput declares its members with gql.Argument;
+# confirm that gql.Field is intended inside `arguments` here.
+TableSampleData = gql.InputType(
+ name='TableSampleData',
+ arguments=[
+ gql.Field(name='fields', type=gql.ArrayType(gql.String)),
+ gql.Field(name='rows', type=gql.ArrayType(gql.String)),
+ ],
+)
+
ImportDatasetInput = gql.InputType(
name='ImportDatasetInput',
arguments=[
diff --git a/backend/dataall/modules/s3_datasets/api/dataset/mutations.py b/backend/dataall/modules/s3_datasets/api/dataset/mutations.py
index d82f98194..04648d063 100644
--- a/backend/dataall/modules/s3_datasets/api/dataset/mutations.py
+++ b/backend/dataall/modules/s3_datasets/api/dataset/mutations.py
@@ -1,9 +1,5 @@
from dataall.base.api import gql
-from dataall.modules.s3_datasets.api.dataset.input_types import (
- ModifyDatasetInput,
- NewDatasetInput,
- ImportDatasetInput,
-)
+from dataall.modules.s3_datasets.api.dataset.input_types import ModifyDatasetInput, NewDatasetInput, ImportDatasetInput
from dataall.modules.s3_datasets.api.dataset.resolvers import (
create_dataset,
update_dataset,
@@ -11,7 +7,9 @@
delete_dataset,
import_dataset,
start_crawler,
+ generate_metadata,
)
+from dataall.modules.s3_datasets.api.dataset.enums import MetadataGenerationTargets
createDataset = gql.MutationField(
name='createDataset',
@@ -68,3 +66,14 @@
resolver=start_crawler,
type=gql.Ref('GlueCrawler'),
)
+# Mutation resolved by generate_metadata.
+# Required arguments: `resourceUri` (String), `targetType` (MetadataGenerationTargets enum) and
+# `metadataTypes` (list of String). `tableSampleData` is optional.
+# The result type is a list of DatasetMetadata objects.
+generateMetadata = gql.MutationField(
+ name='generateMetadata',
+ args=[
+ gql.Argument(name='resourceUri', type=gql.NonNullableType(gql.String)),
+ gql.Argument(name='targetType', type=gql.NonNullableType(MetadataGenerationTargets.toGraphQLEnum())),
+ gql.Argument(name='metadataTypes', type=gql.NonNullableType(gql.ArrayType(gql.String))),
+ gql.Argument(name='tableSampleData', type=gql.Ref('TableSampleData')),
+ ],
+ type=gql.ArrayType(gql.Ref('DatasetMetadata')),
+ resolver=generate_metadata,
+)
diff --git a/backend/dataall/modules/s3_datasets/api/dataset/queries.py b/backend/dataall/modules/s3_datasets/api/dataset/queries.py
index 5043b868d..fdd9ed5aa 100644
--- a/backend/dataall/modules/s3_datasets/api/dataset/queries.py
+++ b/backend/dataall/modules/s3_datasets/api/dataset/queries.py
@@ -4,6 +4,8 @@
get_dataset_assume_role_url,
get_file_upload_presigned_url,
list_datasets_owned_by_env_group,
+ list_dataset_tables_folders,
+ read_sample_data,
)
getDataset = gql.QueryField(
@@ -45,3 +47,18 @@
resolver=list_datasets_owned_by_env_group,
test_scope='Dataset',
)
+# Query resolved by list_dataset_tables_folders.
+# `datasetUri` is required; `filter` is an optional DatasetFilter.
+# The result type is DatasetItemsSearchResult.
+listDatasetTablesFolders = gql.QueryField(
+ name='listDatasetTablesFolders',
+ args=[
+ gql.Argument(name='datasetUri', type=gql.NonNullableType(gql.String)),
+ gql.Argument(name='filter', type=gql.Ref('DatasetFilter')),
+ ],
+ type=gql.Ref('DatasetItemsSearchResult'),
+ resolver=list_dataset_tables_folders,
+)
+# Query resolved by read_sample_data; takes a required `tableUri` and is typed as QueryPreviewResult.
+listSampleData = gql.QueryField(
+ name='listSampleData',
+ args=[gql.Argument(name='tableUri', type=gql.NonNullableType(gql.String))],
+ type=gql.Ref('QueryPreviewResult'), # NOTE(review): the original author doubted this returns useful data - confirm
+ resolver=read_sample_data,
+) # Intended flow: return the sample data so the user can invoke generateMetadata again with it. NOTE(review): a similar API reportedly exists - confirm this query is needed.
diff --git a/backend/dataall/modules/s3_datasets/api/dataset/resolvers.py b/backend/dataall/modules/s3_datasets/api/dataset/resolvers.py
index 90f6fd3d9..cadad1d8f 100644
--- a/backend/dataall/modules/s3_datasets/api/dataset/resolvers.py
+++ b/backend/dataall/modules/s3_datasets/api/dataset/resolvers.py
@@ -1,5 +1,5 @@
import logging
-
+import re
from dataall.base.api.context import Context
from dataall.base.feature_toggle_checker import is_feature_enabled
from dataall.base.utils.expiration_util import Expiration
@@ -11,6 +11,9 @@
from dataall.modules.s3_datasets.db.dataset_models import S3Dataset
from dataall.modules.datasets_base.services.datasets_enums import DatasetRole, ConfidentialityClassification
from dataall.modules.s3_datasets.services.dataset_service import DatasetService
+from dataall.modules.s3_datasets.services.dataset_table_service import DatasetTableService
+from dataall.modules.s3_datasets.services.dataset_location_service import DatasetLocationService
+from dataall.modules.s3_datasets.services.dataset_enums import MetadataGenerationTargets, MetadataGenerationTypes
log = logging.getLogger(__name__)
@@ -156,6 +159,49 @@ def list_datasets_owned_by_env_group(
return DatasetService.list_datasets_owned_by_env_group(environmentUri, groupUri, filter)
+@is_feature_enabled('modules.s3_datasets.features.generate_metadata_ai.active')
+def generate_metadata(
+    context: Context,
+    source: S3Dataset,
+    resourceUri: str,
+    targetType: str,
+    metadataTypes: list,
+    tableSampleData: dict = None,
+):
+    """Generate metadata for a dataset, table or folder, dispatching on ``targetType``.
+
+    :param resourceUri: URI of the target resource; checked with ``RequestValidator.validate_uri``.
+    :param targetType: one of the ``MetadataGenerationTargets`` values.
+    :param metadataTypes: requested metadata fields; each must be a ``MetadataGenerationTypes`` value.
+    :param tableSampleData: optional sample data, forwarded only for Table targets.
+    :return: the result of the matching ``generate_metadata_for_*`` service call.
+    :raises InvalidInput: if any entry of ``metadataTypes`` is not an allowed value.
+    :raises Exception: if ``targetType`` is not a supported target.
+    """
+    RequestValidator.validate_uri(param_name='resourceUri', param_value=resourceUri)
+    # Build the allowed values once instead of once per requested type and again for the message.
+    allowed_metadata_types = [item.value for item in MetadataGenerationTypes]
+    if any(metadata_type not in allowed_metadata_types for metadata_type in metadataTypes):
+        raise InvalidInput(
+            'metadataType',
+            metadataTypes,
+            f'a list of allowed values {allowed_metadata_types}',
+        )
+    if targetType == MetadataGenerationTargets.S3_Dataset.value:
+        return DatasetService.generate_metadata_for_dataset(uri=resourceUri, metadata_types=metadataTypes)
+    if targetType == MetadataGenerationTargets.Table.value:
+        # The default is None rather than a shared mutable dict; the service still receives
+        # an empty dict when no sample data was supplied.
+        return DatasetTableService.generate_metadata_for_table(
+            uri=resourceUri, metadata_types=metadataTypes, sample_data=tableSampleData or {}
+        )
+    if targetType == MetadataGenerationTargets.Folder.value:
+        return DatasetLocationService.generate_metadata_for_folder(uri=resourceUri, metadata_types=metadataTypes)
+    raise Exception('Unsupported target type for metadata generation')
+
+
+def read_sample_data(context: Context, source: S3Dataset, tableUri: str):
+    """Validate ``tableUri`` and return the preview produced by ``DatasetTableService.preview``."""
+    RequestValidator.validate_uri(param_value=tableUri, param_name='tableUri')
+    sample_preview = DatasetTableService.preview(uri=tableUri)
+    return sample_preview
+
+
+def update_dataset_metadata(context: Context, source: S3Dataset, resourceUri: str, input: dict = None):
+    """Update the dataset identified by ``resourceUri`` with the values in ``input``.
+
+    The original body passed the builtin ``input`` function as ``data`` because no such
+    parameter existed; ``input`` is now an explicit, optional-by-signature argument.
+
+    :raises RequiredParameter: if ``input`` is not provided.
+    """
+    if input is None:
+        raise RequiredParameter('input')
+    return DatasetService.update_dataset(uri=resourceUri, data=input)
+
+
+def list_dataset_tables_folders(context: Context, source: S3Dataset, datasetUri: str, filter: dict = None):
+    """List the tables and folders of a dataset, using an empty filter when none (or a falsy one) is given."""
+    effective_filter = filter or {}
+    return DatasetService.list_dataset_tables_folders(uri=datasetUri, filter=effective_filter)
+
+
class RequestValidator:
@staticmethod
def validate_creation_request(data):
@@ -200,6 +246,18 @@ def validate_share_expiration_request(data):
'is of invalid type',
)
+ @staticmethod
+ def validate_uri(param_name: str, param_value: str):
+     """Check that ``param_value`` is exactly 8 lowercase letters or digits.
+
+     :param param_name: name reported in the raised error.
+     :param param_value: value to validate.
+     :raises RequiredParameter: if the value is empty or None.
+     :raises InvalidInput: if the value does not have the expected format.
+     """
+     if not param_value:
+         raise RequiredParameter(param_name)
+     # fullmatch is required: with re.match, a trailing '$' also accepts a value
+     # that ends in a newline (e.g. 'abcd1234\n').
+     if not re.fullmatch(r'[a-z0-9]{8}', param_value):
+         raise InvalidInput(
+             param_name=param_name,
+             param_value=param_value,
+             constraint='8 characters long and contain only lowercase letters and numbers',
+         )
+
@staticmethod
def validate_import_request(data):
RequestValidator.validate_creation_request(data)
diff --git a/backend/dataall/modules/s3_datasets/api/dataset/types.py b/backend/dataall/modules/s3_datasets/api/dataset/types.py
index 2b257bb19..36bfcbf27 100644
--- a/backend/dataall/modules/s3_datasets/api/dataset/types.py
+++ b/backend/dataall/modules/s3_datasets/api/dataset/types.py
@@ -135,3 +135,39 @@
gql.Field(name='status', type=gql.String),
],
)
+
+# Object type used as the element type of the generateMetadata mutation result:
+# identifies the target (uri and type) and carries label, description, tags and topics.
+DatasetMetadata = gql.ObjectType(
+ name='DatasetMetadata',
+ fields=[
+ gql.Field(name='targetUri', type=gql.String),
+ gql.Field(name='targetType', type=gql.String),
+ gql.Field(name='label', type=gql.String),
+ gql.Field(name='description', type=gql.String),
+ gql.Field(name='tags', type=gql.ArrayType(gql.String)),
+ gql.Field(name='topics', type=gql.ArrayType(gql.String)),
+ ],
+)
+
+# One entry of a DatasetItemsSearchResult: a name plus the target type and uri it refers to.
+DatasetItem = gql.ObjectType(
+ name='DatasetItem',
+ fields=[
+ gql.Field(name='name', type=gql.String),
+ gql.Field(name='targetType', type=gql.String),
+ gql.Field(name='targetUri', type=gql.String),
+ ],
+)
+
+# Result type of the listDatasetTablesFolders query: a list of DatasetItem nodes
+# together with count and paging fields.
+DatasetItemsSearchResult = gql.ObjectType(
+ name='DatasetItemsSearchResult',
+ fields=[
+ gql.Field(name='count', type=gql.Integer),
+ gql.Field(name='nodes', type=gql.ArrayType(DatasetItem)),
+ gql.Field(name='pageSize', type=gql.Integer),
+ gql.Field(name='nextPage', type=gql.Integer),
+ gql.Field(name='pages', type=gql.Integer),
+ gql.Field(name='page', type=gql.Integer),
+ gql.Field(name='previousPage', type=gql.Integer),
+ gql.Field(name='hasNext', type=gql.Boolean),
+ gql.Field(name='hasPrevious', type=gql.Boolean),
+ ],
+)
diff --git a/backend/dataall/modules/s3_datasets/api/table_column/input_types.py b/backend/dataall/modules/s3_datasets/api/table_column/input_types.py
index ca32c83f9..2d8f90c77 100644
--- a/backend/dataall/modules/s3_datasets/api/table_column/input_types.py
+++ b/backend/dataall/modules/s3_datasets/api/table_column/input_types.py
@@ -18,3 +18,11 @@
gql.Argument('topics', gql.Integer),
],
)
+# Input type for one item of the batchUpdateDatasetTableColumn mutation.
+# Note that the GraphQL name is 'SubitemDescriptionInput' while the Python variable is SubitemDescription.
+# NOTE(review): `subitem_id` is presumably the column URI - confirm against the batch update service.
+SubitemDescription = gql.InputType(
+ name='SubitemDescriptionInput',
+ arguments=[
+ gql.Argument(name='label', type=gql.String),
+ gql.Argument(name='description', type=gql.String),
+ gql.Argument(name='subitem_id', type=gql.String),
+ ],
+)
diff --git a/backend/dataall/modules/s3_datasets/api/table_column/mutations.py b/backend/dataall/modules/s3_datasets/api/table_column/mutations.py
index d9ae99b6d..3ee266ff6 100644
--- a/backend/dataall/modules/s3_datasets/api/table_column/mutations.py
+++ b/backend/dataall/modules/s3_datasets/api/table_column/mutations.py
@@ -1,5 +1,9 @@
from dataall.base.api import gql
-from dataall.modules.s3_datasets.api.table_column.resolvers import sync_table_columns, update_table_column
+from dataall.modules.s3_datasets.api.table_column.resolvers import (
+ sync_table_columns,
+ update_table_column,
+ batch_update_table_columns_description,
+)
syncDatasetTableColumns = gql.MutationField(
name='syncDatasetTableColumns',
@@ -18,3 +22,9 @@
type=gql.Ref('DatasetTableColumn'),
resolver=update_table_column,
)
+# Mutation resolved by batch_update_table_columns_description.
+# Takes an optional list of SubitemDescriptionInput under `columns` and is typed as String.
+batchUpdateDatasetTableColumn = gql.MutationField(
+ name='batchUpdateDatasetTableColumn',
+ args=[gql.Argument(name='columns', type=gql.ArrayType(gql.Ref('SubitemDescriptionInput')))],
+ type=gql.String,
+ resolver=batch_update_table_columns_description,
+)
diff --git a/backend/dataall/modules/s3_datasets/api/table_column/resolvers.py b/backend/dataall/modules/s3_datasets/api/table_column/resolvers.py
index 07cb82d5a..3acbe2408 100644
--- a/backend/dataall/modules/s3_datasets/api/table_column/resolvers.py
+++ b/backend/dataall/modules/s3_datasets/api/table_column/resolvers.py
@@ -41,3 +41,9 @@ def update_table_column(context: Context, source, columnUri: str = None, input:
description = input.get('description', 'No description provided')
return DatasetColumnService.update_table_column_description(column_uri=columnUri, description=description)
+
+
+def batch_update_table_columns_description(context: Context, source, columns):
+    """Forward ``columns`` to the column service, or return None without calling it when ``columns`` is None."""
+    return (
+        None
+        if columns is None
+        else DatasetColumnService.batch_update_table_columns_description(columns=columns)
+    )
diff --git a/backend/dataall/modules/s3_datasets/aws/bedrock_metadata_client.py b/backend/dataall/modules/s3_datasets/aws/bedrock_metadata_client.py
new file mode 100644
index 000000000..72f0cd21a
--- /dev/null
+++ b/backend/dataall/modules/s3_datasets/aws/bedrock_metadata_client.py
@@ -0,0 +1,101 @@
+import logging
+import os
+
+from dataall.base.db import exceptions
+from dataall.base.aws.sts import SessionHelper
+from typing import List, Optional
+from langchain_core.prompts import PromptTemplate
+from langchain_core.pydantic_v1 import BaseModel
+from langchain_aws import ChatBedrock as BedrockChat
+from langchain_core.output_parsers import JsonOutputParser
+
+log = logging.getLogger(__name__)
+
+# Directory next to this module that holds the Bedrock prompt templates.
+_BEDROCK_PROMPTS_DIR = os.path.join(os.path.dirname(__file__), 'bedrock_prompts')
+
+# One prompt template file per metadata generation target.
+METADATA_GENERATION_DATASET_TEMPLATE_PATH = os.path.join(
+    _BEDROCK_PROMPTS_DIR, 'metadata_generation_dataset_template.txt'
+)
+METADATA_GENERATION_TABLE_TEMPLATE_PATH = os.path.join(_BEDROCK_PROMPTS_DIR, 'metadata_generation_table_template.txt')
+METADATA_GENERATION_FOLDER_TEMPLATE_PATH = os.path.join(
+    _BEDROCK_PROMPTS_DIR, 'metadata_generation_folder_template.txt'
+)
+
+
+# Schema handed to JsonOutputParser (pydantic_object=MetadataOutput) in BedrockClient.
+# Every field is optional and defaults to None.
+class MetadataOutput(BaseModel):
+ # Generated tags.
+ tags: Optional[List[str]] = None
+ # Generated description.
+ description: Optional[str] = None
+ # Generated label.
+ label: Optional[str] = None
+ # Generated topics.
+ topics: Optional[List[str]] = None
+ # Per-column metadata as a list of dicts.
+ # NOTE(review): presumably only populated for table requests that ask for column metadata - confirm.
+ columns_metadata: Optional[List[dict]] = None
+
+
+class BedrockClient:
+    """Client that asks a Bedrock chat model to generate metadata for datasets, tables and folders.
+
+    Each public method builds a context dict, renders the matching prompt template with it,
+    sends it to the model and returns the output parsed by a JsonOutputParser configured
+    with MetadataOutput.
+    """
+
+    def __init__(self):
+        """Create the bedrock-runtime client and the chat model shared by all invocations."""
+        session = SessionHelper.get_session()
+        # The region is read from AWS_REGION and falls back to eu-west-1 when it is unset.
+        self._client = session.client('bedrock-runtime', region_name=os.getenv('AWS_REGION', 'eu-west-1'))
+        model_id = 'eu.anthropic.claude-3-5-sonnet-20240620-v1:0'
+        model_kwargs = {
+            'max_tokens': 4096,
+            'temperature': 0.5,
+            'top_k': 250,
+            'top_p': 0.5,
+            'stop_sequences': ['\n\nHuman'],
+        }
+        self._model = BedrockChat(client=self._client, model_id=model_id, model_kwargs=model_kwargs)
+
+    def _invoke(self, template_path, context):
+        """Render the template at ``template_path`` with ``context``, call the model and parse the JSON reply.
+
+        Exceptions are not caught here; they propagate unchanged to the caller.
+        """
+        prompt_template = PromptTemplate.from_file(template_path)
+        parser = JsonOutputParser(pydantic_object=MetadataOutput)
+        chain = prompt_template | self._model | parser
+        return chain.invoke(context)
+
+    def invoke_model_dataset_metadata(self, metadata_types, dataset, tables, folders):
+        """Generate the requested ``metadata_types`` for ``dataset`` from its own fields and its tables/folders."""
+        context = {
+            'metadata_types': metadata_types,
+            'dataset_label': dataset.label,
+            'description': dataset.description,
+            'tags': dataset.tags,
+            'topics': dataset.topics,
+            'table_names': [t.label for t in tables],
+            'table_descriptions': [t.description for t in tables],
+            'folder_names': [f.label for f in folders],
+        }
+        return self._invoke(METADATA_GENERATION_DATASET_TEMPLATE_PATH, context)
+
+    def invoke_model_table_metadata(self, metadata_types, table, columns, sample_data, generate_columns_metadata=False):
+        """Generate the requested ``metadata_types`` for ``table`` from its fields, columns and sample data.
+
+        When ``generate_columns_metadata`` is true the prompt also asks for column descriptions.
+        """
+        column_labels = [c.label for c in columns]
+        context = {
+            'metadata_types': metadata_types,
+            'generate_columns_metadata': generate_columns_metadata,
+            'label': table.label,
+            'description': table.description,
+            'tags': table.tags,
+            'topics': table.topics,
+            # The table prompt template refers to {columns} and {column_uris}; both are supplied
+            # so rendering does not fail on missing variables. 'column_labels' is kept as well.
+            'columns': column_labels,
+            'column_labels': column_labels,
+            'column_descriptions': [c.description for c in columns],
+            # NOTE(review): assumes column objects expose `columnUri`; None is used when they do not.
+            'column_uris': [getattr(c, 'columnUri', None) for c in columns],
+            'sample_data': sample_data,
+        }
+        return self._invoke(METADATA_GENERATION_TABLE_TEMPLATE_PATH, context)
+
+    def invoke_model_folder_metadata(self, metadata_types, folder, files):
+        """Generate the requested ``metadata_types`` for ``folder`` from its fields and the given file names."""
+        context = {
+            'metadata_types': metadata_types,
+            'label': folder.label,
+            'description': folder.description,
+            'tags': folder.tags,
+            'topics': folder.topics,
+            'file_names': files,
+        }
+        return self._invoke(METADATA_GENERATION_FOLDER_TEMPLATE_PATH, context)
diff --git a/backend/dataall/modules/s3_datasets/aws/bedrock_prompts/metadata_generation_dataset_template.txt b/backend/dataall/modules/s3_datasets/aws/bedrock_prompts/metadata_generation_dataset_template.txt
new file mode 100644
index 000000000..c99c75c98
--- /dev/null
+++ b/backend/dataall/modules/s3_datasets/aws/bedrock_prompts/metadata_generation_dataset_template.txt
@@ -0,0 +1,82 @@
+Your task is to generate or improve the metadata fields of a Dataset.
+
+Use the following input parameters:
+ - Dataset name: {dataset_label}
+ - Current dataset description: {description}
+ - Current tags for dataset: {tags}
+ - Current topics for dataset: {topics}
+ - Table names in the dataset: {table_names}
+ - Folder names in the dataset: {folder_names}
+
+
+There are 4 metadata fields that can be requested to you.
+ 1. label - 1 to 3 words that give a "title" to the Dataset. If provided, you can use the current Dataset name as starting point.
+ 2. description - less than 30 words that summarize the Tables and Folders contained in the Dataset. If provided, use the current description and tags as starting point; but mainly use the Table names and Folder names.
+ 3. tags - list of strings (less than 3), where each string can take any value. Tags should highlight the most important field or thematic of the Dataset. If there are current tags that represent additional information, add them to the list of tags. Do not return the label as a tag.
+ 4. topics - list of strings (1 or 2), where each string must be one of the following topics that represent company departments ['Finance', 'Marketing', 'Engineering', 'HR', 'Operations', 'Sales', 'Other'] Choose a topic according to the Tables and Folders of the dataset. If there are current topics that represent additional information, add them to the list of topics.
+
+There are some rules that you MUST follow:
+- If any of the input parameters is equal to "No description provided" or is None or [] do not use that particular input
+for generating the metadata fields.
+- This time the user has requested ONLY the following metadata fields: {metadata_types} Your response should strictly
+contain only the requested metadata fields.
+- Evaluate if the given parameters are sufficient for generating the requested metadata, if not, respond with
+"NotEnoughData" for all values of dictionary keys.
+- If the Table names and the Folder names are both none or [], return "Empty Dataset" as the description and "empty" as one of the tags.
+- Return the result as a Python dictionary where the keys are the requested metadata fields, all the keys must be
+lowercase and the values are the corresponding generated metadata.
+- Do not return any explanations, ONLY the Python dictionary.
+
+---------------------------------------
+---------------------------------------
+Here are some examples:
+
+Example 1.
+
+Given the following input parameters:
+ label: None,
+ description: No description provided,
+ tags: [],
+ table_names: [],
+ folder_names: [],
+ metadata_types: ["label", "description", "tags", "topics"]
+
+response = {{
+ "label": "NotEnoughData",
+ "description": "Empty Dataset",
+ "topics": "NotEnoughData",
+ "tags": ["empty"]
+}}
+
+Example 2.
+Given the following input parameters:
+ label: None,
+ description: No description provided,
+ tags: [],
+ table_names: ["customer_orders", "product_inventory", "sales_transactions"],
+ folder_names: ["orders", "inventory", "sales"],
+ metadata_types: ["label", "description"]
+
+response = {{
+ "label": "Sales and Inventory",
+ "description": "Dataset containing customer orders, product inventory, and sales transactions information, organized into orders, inventory, and sales folders."
+}}
+
+Example 3.
+Given the following input parameters:
+ label: None,
+ description: No description provided,
+ tags: [],
+ table_names: ["employee_data", "payroll", "performance_reviews"],
+ folder_names: ["hr_records", "financial", "evaluations"],
+ metadata_types: ["label", "tags", "topics"]
+
+response = {{
+ "label": "HR Management System",
+ "tags": ["employee", "payroll", "performance"],
+ "topics": ["HR", "Finance"]
+}}
+
+
+
+
diff --git a/backend/dataall/modules/s3_datasets/aws/bedrock_prompts/metadata_generation_folder_template.txt b/backend/dataall/modules/s3_datasets/aws/bedrock_prompts/metadata_generation_folder_template.txt
new file mode 100644
index 000000000..37bcef15f
--- /dev/null
+++ b/backend/dataall/modules/s3_datasets/aws/bedrock_prompts/metadata_generation_folder_template.txt
@@ -0,0 +1,61 @@
+Your task is to generate or improve the metadata fields of a Folder (S3 Prefix).
+
+Use the following input parameters:
+ - Folder name: {label},
+ - Current Folder description: {description}
+ - Current tags for Folder: {tags}
+ - Current topics for Folder: {topics}
+ - File names (files stored inside the folder): {file_names}
+
+
+There are 4 metadata fields that can be requested to you.
+ 1. label - 1 to 3 words that give a "title" to the Folder. If provided, you can use the current Folder name as starting point.
+ 2. description - less than 30 words that summarize the files contained in the Folder. If provided, use the current description and tags as starting point; but mainly use the file names.
+ 3. tags - list of strings (less than 3), where each string can take any value. Tags should highlight the most important field or thematic of the Folder. If there are current tags that represent additional information, add them to the list of tags.
+ 4. topics - list of strings (1 or 2), where each string must be one of the following topics that represent company departments ['Finance', 'Marketing', 'Engineering', 'HR', 'Operations', 'Sales', 'Other'] Choose a topic according to the files contained in the Folder. If there are current topics that represent additional information, add them to the list of topics.
+
+There are some rules that you MUST follow:
+- If any of the input parameters is equal to "No description provided" or is None or [] do not use that particular input
+for generating the metadata fields.
+- This time the user has requested ONLY the following metadata fields: {metadata_types} Your response should strictly
+contain only the requested metadata fields.
+- Evaluate if the given parameters are sufficient for generating the requested metadata, if not, respond with
+"NotEnoughData" for all values of dictionary keys.
+- Return the result as a Python dictionary where the keys are the requested metadata fields, all the keys must be
+lowercase and the values are the corresponding generated metadata.
+
+
+---------------------------------------
+---------------------------------------
+Here are some examples:
+
+Example 1.
+
+Given the following input parameters:
+ Folder name: my-folder-1
+ Current Folder description: No description provided
+ Current tags for Folder: []
+ File names (files stored inside the folder): []
+ metadata_types: ["label", "description", "topics"]
+
+
+response = {{
+ "label": "NotEnoughData",
+ "description": "NotEnoughData",
+ "topics": "NotEnoughData"
+}}
+
+Example 2.
+Given the following input parameters:
+ Folder name: my-folder-2
+ Current Folder description: Demo folder with some sample data.
+ Current tags for Folder: ['demo']
+ File names (files stored inside the folder): ['sales_report_2024.pdf', 'sales_terminology.pdf', 'year_over_year_increase.png']
+ metadata_types: ["label", "description", "tags"]
+
+response = {{
+ "label": "Sales Report 2024",
+ "description": "This folder contains the report alongside terminology and graphs to describe the sales of 2024.",
+ "tags": ["sales", "2024"]
+}}
+
diff --git a/backend/dataall/modules/s3_datasets/aws/bedrock_prompts/metadata_generation_table_template.txt b/backend/dataall/modules/s3_datasets/aws/bedrock_prompts/metadata_generation_table_template.txt
new file mode 100644
index 000000000..f11f069bb
--- /dev/null
+++ b/backend/dataall/modules/s3_datasets/aws/bedrock_prompts/metadata_generation_table_template.txt
@@ -0,0 +1,107 @@
+Your task is to generate or improve the metadata fields for a Table.
+
+Use the following input parameters:
+ - Table name: {label}
+ - Current table description: {description}
+ - Current tags for table: {tags}
+ - Current topics for table: {topics}
+ - Column names: {columns}
+ - Column Descriptions: {column_descriptions}
+ - Sample data: {sample_data}
+ - Generate columns metadata: {generate_columns_metadata}
+ - Column URIs: {column_uris} - IGNORE this input if Generate columns metadata=False
+
+There are 4 metadata fields that can be requested to you.
+ 1. label - 1 to 3 words that give a "title" to the Table. If provided, you can use the current Table name as starting point.
+ 2. description - less than 30 words that summarize the content of the table. If provided, take the current Table description as starting point but mainly use the sample data (if provided), and then the column names and descriptions to generate the table description.
+ 3. tags - list of strings (less than 3), where each string can take any value. Tags should highlight the most important field or thematic of the Table. If there are current tags that represent additional information, add them to the list of tags.
+ 4. topics - list of strings (1 or 2), where each string must be one of the following topics that represent company departments ['Finance', 'Marketing', 'Engineering', 'HR', 'Operations', 'Sales', 'Other'] Choose a topic according to the Table description. If there are current topics that represent additional information, add them to the list of topics.
+
+
+There are some rules that you MUST follow:
+- If any of the input parameters is equal to "No description provided" or is None or [] do not use that particular input
+for generating the metadata fields.
+- This time the user has requested ONLY the following metadata fields: {metadata_types} Your response should strictly
+contain only the requested metadata fields.
+- Evaluate if the given parameters are sufficient for generating the requested metadata, if not, respond with
+"NotEnoughData" for all values of dictionary keys.
+- Return the result as a Python dictionary where the keys are the requested metadata fields, all the keys must be
+lowercase and the values are the corresponding generated metadata.
+- If Generate columns metadata = True, you also need to generate the description of the columns. For this task try to use the
+sample data and the column names. Evaluate if that is sufficient for generating the column descriptions, if not,
+each description will be equal to "NotEnoughData". Add the result to the Python dictionary with the key "columns_metadata" and as
+value, another Python dictionary containing a list of objects whose keys are the columnUris and the values the generated column
+descriptions.
+
+---------------------------------------
+---------------------------------------
+Here are some examples:
+
+Example 1.
+
+Given the following input parameters:
+ Table name: my-table-1
+ Current table description: Demo data
+ Current tags for table: ["demo"]
+ Column URIs: []
+ Column names: []
+ Column Descriptions: []
+ Sample data: None
+ metadata_types: ["label", "description", "topics"]
+ Generate columns metadata: False
+
+
+response = {{
+ "label": "NotEnoughData",
+ "description": "NotEnoughData",
+ "topics": "NotEnoughData"
+}}
+
+Example 2.
+Given the following input parameters:
+ Table name: my-table-2
+ Current table description: Demo table with some sample data.
+ Current tags for table: ['demo']
+ Column URIs: ["asdfgh", "123ghj", "oplk34", "cvb890", "l29ghk", "cif41l"]
+ Column names: ['author', 'year', 'avg_rating', 'pages', 'genre', 'count']
+ Column Descriptions: ['Author that wrote the book', 'Year of publication', 'Average rating in Amazon.com', 'pages', 'genre', 'books sold in Amazon.com']
+ Sample data: None
+ metadata_types: ["label", "description", "tags"]
+ Generate columns metadata: False
+
+response = {{
+ "label": "books_sales_with_ratings",
+ "description": "This table is used for demo purposes and it contains book-related information including authorship, publication details, and performance metrics. It tracks books with their authors, publication years, and reader feedback through Amazon.com ratings.",
+ "tags": ["demo", "sales"]
+}}
+
+
+Example 3.
+Given the following input parameters:
+ Table name: my-table-2
+ Current table description: Demo table with some sample data.
+ Current tags for table: ['demo']
+ Column URIs: ["asdfgh", "123ghj", "oplk34", "cvb890", "l29ghk", "cif41l"]
+ Column names: ['author', 'year', 'avg_rating_amzn', 'pages', 'genre', 'sales_amzn']
+ Column Descriptions: []
+ Sample data: {{"rows": ["[\"\\\"JK Rowling\\\"\", \"\\\"1995\\\"\", \"\\\"8\\\"\", \"\\\"419\\\"\", \"\\\"Science Fiction\\\"\", \"\\\"225890345\\\"\"]", "[\"\\\"Gabriel Garcia Marquez\\\"\", \"\\\"1913\\\"\", \"\\\"9\\\"\", \"\\\"337\\\"\", \"\\\"Magical Realism\\\"\", \"\\\"221133\\\"\"]"], "fields": ["{{\"name\": \"author\"}}", "{{\"name\": \"year\"}}", "{{\"name\": \"avg_rating\"}}", "{{\"name\": \"pages\"}}", "{{\"name\": \"genre\"}}", "{{\"name\": \"count\"}}"], "__typename": "QueryPreviewResult"}}
+ metadata_types: ["label", "description", "tags"]
+ Generate columns metadata: True
+
+response = {{
+ "label": "books_sales_with_ratings",
+ "description": "This table is used for demo purposes and it contains book-related information including authorship, publication details, and performance metrics. It tracks books with their authors, publication years, and reader feedback through Amazon.com ratings.",
+ "tags": ["demo", "sales"],
+ "columns_metadata": {{
+ "asdfgh": "Author that wrote the book",
+ "123ghj": "Year of publication",
+ "oplk34": "Average rating in Amazon.com",
+ "cvb890": "Number of pages",
+ "l29ghk": "Genre",
+ "cif41l": "Number of copies sold in Amazon.com"
+ }}
+}}
+
+
+
+
diff --git a/backend/dataall/modules/s3_datasets/aws/s3_dataset_client.py b/backend/dataall/modules/s3_datasets/aws/s3_dataset_client.py
index 94db4d056..778edb508 100644
--- a/backend/dataall/modules/s3_datasets/aws/s3_dataset_client.py
+++ b/backend/dataall/modules/s3_datasets/aws/s3_dataset_client.py
@@ -73,3 +73,16 @@ def get_bucket_encryption(self) -> (str, str, str):
f'Data.all Environment Pivot Role does not have s3:GetEncryptionConfiguration Permission for {dataset.S3BucketName} bucket: {e}'
)
raise Exception(f'Cannot fetch the bucket encryption configuration for {dataset.S3BucketName}: {e}')
+
+ def list_bucket_files(self, bucket_name, prefix):
+     """Return the object entries found under ``prefix`` in ``bucket_name``.
+
+     A single list_objects_v2 call is made with MaxKeys=1000, so at most 1000 entries
+     are returned. The dataset's AWS account is passed as the expected bucket owner.
+
+     :return: the 'Contents' list of the response, or an empty list when the key is absent.
+     :raises Exception: if the S3 call fails with a ClientError.
+     """
+     dataset = self._dataset
+     try:
+         response = self._client.list_objects_v2(
+             Bucket=bucket_name,
+             Prefix=prefix,
+             ExpectedBucketOwner=dataset.AwsAccountId,
+             MaxKeys=1000,
+         )
+     except ClientError as e:
+         # Report the bucket that was actually queried and keep the original error as the cause.
+         raise Exception(f'Cannot list the bucket files for {bucket_name}: {e}') from e
+     return response.get('Contents', [])
diff --git a/backend/dataall/modules/s3_datasets/db/dataset_column_repositories.py b/backend/dataall/modules/s3_datasets/db/dataset_column_repositories.py
index c2038084b..17e7eb733 100644
--- a/backend/dataall/modules/s3_datasets/db/dataset_column_repositories.py
+++ b/backend/dataall/modules/s3_datasets/db/dataset_column_repositories.py
@@ -1,5 +1,5 @@
from operator import or_
-
+from sqlalchemy import func, and_
from dataall.base.db import paginate
from dataall.base.db.exceptions import ObjectNotFound
from dataall.modules.s3_datasets.db.dataset_models import DatasetTableColumn
@@ -42,3 +42,30 @@ def paginate_active_columns_for_table(session, table_uri: str, filter: dict):
).order_by(DatasetTableColumn.columnType.asc())
return paginate(query=q, page=filter.get('page', 1), page_size=filter.get('pageSize', 10)).to_dict()
+
+ @staticmethod
+ def get_table_info_metadata_generation(session, table_uri: str):
+     """Return the first row aggregating the columns of ``table_uri``.
+
+     Rows are grouped by Glue table name and AWS account id; the column descriptions,
+     labels and column URIs are array-aggregated under 'description', 'label' and 'columnUri'.
+     """
+     grouping_columns = (DatasetTableColumn.GlueTableName, DatasetTableColumn.AWSAccountId)
+     aggregated_columns = (
+         func.array_agg(DatasetTableColumn.description).label('description'),
+         func.array_agg(DatasetTableColumn.label).label('label'),
+         func.array_agg(DatasetTableColumn.columnUri).label('columnUri'),
+     )
+     table_filter = and_(DatasetTableColumn.tableUri == table_uri)
+     return (
+         session.query(*grouping_columns, *aggregated_columns)
+         .filter(table_filter)
+         .group_by(*grouping_columns)
+         .first()
+     )
+
+ @staticmethod
+ def query_active_columns_for_table(session, table_uri: str):
+     """Build (without executing) the query for the non-deleted columns of ``table_uri``, ordered by column type."""
+     belongs_to_table = DatasetTableColumn.tableUri == table_uri
+     not_deleted = DatasetTableColumn.deleted.is_(None)
+     columns_query = session.query(DatasetTableColumn).filter(belongs_to_table, not_deleted)
+     return columns_query.order_by(DatasetTableColumn.columnType.asc())
diff --git a/backend/dataall/modules/s3_datasets/db/dataset_models.py b/backend/dataall/modules/s3_datasets/db/dataset_models.py
index 3e9291485..819d9065d 100644
--- a/backend/dataall/modules/s3_datasets/db/dataset_models.py
+++ b/backend/dataall/modules/s3_datasets/db/dataset_models.py
@@ -46,7 +46,10 @@ class DatasetStorageLocation(Resource, Base):
S3BucketName = Column(String, nullable=False)
S3Prefix = Column(String, nullable=False)
S3AccessPoint = Column(String, nullable=True)
+ label = Column(String, nullable=False)
region = Column(String, default='eu-west-1')
+ description = Column(String, nullable=True)
+ tags = Column(ARRAY(String))
locationCreated = Column(Boolean, default=False)
userRoleForStorageLocation = query_expression()
projectPermission = query_expression()
diff --git a/backend/dataall/modules/s3_datasets/db/dataset_repositories.py b/backend/dataall/modules/s3_datasets/db/dataset_repositories.py
index 075575a2a..feedf7fb5 100644
--- a/backend/dataall/modules/s3_datasets/db/dataset_repositories.py
+++ b/backend/dataall/modules/s3_datasets/db/dataset_repositories.py
@@ -1,6 +1,7 @@
import logging
-from sqlalchemy import and_, or_
+import sqlalchemy
+from sqlalchemy import and_, or_, literal
from sqlalchemy.orm import Query
from dataall.core.activity.db.activity_models import Activity
from dataall.core.environment.db.environment_models import Environment
@@ -9,7 +10,7 @@
from dataall.base.db.exceptions import ObjectNotFound
from dataall.modules.datasets_base.services.datasets_enums import ConfidentialityClassification, Language
from dataall.core.environment.services.environment_resource_manager import EnvironmentResource
-from dataall.modules.s3_datasets.db.dataset_models import DatasetTable, S3Dataset
+from dataall.modules.s3_datasets.db.dataset_models import DatasetTable, S3Dataset, DatasetStorageLocation
from dataall.base.utils.naming_convention import (
NamingConventionService,
NamingConventionPattern,
@@ -278,3 +279,41 @@ def _set_import_data(dataset, data):
dataset.importedAdminRole = True if data.get('adminRoleName') else False
if data.get('imported'):
dataset.KmsAlias = data.get('KmsKeyAlias') if data.get('KmsKeyAlias') else 'SSE-S3'
+
+ @staticmethod
+ def query_dataset_tables_folders(session, dataset_uri):
+     """Build the query listing the tables and folders of a dataset as metadata generation targets.
+
+     Each row carries ``datasetUri``, ``targetUri``, ``name`` and a literal ``targetType``
+     (``'Table'`` or ``'Folder'``).
+
+     :param session: database session used to build the query
+     :param dataset_uri: URI of the dataset whose tables and folders are listed
+     :return: the union of the table query and the folder query
+     """
+
+     def _targets_query(model, uri_column, target_type):
+         # Same projection for both target kinds; only the joined model and the literal type differ
+         return (
+             session.query(
+                 S3Dataset.datasetUri,
+                 uri_column.label('targetUri'),
+                 model.name.label('name'),
+                 literal(target_type, type_=sqlalchemy.types.String).label('targetType'),
+             )
+             .join(model, model.datasetUri == S3Dataset.datasetUri)
+             .filter(S3Dataset.datasetUri == dataset_uri)
+         )
+
+     tables = _targets_query(DatasetTable, DatasetTable.tableUri, 'Table')
+     folders = _targets_query(DatasetStorageLocation, DatasetStorageLocation.locationUri, 'Folder')
+     return tables.union(folders)
+
+ @staticmethod
+ def paginated_dataset_tables_folders(session, dataset_uri, data):
+     """Return one page of the tables and folders of a dataset as a dict.
+
+     :param session: database session the query runs on
+     :param dataset_uri: URI of the dataset whose tables and folders are listed
+     :param data: optional filter dict; ``page`` defaults to 1 and ``pageSize`` to 10
+     :return: the paginated result converted with ``to_dict()``
+     """
+     # Treat a missing filter (None) like an empty one so the defaults below apply
+     data = data or {}
+     return paginate(
+         query=DatasetRepository.query_dataset_tables_folders(session, dataset_uri),
+         page=data.get('page', 1),
+         page_size=data.get('pageSize', 10),
+     ).to_dict()
diff --git a/backend/dataall/modules/s3_datasets/services/dataset_column_service.py b/backend/dataall/modules/s3_datasets/services/dataset_column_service.py
index 987b855a4..40987c4c7 100644
--- a/backend/dataall/modules/s3_datasets/services/dataset_column_service.py
+++ b/backend/dataall/modules/s3_datasets/services/dataset_column_service.py
@@ -70,3 +70,11 @@ def update_table_column_description(column_uri: str, description) -> DatasetTabl
Worker.queue(engine=get_context().db_engine, task_ids=[task.taskUri])
return column
+
+ @staticmethod
+ def batch_update_table_columns_description(columns):
+     """Update the description of several table columns, one after the other.
+
+     :param columns: iterable of dicts, each with a ``subitem_id`` (column URI) and a ``description``
+     :return: the string ``'Success'`` once every column has been processed
+     """
+     for entry in columns:
+         column_uri, description = entry['subitem_id'], entry['description']
+         DatasetColumnService.update_table_column_description(column_uri=column_uri, description=description)
+     return 'Success'
diff --git a/backend/dataall/modules/s3_datasets/services/dataset_enums.py b/backend/dataall/modules/s3_datasets/services/dataset_enums.py
new file mode 100644
index 000000000..f531fbfe1
--- /dev/null
+++ b/backend/dataall/modules/s3_datasets/services/dataset_enums.py
@@ -0,0 +1,18 @@
+from enum import Enum
+
+
+class MetadataGenerationTargets(Enum):
+ """Describes the s3_datasets metadata generation targets"""
+
+ # Kinds of resources metadata can be generated for
+ Table = 'Table'
+ Folder = 'Folder'
+ S3_Dataset = 'S3_Dataset'
+
+
+class MetadataGenerationTypes(Enum):
+ """Describes the s3_datasets metadata generation types"""
+
+ # Kinds of metadata that can be requested; values are lowercase
+ # (presumably the field names of the generated metadata — confirm against the Bedrock client)
+ Description = 'description'
+ Label = 'label'
+ Tag = 'tags'
+ Topic = 'topics'
diff --git a/backend/dataall/modules/s3_datasets/services/dataset_location_service.py b/backend/dataall/modules/s3_datasets/services/dataset_location_service.py
index ee83d1c5f..f2ed0c05e 100644
--- a/backend/dataall/modules/s3_datasets/services/dataset_location_service.py
+++ b/backend/dataall/modules/s3_datasets/services/dataset_location_service.py
@@ -1,7 +1,12 @@
+import logging
+
from dataall.modules.s3_datasets.indexers.dataset_indexer import DatasetIndexer
from dataall.base.context import get_context
from dataall.core.permissions.services.resource_policy_service import ResourcePolicyService
from dataall.core.permissions.services.tenant_policy_service import TenantPolicyService
+
+##TODO
+##from dataall.core.resource_threshold.services.resource_threshold_service import ResourceThresholdService
from dataall.modules.catalog.db.glossary_repositories import GlossaryRepository
from dataall.base.db.exceptions import ResourceShared, ResourceAlreadyExists
from dataall.modules.s3_datasets.services.dataset_service import DatasetService
@@ -18,6 +23,10 @@
from dataall.modules.s3_datasets.services.dataset_permissions import DATASET_FOLDER_READ, GET_DATASET_FOLDER
from dataall.modules.s3_datasets.db.dataset_repositories import DatasetRepository
from dataall.modules.s3_datasets.db.dataset_models import DatasetStorageLocation, S3Dataset
+from dataall.modules.s3_datasets.aws.bedrock_metadata_client import BedrockClient
+from dataall.modules.s3_datasets.aws.s3_dataset_client import S3DatasetClient
+
+log = logging.getLogger(__name__)
class DatasetLocationService:
@@ -135,3 +144,19 @@ def _delete_dataset_folder_read_permission(session, dataset: S3Dataset, location
}
for group in permission_group:
ResourcePolicyService.delete_resource_policy(session=session, group=group, resource_uri=location_uri)
+
+ @staticmethod
+ @ResourcePolicyService.has_resource_permission(UPDATE_DATASET_FOLDER)
+ # TODO: the invocation-count check below is disabled
+ # @ResourceThresholdService.check_invocation_count(
+ #     'metadata', 'modules.s3_datasets.features.generate_metadata_ai.max_count_per_day'
+ # )
+ def generate_metadata_for_folder(uri, metadata_types):
+     """Generate metadata for a dataset folder from the keys of the files stored under it.
+
+     :param uri: URI of the folder (storage location)
+     :param metadata_types: metadata types to generate, passed through to the Bedrock client
+     :return: single-item list with ``targetUri``, ``targetType`` ``'Folder'`` and the generated metadata
+     """
+     with get_context().db_engine.scoped_session() as session:
+         folder = DatasetLocationRepository.get_location_by_uri(session, uri)
+         dataset = DatasetRepository.get_dataset_by_uri(session, folder.datasetUri)
+         bucket_objects = S3DatasetClient(dataset).list_bucket_files(folder.S3BucketName, folder.S3Prefix)
+         bedrock = BedrockClient()
+         # Only the object keys are handed to the model
+         object_keys = [bucket_object['Key'] for bucket_object in bucket_objects]
+         generated = bedrock.invoke_model_folder_metadata(
+             metadata_types=metadata_types, folder=folder, files=object_keys
+         )
+         return [{'targetUri': uri, 'targetType': 'Folder'} | generated]
diff --git a/backend/dataall/modules/s3_datasets/services/dataset_service.py b/backend/dataall/modules/s3_datasets/services/dataset_service.py
index a73486457..f496ccdc3 100644
--- a/backend/dataall/modules/s3_datasets/services/dataset_service.py
+++ b/backend/dataall/modules/s3_datasets/services/dataset_service.py
@@ -19,6 +19,9 @@
from dataall.core.stacks.db.stack_repositories import StackRepository
from dataall.core.stacks.db.stack_models import Stack
from dataall.core.tasks.db.task_models import Task
+
+##TODO
+##from dataall.core.resource_threshold.services.resource_threshold_service import ResourceThresholdService
from dataall.modules.catalog.db.glossary_repositories import GlossaryRepository
from dataall.modules.s3_datasets.db.dataset_bucket_repositories import DatasetBucketRepository
from dataall.modules.shares_base.db.share_object_repositories import ShareObjectRepository
@@ -29,6 +32,7 @@
from dataall.modules.s3_datasets.db.dataset_table_repositories import DatasetTableRepository
from dataall.modules.s3_datasets.indexers.dataset_indexer import DatasetIndexer
from dataall.modules.s3_datasets.services.dataset_permissions import (
+ GET_DATASET,
CREDENTIALS_DATASET,
CRAWL_DATASET,
DELETE_DATASET,
@@ -46,6 +50,7 @@
from dataall.modules.datasets_base.db.dataset_models import DatasetBase
from dataall.modules.s3_datasets.services.dataset_permissions import DATASET_TABLE_ALL
from dataall.modules.datasets_base.services.dataset_service_interface import DatasetServiceInterface
+from dataall.modules.s3_datasets.aws.bedrock_metadata_client import BedrockClient
log = logging.getLogger(__name__)
@@ -561,3 +566,29 @@ def _delete_dataset_term_links(session, dataset_uri):
for table_uri in tables:
GlossaryRepository.delete_glossary_terms_links(session, table_uri, 'DatasetTable')
GlossaryRepository.delete_glossary_terms_links(session, dataset_uri, 'Dataset')
+
+ @staticmethod
+ @ResourcePolicyService.has_resource_permission(GET_DATASET)
+ def list_dataset_tables_folders(uri, filter):
+     """Return one page of the tables and folders belonging to a dataset.
+
+     :param uri: URI of the dataset
+     :param filter: pagination filter forwarded to the repository
+     :return: the paginated result produced by ``DatasetRepository.paginated_dataset_tables_folders``
+     """
+     db_engine = get_context().db_engine
+     with db_engine.scoped_session() as session:
+         page = DatasetRepository.paginated_dataset_tables_folders(session, uri, filter)
+         return page
+
+ @staticmethod
+ @ResourcePolicyService.has_resource_permission(UPDATE_DATASET)
+ # TODO: the invocation-count check below is disabled
+ # @ResourceThresholdService.check_invocation_count(
+ #     'metadata', 'modules.s3_datasets.features.generate_metadata_ai.max_count_per_day'
+ # )
+ def generate_metadata_for_dataset(uri, metadata_types):
+     """Generate metadata for a dataset from the dataset itself, its tables and its folders.
+
+     :param uri: URI of the dataset
+     :param metadata_types: metadata types to generate, passed through to the Bedrock client
+     :return: single-item list with ``targetUri``, ``targetType`` ``'S3_Dataset'`` and the generated metadata
+     """
+     with get_context().db_engine.scoped_session() as session:
+         dataset = DatasetBaseRepository.get_dataset_by_uri(session, uri)
+         dataset_uri = dataset.datasetUri
+         generation_inputs = {
+             'metadata_types': metadata_types,
+             'dataset': dataset,
+             'tables': DatasetRepository.get_dataset_tables(session, dataset_uri),
+             'folders': DatasetLocationRepository.get_dataset_folders(session, dataset_uri),
+         }
+         generated = BedrockClient().invoke_model_dataset_metadata(**generation_inputs)
+         return [{'targetUri': uri, 'targetType': 'S3_Dataset'} | generated]
diff --git a/backend/dataall/modules/s3_datasets/services/dataset_table_service.py b/backend/dataall/modules/s3_datasets/services/dataset_table_service.py
index 386ab252e..375a3c490 100644
--- a/backend/dataall/modules/s3_datasets/services/dataset_table_service.py
+++ b/backend/dataall/modules/s3_datasets/services/dataset_table_service.py
@@ -3,6 +3,9 @@
from dataall.base.context import get_context
from dataall.core.permissions.services.resource_policy_service import ResourcePolicyService
from dataall.core.permissions.services.tenant_policy_service import TenantPolicyService
+
+##TODO
+##from dataall.core.resource_threshold.services.resource_threshold_service import ResourceThresholdService
from dataall.modules.catalog.db.glossary_repositories import GlossaryRepository
from dataall.core.environment.services.environment_service import EnvironmentService
from dataall.modules.s3_datasets.aws.athena_table_client import AthenaTableClient
@@ -29,6 +32,9 @@
from dataall.modules.s3_datasets.services.dataset_service import DatasetService
from dataall.base.utils import json_utils
from dataall.base.db import exceptions
+from dataall.modules.s3_datasets.aws.bedrock_metadata_client import BedrockClient
+from dataall.modules.s3_datasets.db.dataset_column_repositories import DatasetColumnRepository
+
log = logging.getLogger(__name__)
@@ -183,3 +189,24 @@ def _delete_dataset_table_read_permission(session, table_uri):
ResourcePolicyService.delete_resource_policy(
session=session, group=None, resource_uri=table_uri, resource_type=DatasetTable.__name__
)
+
+ @staticmethod
+ @ResourcePolicyService.has_resource_permission(UPDATE_DATASET_TABLE)
+ # TODO: the invocation-count check below is disabled
+ # @ResourceThresholdService.check_invocation_count(
+ #     'metadata', 'modules.s3_datasets.features.generate_metadata_ai.max_count_per_day'
+ # )
+ def generate_metadata_for_table(uri, metadata_types, sample_data):
+     """Generate metadata for a table and for its columns.
+
+     :param uri: URI of the table
+     :param metadata_types: metadata types to generate, passed through to the Bedrock client
+     :param sample_data: sample data of the table, passed through to the Bedrock client
+     :return: list whose first item holds ``targetUri``, ``targetType`` ``'Table'`` and the table-level
+         metadata, followed by one ``'Table_Column'`` item (``targetUri``, ``description``) per entry
+         of the generated ``columns_metadata``
+     """
+     context = get_context()
+     with context.db_engine.scoped_session() as session:
+         table = DatasetTableRepository.get_dataset_table_by_uri(session, uri)
+         table_columns = DatasetColumnRepository.get_table_info_metadata_generation(session, table.tableUri)
+         metadata = BedrockClient().invoke_model_table_metadata(
+             table=table, columns=table_columns, metadata_types=metadata_types, sample_data=sample_data
+         )
+         # Split the per-column descriptions off so that only table-level metadata remains in `metadata`;
+         # a missing or empty entry simply yields no column items
+         columns_metadata = metadata.pop('columns_metadata', None) or {}
+
+         # The dict merge must happen outside the literal: `'Table' | <dict>` raises TypeError
+         return [{'targetUri': uri, 'targetType': 'Table'} | metadata] + [
+             {'targetUri': key, 'targetType': 'Table_Column', 'description': value}
+             for key, value in columns_metadata.items()
+         ]
diff --git a/backend/requirements.txt b/backend/requirements.txt
index 05cb6619c..f8c34c124 100644
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@@ -13,3 +13,5 @@ requests_aws4auth==1.1.1
sqlalchemy==1.3.24
alembic==1.13.1
retrying==1.3.4
+langchain-aws==0.2.2
+langchain-core==0.3.11
diff --git a/config.json b/config.json
index e3af66063..e621d43f7 100644
--- a/config.json
+++ b/config.json
@@ -58,7 +58,11 @@
"preview_data": true,
"glue_crawler": true,
"metrics_data": true,
- "show_stack_logs": "enabled"
+ "show_stack_logs": "enabled",
+ "generate_metadata_ai": {
+ "active": true,
+ "max_count_per_day": 25
+ }
}
},
"shares_base": {
diff --git a/frontend/src/modules/S3_Datasets/components/GenerateMetadataComponent.js b/frontend/src/modules/S3_Datasets/components/GenerateMetadataComponent.js
new file mode 100644
index 000000000..a88cc5c1d
--- /dev/null
+++ b/frontend/src/modules/S3_Datasets/components/GenerateMetadataComponent.js
@@ -0,0 +1,417 @@
+import { useState, useCallback } from 'react';
+import {
+ Avatar,
+ Box,
+ Button,
+ Checkbox,
+ Chip,
+ Divider,
+ FormControl,
+ FormGroup,
+ FormControlLabel,
+ FormLabel,
+ Grid,
+ InputLabel,
+ MenuItem,
+ Select,
+ Switch,
+ Typography
+} from '@mui/material';
+import { DataGrid } from '@mui/x-data-grid';
+import { useSnackbar } from 'notistack';
+import PropTypes from 'prop-types';
+import AutoModeIcon from '@mui/icons-material/AutoMode';
+import { Defaults, Scrollbar } from 'design';
+import { SET_ERROR, useDispatch } from 'globalErrors';
+import { useClient } from 'services';
+import { listDatasetTablesFolders, generateMetadataBedrock } from '../services';
+
+export const GenerateMetadataComponent = (props) => {
+ const {
+ dataset,
+ targetType,
+ setTargetType,
+ targets,
+ setTargets,
+ targetOptions,
+ setTargetOptions,
+ selectedMetadataTypes,
+ setSelectedMetadataTypes,
+ currentView,
+ setCurrentView,
+ loadingMetadata,
+ setLoadingMetadata,
+ ...other
+ } = props;
+ const { enqueueSnackbar } = useSnackbar();
+ const dispatch = useDispatch();
+
+ const client = useClient();
+ const [loadingTableFolder, setLoadingTableFolder] = useState(false);
+ const [tableFolderFilter, setTableFolderFilter] = useState(Defaults.filter);
+ // Handles a change of the target type selector: a 'Dataset' selection targets the dataset
+ // itself, any other value loads the dataset's tables and folders as selectable options.
+ const handleChange = useCallback(
+   async (event) => {
+     setTargetType(event.target.value);
+     if (event.target.value === 'Dataset') {
+       setTargets([
+         {
+           targetUri: dataset.datasetUri,
+           targetType: 'S3_Dataset',
+           name: dataset.name
+         }
+       ]);
+     } else {
+       setTargets([]);
+       setLoadingTableFolder(true);
+       try {
+         const response = await client.query(
+           listDatasetTablesFolders({
+             datasetUri: dataset.datasetUri,
+             filter: tableFolderFilter
+           })
+         );
+         if (!response.errors) {
+           setTargetOptions(response.data.listDatasetTablesFolders);
+         } else {
+           dispatch({
+             type: SET_ERROR,
+             error: response.errors[0].message + dataset.datasetUri
+           });
+         }
+       } catch (e) {
+         // A rejected request is reported like a GraphQL error instead of being left unhandled
+         dispatch({ type: SET_ERROR, error: e.message });
+       } finally {
+         // Always clear the loading flag, also when the request fails
+         setLoadingTableFolder(false);
+       }
+     }
+   },
+   // Every value read inside the callback is listed so it never runs with a stale dataset or filter
+   [
+     client,
+     dispatch,
+     dataset,
+     tableFolderFilter,
+     setTargetType,
+     setTargets,
+     setTargetOptions
+   ]
+ );
+
+ // Stores the checked state of a metadata type checkbox under the checkbox's name.
+ const handleMetadataChange = (event) => {
+   const { name, checked } = event.target;
+   setSelectedMetadataTypes({ ...selectedMetadataTypes, [name]: checked });
+ };
+
+ // Moves the table/folder filter to another page. `page` arrives 0-indexed while the
+ // filter expects 1-indexing.
+ // NOTE(review): nothing in this handler re-runs the listDatasetTablesFolders query —
+ // confirm the new page is fetched elsewhere.
+ const handlePageChange = async (page) => {
+   const requestedPage = page + 1;
+   if (
+     requestedPage <= targetOptions.pages &&
+     requestedPage !== targetOptions.page
+   ) {
+     // State setters return no promise, so there is nothing to await here
+     setTableFolderFilter({ ...tableFolderFilter, page: requestedPage });
+   }
+ };
+
+ // Requests metadata for every selected target, one request at a time, stores the result
+ // on the target, notifies the user per target and finally switches to the review view.
+ // NOTE(review): the objects of the `targets` prop are mutated in place; presumably the
+ // parent re-renders on setCurrentView — confirm.
+ const generateMetadata = async () => {
+   // The selected metadata types are the same for every target, so compute them once
+   const metadataTypes = Object.entries(selectedMetadataTypes)
+     .filter(([, value]) => value === true)
+     .map(([key]) => key);
+   const notify = (message, variant) =>
+     enqueueSnackbar(message, {
+       anchorOrigin: {
+         horizontal: 'right',
+         vertical: 'top'
+       },
+       variant
+     });
+
+   for (const target of targets) {
+     const response = await client.mutate(
+       generateMetadataBedrock({
+         resourceUri: target.targetUri,
+         targetType: target.targetType,
+         metadataTypes,
+         tableSampleData: {}
+       })
+     );
+     if (response.errors) {
+       // Mark every field as failed and report the first error
+       target.description = 'Error';
+       target.label = 'Error';
+       target.tags = 'Error';
+       target.topics = 'Error';
+       dispatch({
+         type: SET_ERROR,
+         error: response.errors[0].message + dataset.datasetUri
+       });
+       continue;
+     }
+
+     // The response may hold several items; keep the one describing this target
+     const generated = response.data.generateMetadata.find(
+       (item) =>
+         item.targetUri === target.targetUri &&
+         item.targetType === target.targetType
+     );
+     if (generated) {
+       target.description = generated.description;
+       target.label = generated.label;
+       target.tags = generated.tags;
+       target.topics = generated.topics;
+     }
+
+     const hasNotEnoughData = [
+       target.description,
+       target.label,
+       target.tags,
+       target.topics
+     ].includes('NotEnoughData');
+     if (hasNotEnoughData) {
+       notify(
+         `Not enough data to generate metadata for ${target.name}`,
+         'warning'
+       );
+     } else {
+       notify(
+         `Metadata generation is successful for ${target.name}`,
+         'success'
+       );
+     }
+   }
+   setCurrentView('REVIEW_METADATA');
+ };
+ return (
+ <>
+