From bdc60fb94a8fefcc006463f635175e59f09262a8 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 16 Apr 2024 09:00:14 -0700 Subject: [PATCH 001/110] mypy formated some schema files --- schematic/schemas/commands.py | 6 ++--- schematic/schemas/data_model_graph.py | 21 ++++++++++----- schematic/schemas/data_model_json_schema.py | 14 +++++----- schematic/schemas/data_model_jsonld.py | 26 ++++++++++++------- schematic/schemas/data_model_relationships.py | 2 +- schematic/schemas/data_model_validator.py | 14 +++++----- 6 files changed, 48 insertions(+), 35 deletions(-) diff --git a/schematic/schemas/commands.py b/schematic/schemas/commands.py index 17f622a6c..80583734b 100644 --- a/schematic/schemas/commands.py +++ b/schematic/schemas/commands.py @@ -3,7 +3,7 @@ import logging import time import re -from typing import get_args +from typing import get_args, Optional import click import click_log # type: ignore @@ -29,7 +29,7 @@ # invoke_without_command=True -> forces the application not to show aids before # losing them with a --h @click.group(context_settings=CONTEXT_SETTINGS, invoke_without_command=True) -def schema(): # use as `schematic model ...` +def schema() -> None: # use as `schematic model ...` """ Sub-commands for Schema related utilities/methods. """ @@ -59,7 +59,7 @@ def schema(): # use as `schematic model ...` metavar="", help=query_dict(schema_commands, ("schema", "convert", "output_jsonld")), ) -def convert(schema, data_model_labels, output_jsonld): +def convert(schema, data_model_labels: DisplayLabelType, output_jsonld: Optional[str]): """ Running CLI to convert data model specification in CSV format to data model in JSON-LD format. diff --git a/schematic/schemas/data_model_graph.py b/schematic/schemas/data_model_graph.py index d4a12103c..037405b95 100644 --- a/schematic/schemas/data_model_graph.py +++ b/schematic/schemas/data_model_graph.py @@ -29,13 +29,15 @@ class DataModelGraphMeta: # pylint: disable=too-few-public-methods _instances: dict = {} - def __call__(cls, *args: Any, **kwargs: Any): # pylint: disable=no-self-argument + def __call__( # pylint: disable=no-self-argument + cls, *args: Any, **kwargs: Any + ) -> Any: """ Possible changes to the value of the `__init__` argument do not affect the returned instance. 
""" if cls not in cls._instances: - instance = super().__call__(*args, **kwargs) # pylint: disable=no-member + instance = super().__call__(*args, **kwargs) # type: ignore # pylint: disable=no-member cls._instances[cls] = instance return cls._instances[cls] @@ -247,14 +249,17 @@ def get_component_node_validation_rules( # Parse the validation rules per component if applicable if node_validation_rules and isinstance(node_validation_rules, dict): - node_validation_rules = extract_component_validation_rules( + node_validation_rules_list = extract_component_validation_rules( manifest_component=manifest_component, - validation_rules_dict=node_validation_rules, + validation_rules_dict=node_validation_rules, # type: ignore ) - return node_validation_rules + else: + assert isinstance(node_validation_rules, list) + node_validation_rules_list = node_validation_rules + return node_validation_rules_list def get_component_node_required( - self, manifest_component, node_display_name + self, manifest_component: str, node_display_name: str ) -> bool: """Check if a node is required taking into account the manifest component it is defined in (requirements can be set in validaiton rule as well as required column) @@ -802,7 +807,9 @@ def find_child_classes(self, schema_class: str) -> list: Returns: list of children to the schema_class. """ - return unlist(list(self.graph.successors(schema_class))) + child_classes = unlist(list(self.graph.successors(schema_class))) + assert isinstance(child_classes, list) + return child_classes def find_class_specific_properties(self, schema_class: str) -> list[str]: """Find properties specifically associated with a given class diff --git a/schematic/schemas/data_model_json_schema.py b/schematic/schemas/data_model_json_schema.py index 1d7c98a30..97ba8e689 100644 --- a/schematic/schemas/data_model_json_schema.py +++ b/schematic/schemas/data_model_json_schema.py @@ -73,7 +73,7 @@ def get_non_blank_schema( return non_blank_schema def get_range_schema( - self, node_range: list[str], node_name: str, blank=False + self, node_range: list[str], node_name: str, blank: bool = False ) -> dict[str, dict[str, list[str]]]: """ Add a list of nodes to the "enum" key in a given JSON schema object. 
@@ -225,7 +225,7 @@ def get_json_validation_schema( # otherwise, by default allow any values schema_valid_vals = {node_display_name: {}} - json_schema["properties"].update(schema_valid_vals) + json_schema["properties"].update(schema_valid_vals) # type: ignore # set schema conditional dependencies for node in reverse_dependencies[node_display_name]: @@ -287,7 +287,7 @@ def get_json_validation_schema( } # update conditional-dependency rules in json schema - json_schema["allOf"].append( + json_schema["allOf"].append( # type: ignore schema_conditional_dependencies ) @@ -315,9 +315,9 @@ def get_json_validation_schema( node_name=node_display_name ) - json_schema["properties"].update(schema_valid_vals) + json_schema["properties"].update(schema_valid_vals) # type: ignore # add node to required fields - json_schema["required"] += [node_display_name] + json_schema["required"] += [node_display_name] # type: ignore elif process_node in root_dependencies: # node doesn't have conditionals and is not required; it belongs in the @@ -341,7 +341,7 @@ def get_json_validation_schema( else: schema_valid_vals = {node_display_name: {}} - json_schema["properties"].update(schema_valid_vals) + json_schema["properties"].update(schema_valid_vals) # type: ignore else: # node doesn't have conditionals and it is not required and it @@ -394,4 +394,4 @@ def get_json_validation_schema( prefix_root, prefix_ext = os.path.splitext(prefix) if prefix_ext == ".model": prefix = prefix_root - return json_schema + return json_schema # type: ignore diff --git a/schematic/schemas/data_model_jsonld.py b/schematic/schemas/data_model_jsonld.py index 58f86cfe6..b6cecd82e 100644 --- a/schematic/schemas/data_model_jsonld.py +++ b/schematic/schemas/data_model_jsonld.py @@ -3,10 +3,12 @@ import json import logging import copy +from typing import Union + from dataclasses import dataclass, field from dataclasses_json import config, dataclass_json -import networkx as nx +import networkx as nx # type: ignore from schematic.schemas.data_model_graph import DataModelGraphExplorer from schematic.schemas.data_model_relationships import DataModelRelationships @@ -125,17 +127,17 @@ def __init__(self, graph: nx.MultiDiGraph, output_path: str = ""): # Gather the templates base_template = BaseTemplate() self.base_jsonld_template = json.loads( - base_template.to_json() # pylint:disable=no-member + base_template.to_json() # type: ignore # pylint:disable=no-member ) property_template = PropertyTemplate() self.property_template = json.loads( - property_template.to_json() # pylint:disable=no-member + property_template.to_json() # type: ignore # pylint:disable=no-member ) class_template = ClassTemplate() self.class_template = json.loads( - class_template.to_json() # pylint:disable=no-member + class_template.to_json() # type: ignore # pylint:disable=no-member ) def get_edges_associated_with_node( @@ -177,9 +179,9 @@ def get_edges_associated_with_property_nodes( node_edges.append((node, node_2, edge_dict[edge_key])) return node_edges - def add_edge_rels_to_template( + def add_edge_rels_to_template( # pylint:disable=too-many-branches self, template: dict, rel_vals: dict, node: str - ): # pylint:disable=too-many-branches + ) -> dict: """ Args: template, dict: single class or property JSONLD template that is in the process of being @@ -271,7 +273,9 @@ def add_edge_rels_to_template( ) return template - def add_node_info_to_template(self, template, rel_vals, node): + def add_node_info_to_template( + self, template: dict, rel_vals: dict, node: str + ) -> dict: """For a 
given node and relationship, add relevant value to template Args: template, dict: single class or property JSONLD template that is in the process @@ -473,7 +477,9 @@ def reorder_template_entries(self, template: dict) -> dict: ) edge_weights_dict = {edge: i for i, edge in enumerate(sorted_edges)} - ordered_edges = [0] * len(edge_weights_dict.keys()) + ordered_edges: list[Union[int, dict]] = [0] * len( + edge_weights_dict.keys() + ) for edge, normalized_weight in edge_weights_dict.items(): ordered_edges[normalized_weight] = {"@id": "bts:" + edge} @@ -488,7 +494,7 @@ def reorder_template_entries(self, template: dict) -> dict: template[jsonld_key] = ordered_edges return template - def generate_jsonld_object(self): + def generate_jsonld_object(self) -> dict: """Create the JSONLD object. Returns: jsonld_object, dict: JSONLD object containing all nodes and related information @@ -513,7 +519,7 @@ def generate_jsonld_object(self): return json_ld_template -def convert_graph_to_jsonld(graph): +def convert_graph_to_jsonld(graph) -> dict: """convert graph to jsonld""" # Make the JSONLD object data_model_jsonld_converter = DataModelJsonLD(graph=graph) diff --git a/schematic/schemas/data_model_relationships.py b/schematic/schemas/data_model_relationships.py index f83de5d8e..c222285da 100644 --- a/schematic/schemas/data_model_relationships.py +++ b/schematic/schemas/data_model_relationships.py @@ -192,7 +192,7 @@ def define_data_model_relationships(self) -> dict: return map_data_model_relationships - def define_required_csv_headers(self): + def define_required_csv_headers(self) -> list: """ Helper function to retrieve required CSV headers, alert if required header was not provided. diff --git a/schematic/schemas/data_model_validator.py b/schematic/schemas/data_model_validator.py index 897c33738..ca3b7e31b 100644 --- a/schematic/schemas/data_model_validator.py +++ b/schematic/schemas/data_model_validator.py @@ -85,7 +85,7 @@ def check_graph_has_required_node_fields(self) -> list[str]: ) return error - def run_cycles(self): + def run_cycles(self) -> None: """run_cycles""" cycles = nx.simple_cycles(self.graph) if cycles: # pylint:disable=using-constant-test @@ -188,27 +188,27 @@ def check_reserved_names(self) -> list[str]: ) return error - def check_namespace_overlap(self): + def check_namespace_overlap(self) -> list: """ Check if name is repeated. Implement in the future """ - warning = [] + warning: list = [] return warning - def check_for_orphan_attributes(self): + def check_for_orphan_attributes(self) -> list: """ Check if attribute is specified but not connected to another attribute or component. Implement in future """ - warning = [] + warning: list = [] return warning - def check_namespace_similarity(self): + def check_namespace_similarity(self) -> list: """ Using AI, check if submitted attributes or valid values are similar to other ones, warn users. 
Implement in future """ - warning = [] + warning: list = [] return warning From e2e5b7fec304fdc79d3fb6b3a4292b91c70e2547 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 16 Apr 2024 09:25:57 -0700 Subject: [PATCH 002/110] mypy formatted schema files --- schematic/schemas/commands.py | 2 +- schematic/schemas/json_schema_validator.py | 51 ++++++++++++---------- 2 files changed, 30 insertions(+), 23 deletions(-) diff --git a/schematic/schemas/commands.py b/schematic/schemas/commands.py index 80583734b..7e8961761 100644 --- a/schematic/schemas/commands.py +++ b/schematic/schemas/commands.py @@ -59,7 +59,7 @@ def schema() -> None: # use as `schematic model ...` metavar="", help=query_dict(schema_commands, ("schema", "convert", "output_jsonld")), ) -def convert(schema, data_model_labels: DisplayLabelType, output_jsonld: Optional[str]): +def convert(schema, data_model_labels: DisplayLabelType, output_jsonld: Optional[str]) -> None: """ Running CLI to convert data model specification in CSV format to data model in JSON-LD format. diff --git a/schematic/schemas/json_schema_validator.py b/schematic/schemas/json_schema_validator.py index 7369c1f45..0a1760c79 100644 --- a/schematic/schemas/json_schema_validator.py +++ b/schematic/schemas/json_schema_validator.py @@ -1,6 +1,8 @@ """Json Schema Validator""" import os +from typing import Any + from jsonschema import validate from schematic.utils.io_utils import load_schemaorg, load_json @@ -46,12 +48,13 @@ class SchemaValidator: """ - def __init__(self, schema): + def __init__(self, schema: Any) -> None: self.schemaorg = {"schema": load_schemaorg(), "classes": [], "properties": []} for _schema in self.schemaorg["schema"]["@graph"]: for _record in _schema["@graph"]: if "@type" in _record: _type = str2list(_record["@type"]) + assert isinstance(_type, list) if "rdfs:Property" in _type: self.schemaorg["properties"].append(_record["@id"]) elif "rdfs:Class" in _type: @@ -61,70 +64,74 @@ def __init__(self, schema): "classes": [], "properties": [], } - for _record in self.extension_schema["schema"]["@graph"]: + for _record in self.extension_schema["schema"]["@graph"]: # type: ignore _type = str2list(_record["@type"]) + assert isinstance(_type, list) if "rdfs:Property" in _type: - self.extension_schema["properties"].append(_record["@id"]) + self.extension_schema["properties"].append(_record["@id"]) # type: ignore elif "rdfs:Class" in _type: - self.extension_schema["classes"].append(_record["@id"]) + self.extension_schema["classes"].append(_record["@id"]) # type: ignore self.all_classes = self.schemaorg["classes"] + self.extension_schema["classes"] - def validate_class_label(self, label_uri): + def validate_class_label(self, label_uri: str) -> None: """Check if the first character of class label is capitalized""" label = extract_name_from_uri_or_curie(label_uri) assert label[0].isupper() - def validate_property_label(self, label_uri): + def validate_property_label(self, label_uri: str) -> None: """Check if the first character of property label is lower case""" label = extract_name_from_uri_or_curie(label_uri) assert label[0].islower() - def validate_subclassof_field(self, subclassof_value): + def validate_subclassof_field(self, subclassof_value: dict) -> None: """Check if the value of "subclassof" is included in the schema file""" - subclassof_value = dict2list(subclassof_value) + subclassof_value_list = dict2list(subclassof_value) + assert isinstance(subclassof_value_list, list) for record in subclassof_value: assert record["@id"] in self.all_classes - def 
validate_domain_includes_field(self, domainincludes_value): + def validate_domain_includes_field(self, domainincludes_value: dict) -> None: """Check if the value of "domainincludes" is included in the schema file """ - domainincludes_value = dict2list(domainincludes_value) - for record in domainincludes_value: + domainincludes_value_list = dict2list(domainincludes_value) + assert isinstance(domainincludes_value_list, list) + for record in domainincludes_value_list: assert ( record["@id"] in self.all_classes ), f"value of domainincludes not recorded in schema: {domainincludes_value}" - def validate_range_includes_field(self, rangeincludes_value): + def validate_range_includes_field(self, rangeincludes_value: dict) -> None: """Check if the value of "rangeincludes" is included in the schema file """ - rangeincludes_value = dict2list(rangeincludes_value) - for record in rangeincludes_value: + rangeincludes_value_list = dict2list(rangeincludes_value) + assert isinstance(rangeincludes_value_list, list) + for record in rangeincludes_value_list: assert record["@id"] in self.all_classes - def check_whether_atid_and_label_match(self, record): + def check_whether_atid_and_label_match(self, record: dict) -> None: """Check if @id field matches with the "rdfs:label" field""" _id = extract_name_from_uri_or_curie(record["@id"]) assert _id == record["rdfs:label"], f"id and label not match: {record}" - def check_duplicate_labels(self): + def check_duplicate_labels(self) -> None: """Check for duplication in the schema""" labels = [ _record["rdfs:label"] - for _record in self.extension_schema["schema"]["@graph"] + for _record in self.extension_schema["schema"]["@graph"] # type: ignore ] duplicates = find_duplicates(labels) if len(duplicates) == 0: raise ValueError("Duplicates detected in graph: ", duplicates) - def validate_schema(self, schema): + def validate_schema(self, schema: Any) -> None: """Validate schema against SchemaORG standard""" json_schema_path = os.path.join("validation_schemas", "schema.json") json_schema = load_json(json_schema_path) return validate(schema, json_schema) - def validate_property_schema(self, schema): + def validate_property_schema(self, schema: Any) -> None: """Validate schema against SchemaORG property definition standard""" json_schema_path = os.path.join( "validation_schemas", "property_json_schema.json" @@ -132,16 +139,16 @@ def validate_property_schema(self, schema): json_schema = load_json(json_schema_path) return validate(schema, json_schema) - def validate_class_schema(self, schema): + def validate_class_schema(self, schema: Any) -> None: """Validate schema against SchemaORG class definition standard""" json_schema_path = os.path.join("validation_schemas", "class_json_schema.json") json_schema = load_json(json_schema_path) return validate(schema, json_schema) - def validate_full_schema(self): + def validate_full_schema(self) -> None: """validate full schema""" self.check_duplicate_labels() - for record in self.extension_schema["schema"]["@graph"]: + for record in self.extension_schema["schema"]["@graph"]: # type: ignore self.check_whether_atid_and_label_match(record) if record["@type"] == "rdf:Class": self.validate_class_schema(record) From 8597841b99543879b184f582b9f6573d5c5c9014 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 16 Apr 2024 09:35:26 -0700 Subject: [PATCH 003/110] mypy formatted schema files --- schematic/schemas/commands.py | 4 ++-- schematic/schemas/data_model_nodes.py | 16 +++++++++------- 2 files changed, 11 insertions(+), 9 deletions(-) diff 
--git a/schematic/schemas/commands.py b/schematic/schemas/commands.py index 7e8961761..9ff313a96 100644 --- a/schematic/schemas/commands.py +++ b/schematic/schemas/commands.py @@ -3,7 +3,7 @@ import logging import time import re -from typing import get_args, Optional +from typing import get_args, Optional, Any import click import click_log # type: ignore @@ -59,7 +59,7 @@ def schema() -> None: # use as `schematic model ...` metavar="", help=query_dict(schema_commands, ("schema", "convert", "output_jsonld")), ) -def convert(schema, data_model_labels: DisplayLabelType, output_jsonld: Optional[str]) -> None: +def convert(schema: Any, data_model_labels: DisplayLabelType, output_jsonld: Optional[str]) -> None: """ Running CLI to convert data model specification in CSV format to data model in JSON-LD format. diff --git a/schematic/schemas/data_model_nodes.py b/schematic/schemas/data_model_nodes.py index 24de9cf37..bdd2f8480 100644 --- a/schematic/schemas/data_model_nodes.py +++ b/schematic/schemas/data_model_nodes.py @@ -1,6 +1,6 @@ """Data model Nodes""" -from typing import Optional, Callable +from typing import Optional, Callable, Any from inspect import isfunction import networkx as nx # type: ignore @@ -20,7 +20,7 @@ class DataModelNodes: """Data model Nodes""" - def __init__(self, attribute_relationships_dict): + def __init__(self, attribute_relationships_dict: dict): self.namespaces = { "rdf": Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#") } @@ -141,11 +141,11 @@ def run_rel_functions( rel_func: Callable, node_display_name: str = "", key: str = "", - attr_relationships=None, - csv_header="", + attr_relationships: Optional[dict] = None, + csv_header: str = "", entry_type="", data_model_labels: DisplayLabelType = "class_label", - ): + ) -> Any: """ This function exists to centralzie handling of functions for filling out node information, makes sure all the proper parameters are passed to each function. @@ -259,7 +259,9 @@ def generate_node_dict( # Look through relationship types that represent values (i.e. do not define edges) for key, csv_header in self.value_relationships.items(): # Get key and defalt values current relationship type. 
- rel_key, rel_node_dict = self.get_rel_node_dict_info(key) + rel_node = self.get_rel_node_dict_info(key) + assert rel_node is not None + rel_key, rel_node_dict = rel_node # If we have information to add about this particular node, get it if csv_header in attr_relationships.keys(): @@ -322,6 +324,6 @@ def generate_node(self, graph: nx.MultiDiGraph, node_dict: dict) -> nx.MultiDiGr graph.add_node(node_dict["label"], **node_dict) return graph - def edit_node(self): + def edit_node(self) -> None: """Stub for future node editor.""" return From be21a91dcc25628cfbe059bdd7eb87467c06ab7d Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 16 Apr 2024 09:46:51 -0700 Subject: [PATCH 004/110] mypy schema file --- schematic/schemas/data_model_nodes.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/schematic/schemas/data_model_nodes.py b/schematic/schemas/data_model_nodes.py index bdd2f8480..27431bbd6 100644 --- a/schematic/schemas/data_model_nodes.py +++ b/schematic/schemas/data_model_nodes.py @@ -14,6 +14,7 @@ convert_bool_to_str, parse_validation_rules, DisplayLabelType, + EntryType ) @@ -123,15 +124,17 @@ def get_data_model_properties(self, attr_rel_dict: dict) -> list: properties = list(set(properties)) return properties - def get_entry_type(self, node_display_name: str) -> str: + def get_entry_type(self, node_display_name: str) -> EntryType: """Get the entry type of the node, property or class. + Args: - node_display_name, str: display name of target node. + node_display_name (str): display name of target node. + Returns: - entry_type, str: returns 'property' or 'class' based on data model specifications. + EntryType: returns 'property' or 'class' based on data model specifications. """ if node_display_name in self.properties: - entry_type = "property" + entry_type:EntryType = "property" else: entry_type = "class" return entry_type @@ -143,7 +146,7 @@ def run_rel_functions( key: str = "", attr_relationships: Optional[dict] = None, csv_header: str = "", - entry_type="", + entry_type:EntryType="class", data_model_labels: DisplayLabelType = "class_label", ) -> Any: """ From bb9f873fefa0ff59a24720ed3b7165b3a8c32e2f Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 16 Apr 2024 09:58:40 -0700 Subject: [PATCH 005/110] mypy format schema files --- schematic/schemas/commands.py | 4 +++- schematic/schemas/data_model_jsonld.py | 2 +- schematic/schemas/data_model_nodes.py | 6 +++--- schematic/schemas/data_model_parser.py | 18 +++++++++--------- 4 files changed, 16 insertions(+), 14 deletions(-) diff --git a/schematic/schemas/commands.py b/schematic/schemas/commands.py index 9ff313a96..5b143f640 100644 --- a/schematic/schemas/commands.py +++ b/schematic/schemas/commands.py @@ -59,7 +59,9 @@ def schema() -> None: # use as `schematic model ...` metavar="", help=query_dict(schema_commands, ("schema", "convert", "output_jsonld")), ) -def convert(schema: Any, data_model_labels: DisplayLabelType, output_jsonld: Optional[str]) -> None: +def convert( + schema: Any, data_model_labels: DisplayLabelType, output_jsonld: Optional[str] +) -> None: """ Running CLI to convert data model specification in CSV format to data model in JSON-LD format. 
diff --git a/schematic/schemas/data_model_jsonld.py b/schematic/schemas/data_model_jsonld.py index b6cecd82e..0a2464ecb 100644 --- a/schematic/schemas/data_model_jsonld.py +++ b/schematic/schemas/data_model_jsonld.py @@ -519,7 +519,7 @@ def generate_jsonld_object(self) -> dict: return json_ld_template -def convert_graph_to_jsonld(graph) -> dict: +def convert_graph_to_jsonld(graph: nx.MultiDiGraph) -> dict: """convert graph to jsonld""" # Make the JSONLD object data_model_jsonld_converter = DataModelJsonLD(graph=graph) diff --git a/schematic/schemas/data_model_nodes.py b/schematic/schemas/data_model_nodes.py index 27431bbd6..a8ea4fd09 100644 --- a/schematic/schemas/data_model_nodes.py +++ b/schematic/schemas/data_model_nodes.py @@ -14,7 +14,7 @@ convert_bool_to_str, parse_validation_rules, DisplayLabelType, - EntryType + EntryType, ) @@ -134,7 +134,7 @@ def get_entry_type(self, node_display_name: str) -> EntryType: EntryType: returns 'property' or 'class' based on data model specifications. """ if node_display_name in self.properties: - entry_type:EntryType = "property" + entry_type: EntryType = "property" else: entry_type = "class" return entry_type @@ -146,7 +146,7 @@ def run_rel_functions( key: str = "", attr_relationships: Optional[dict] = None, csv_header: str = "", - entry_type:EntryType="class", + entry_type: EntryType = "class", data_model_labels: DisplayLabelType = "class_label", ) -> Any: """ diff --git a/schematic/schemas/data_model_parser.py b/schematic/schemas/data_model_parser.py index 2ddda42bd..094123859 100644 --- a/schematic/schemas/data_model_parser.py +++ b/schematic/schemas/data_model_parser.py @@ -119,7 +119,7 @@ def parse_model(self) -> dict[str, dict[str, Any]]: class DataModelCSVParser: """DataModelCSVParser""" - def __init__(self): + def __init__(self) -> None: # Instantiate DataModelRelationships self.dmr = DataModelRelationships() # Load relationships dictionary. @@ -233,7 +233,7 @@ def gather_csv_attributes_relationships( def parse_csv_model( self, path_to_data_model: str, - ): + ) -> dict[str, dict[str, Any]]: """Load csv data model and parse into an attributes:relationships dictionary Args: path_to_data_model, str: path to data model @@ -256,7 +256,7 @@ class DataModelJSONLDParser: def __init__( self, - ): + ) -> None: # Instantiate DataModelRelationships self.dmr = DataModelRelationships() # Load relationships dictionary. @@ -283,7 +283,7 @@ def parse_jsonld_dicts( # Retrieve ID from a dictionary recording the ID if set(rel_entry.keys()) == {"@id"}: - parsed_rel_entry = rel_entry["@id"] + parsed_rel_entry: Union[str, dict[str, str]] = rel_entry["@id"] # Parse any remaining dictionaries else: parsed_rel_entry = rel_entry @@ -309,7 +309,7 @@ def parse_entry( """ # Parse dictionary entries if isinstance(rel_entry, dict): - parsed_rel_entry = self.parse_jsonld_dicts(rel_entry) + parsed_rel_entry: Any = self.parse_jsonld_dicts(rel_entry) # Parse list of dictionaries to make a list of entries with context stripped (will update # this section when contexts added.) @@ -338,7 +338,7 @@ def parse_entry( return parsed_rel_entry - def label_to_dn_dict(self, model_jsonld: list[dict]): + def label_to_dn_dict(self, model_jsonld: list[dict]) -> dict: """ Generate a dictionary of labels to display name, so can easily look up display names using the label. 
@@ -358,7 +358,7 @@ def label_to_dn_dict(self, model_jsonld: list[dict]): def convert_entry_to_dn_label( self, parsed_rel_entry: Union[str, list], model_jsonld: list[dict] - ) -> Union[str, list]: + ) -> Union[str, list, None]: """Convert a parsed entry to display name, taking into account the entry type Args: parsed_rel_entry: an entry that has been parsed base on its input type @@ -460,7 +460,7 @@ def gather_jsonld_attributes_relationships(self, model_jsonld: list[dict]) -> di for parsed_val in parsed_rel_entry: attr_in_dict = False # Get propert/parent key (displayName) - p_attr_key = "" + p_attr_key: Any = "" # Check if the parsed value is already a part of the # attr_rel_dictionary for attr_dn in attr_rel_dictionary: @@ -530,7 +530,7 @@ def gather_jsonld_attributes_relationships(self, model_jsonld: list[dict]) -> di def parse_jsonld_model( self, path_to_data_model: str, - ): + ) -> dict: """Convert raw JSONLD data model to attributes relationship dictionary. Args: path_to_data_model: str, path to JSONLD data model From cc502746033992e966541b84fa3d59e1b46e6381 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 16 Apr 2024 10:04:25 -0700 Subject: [PATCH 006/110] mypy formatted schema file --- schematic/schemas/data_model_parser.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/schematic/schemas/data_model_parser.py b/schematic/schemas/data_model_parser.py index 094123859..c023f0e5c 100644 --- a/schematic/schemas/data_model_parser.py +++ b/schematic/schemas/data_model_parser.py @@ -372,15 +372,17 @@ def convert_entry_to_dn_label( dn_label_dict = self.label_to_dn_dict(model_jsonld=model_jsonld) # Handle if using the display name as the label if isinstance(parsed_rel_entry, list): - parsed_rel_entry = [ + dn_label: Union[str, list, None] = [ dn_label_dict.get(entry) if dn_label_dict.get(entry) else entry for entry in parsed_rel_entry ] elif isinstance(parsed_rel_entry, str): converted_label = dn_label_dict.get(parsed_rel_entry) if converted_label: - parsed_rel_entry = dn_label_dict.get(parsed_rel_entry) - return parsed_rel_entry + dn_label = dn_label_dict.get(parsed_rel_entry) + else: + dn_label = parsed_rel_entry + return dn_label def gather_jsonld_attributes_relationships(self, model_jsonld: list[dict]) -> dict: """ From 69dfa08f4720dfffa77b95062575a6cfd951423a Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 16 Apr 2024 10:05:50 -0700 Subject: [PATCH 007/110] add schema module to mypy github workfow --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 0663698e6..b9e719dda 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -101,7 +101,7 @@ jobs: # add here when checked # poetry run mypy --install-types --non-interactive # add here when enforced - poetry run mypy --disallow-untyped-defs --install-types --non-interactive schematic/configuration/*.py schematic/exceptions.py schematic/help.py schematic/loader.py schematic/version.py schematic/visualization schematic/utils/ + poetry run mypy --disallow-untyped-defs --install-types --non-interactive schematic/schemas/ schematic/configuration/ schematic/exceptions.py schematic/help.py schematic/loader.py schematic/version.py schematic/visualization schematic/utils/ #---------------------------------------------- # linting From d72407b2af7ffabf720aab447bdf3fae546cec90 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 16 Apr 2024 13:23:10 -0700 Subject: [PATCH 008/110] lint test 
file --- tests/test_store.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/tests/test_store.py b/tests/test_store.py index c2038db76..a90529657 100644 --- a/tests/test_store.py +++ b/tests/test_store.py @@ -214,7 +214,7 @@ def test_get_file_entityIds(self, helpers, synapse_store, only_new_files): assert len(files_and_Ids["entityId"]) == 2 @pytest.mark.parametrize( - "manifest_path, test_annotations, datasetId, manifest_record_type", + "manifest_path, test_annotations, dataset_id, manifest_record_type", [ ( "mock_manifests/annotations_test_manifest.csv", @@ -233,19 +233,19 @@ def test_get_file_entityIds(self, helpers, synapse_store, only_new_files): ) def test_annotation_submission( self, - synapse_store, + synapse_store: SynapseStorage, helpers, - manifest_path, - test_annotations, - datasetId, - manifest_record_type, - config: Configuration, + manifest_path: str, + test_annotations: dict[str, str], + dataset_id: str, + manifest_record_type: str, dmge: DataModelGraphExplorer, ): - manifest_id = synapse_store.associateMetadataWithFiles( + """Test annotation submission""" + synapse_store.associateMetadataWithFiles( dmge=dmge, metadataManifestPath=helpers.get_data_path(manifest_path), - datasetId=datasetId, + datasetId=dataset_id, manifest_record_type=manifest_record_type, hideBlanks=True, restrict_manifest=False, @@ -264,7 +264,7 @@ def test_annotation_submission( assert "CheckRecommended" not in annotations.keys() elif manifest_path.endswith("test_BulkRNAseq.csv"): entity = synapse_store.syn.get(entity_id) - assert type(entity) == File + assert isinstance(entity, File) @pytest.mark.parametrize("force_batch", [True, False], ids=["batch", "non_batch"]) def test_getDatasetAnnotations(self, dataset_id, synapse_store, force_batch): From c0df0e8b848f143dcd6eefb8bc8972645a7b2f16 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 16 Apr 2024 13:37:40 -0700 Subject: [PATCH 009/110] create copies of manifests to alter for test --- tests/test_store.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/tests/test_store.py b/tests/test_store.py index a90529657..96af464f4 100644 --- a/tests/test_store.py +++ b/tests/test_store.py @@ -242,9 +242,14 @@ def test_annotation_submission( dmge: DataModelGraphExplorer, ): """Test annotation submission""" + # Make copy of manifest file in case columns are added to it + full_manifest_path = helpers.get_data_path(manifest_path) + copy_path = f"{full_manifest_path}.copy" + shutil.copyfile(full_manifest_path, copy_path) + synapse_store.associateMetadataWithFiles( dmge=dmge, - metadataManifestPath=helpers.get_data_path(manifest_path), + metadataManifestPath=copy_path, datasetId=dataset_id, manifest_record_type=manifest_record_type, hideBlanks=True, @@ -252,9 +257,12 @@ def test_annotation_submission( ) # Retrive annotations - entity_id = helpers.get_data_frame(manifest_path)["entityId"][0] + entity_id = helpers.get_data_frame(copy_path)["entityId"][0] annotations = synapse_store.getFileAnnotations(entity_id) + # remove file copy + os.remove(copy_path) + # Check annotations of interest for key in test_annotations.keys(): assert key in annotations.keys() From 4405afc3e03098c1d1da660e43084a4755502877 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 16 Apr 2024 13:44:48 -0700 Subject: [PATCH 010/110] fixed spacing --- tests/test_store.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_store.py b/tests/test_store.py index 96af464f4..a962667a9 100644 --- a/tests/test_store.py +++ 
b/tests/test_store.py @@ -490,6 +490,7 @@ def test_get_files_metadata_from_dataset(self, synapse_store): "entityId": ["syn123", "syn456"], } + class TestDatasetFileView: def test_init(self, dataset_id, dataset_fileview, synapse_store): assert dataset_fileview.datasetId == dataset_id From e27028641db90ae42fb4e3f108acfea4e13cdd10 Mon Sep 17 00:00:00 2001 From: Mialy DeFelice <85905780+mialy-defelice@users.noreply.github.com> Date: Wed, 17 Apr 2024 16:02:38 -0700 Subject: [PATCH 011/110] add back code to automatically save the JSON Validation schema --- schematic/schemas/data_model_json_schema.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/schematic/schemas/data_model_json_schema.py b/schematic/schemas/data_model_json_schema.py index 8f50390e4..7b12becd9 100644 --- a/schematic/schemas/data_model_json_schema.py +++ b/schematic/schemas/data_model_json_schema.py @@ -1,5 +1,6 @@ "Data Model Json Schema" +import json import logging import os from typing import Any, Optional @@ -396,4 +397,17 @@ def get_json_validation_schema( prefix_root, prefix_ext = os.path.splitext(prefix) if prefix_ext == ".model": prefix = prefix_root + json_schema_log_file = f"{prefix}.{source_node}.schema.json" + + if json_schema_log_file is None: + logger.info( + "The JSON schema file can be inspected by setting the following " + "nested key in the configuration: (model > location)." + ) + else: + json_schema_dirname = os.path.dirname(json_schema_log_file) + if json_schema_dirname != '': + os.makedirs(json_schema_dirname, exist_ok=True) + with open(json_schema_log_file, "w") as js_f: + json.dump(json_schema, js_f, indent=2) return json_schema From 12ee6cc4c5a1900517a56cbb7c3b348817d16b22 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 18 Apr 2024 08:21:15 -0700 Subject: [PATCH 012/110] move fixture from test file to conftest --- tests/conftest.py | 31 +++++++++++++++++++++++++------ tests/test_metadata.py | 27 +++------------------------ 2 files changed, 28 insertions(+), 30 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 5e19933b3..aaf0440ed 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,18 +1,16 @@ -from multiprocessing.sharedctypes import Value +"""Fixtures and helpers for use across all tests""" import os import logging import sys +from typing import Generator +from pathlib import Path import shutil import pytest -import pandas as pd -from dotenv import load_dotenv, find_dotenv -from time import perf_counter +from dotenv import load_dotenv from schematic.schemas.data_model_parser import DataModelParser from schematic.schemas.data_model_graph import DataModelGraph, DataModelGraphExplorer -from schematic.schemas.data_model_nodes import DataModelNodes -from schematic.schemas.data_model_json_schema import DataModelJSONSchema from schematic.configuration.configuration import CONFIG from schematic.utils.df_utils import load_df @@ -128,3 +126,24 @@ def synapse_store(request): synapse_store = SynapseStorage() yield synapse_store + +@pytest.fixture +def test_bulkrnaseq(helpers: Helpers) -> Generator[Path, None, None]: + """create temporary copy of test_BulkRNAseq.csv + This fixture creates a temporary copy of the original 'test_BulkRNAseq.csv' file + After test, the copied file is removed. 
+ Args: + helpers (Helpers): Helpers fixture + + Yields: + Generator[Path, None, None]: temporary file path of the copied version test_BulkRNAseq.csv + """ + # original bulkrnaseq csv + original_test_path = helpers.get_data_path("mock_manifests/test_BulkRNAseq.csv") + # Copy the original CSV file to a temporary directory + temp_csv_path = helpers.get_data_path("mock_manifests/test_BulkRNAseq2.csv") + shutil.copyfile(original_test_path, temp_csv_path) + yield temp_csv_path + # Teardown + if os.path.exists(temp_csv_path): + os.remove(temp_csv_path) diff --git a/tests/test_metadata.py b/tests/test_metadata.py index 7c1659ae9..11da5699c 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -1,7 +1,8 @@ +"""Tests for Metada class""" + import logging import os -import shutil -from typing import Optional, Generator +from typing import Optional from pathlib import Path from unittest.mock import patch @@ -24,28 +25,6 @@ def metadata_model(helpers, data_model_labels): return metadata_model -@pytest.fixture -def test_bulkrnaseq(helpers: Helpers) -> Generator[Path, None, None]: - """create temporary copy of test_BulkRNAseq.csv - This fixture creates a temporary copy of the original 'test_BulkRNAseq.csv' file - After test, the copied file is removed. - Args: - helpers (Helpers): Helpers fixture - - Yields: - Generator[Path, None, None]: temporary file path of the copied version test_BulkRNAseq.csv - """ - # original bulkrnaseq csv - original_test_path = helpers.get_data_path("mock_manifests/test_BulkRNAseq.csv") - # Copy the original CSV file to a temporary directory - temp_csv_path = helpers.get_data_path("mock_manifests/test_BulkRNAseq2.csv") - shutil.copyfile(original_test_path, temp_csv_path) - yield temp_csv_path - # Teardown - if os.path.exists(temp_csv_path): - os.remove(temp_csv_path) - - class TestMetadataModel: @pytest.mark.parametrize("as_graph", [True, False], ids=["as_graph", "as_list"]) @pytest.mark.parametrize( From b66e4fbefc658da3f8a602afa1018351cf0f8825 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 18 Apr 2024 08:37:38 -0700 Subject: [PATCH 013/110] test not uses fixtures instead of making its own copies of manifests --- tests/conftest.py | 30 +++++++++++++++++++++++++++--- tests/test_store.py | 23 +++++++++++++---------- 2 files changed, 40 insertions(+), 13 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index aaf0440ed..e093e3ad9 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -3,7 +3,6 @@ import logging import sys from typing import Generator -from pathlib import Path import shutil import pytest @@ -127,8 +126,13 @@ def synapse_store(request): yield synapse_store -@pytest.fixture -def test_bulkrnaseq(helpers: Helpers) -> Generator[Path, None, None]: + +# These fixtures make copies of existing test manifests. +# These copies can the be altered by a given test, and the copy will eb destroyed at the +# end of the test + +@pytest.fixture(scope="function") +def test_bulkrnaseq(helpers: Helpers) -> Generator[str, None, None]: """create temporary copy of test_BulkRNAseq.csv This fixture creates a temporary copy of the original 'test_BulkRNAseq.csv' file After test, the copied file is removed. 
@@ -147,3 +151,23 @@ def test_bulkrnaseq(helpers: Helpers) -> Generator[Path, None, None]: # Teardown if os.path.exists(temp_csv_path): os.remove(temp_csv_path) + + +@pytest.fixture(scope="function") +def test_annotations_manifest(helpers: Helpers) -> Generator[str, None, None]: + """ + Create temporary copy of annotations_test_manifest.csv + This fixture creates a temporary copy of the original 'test_BulkRNAseq.csv' file + After test, the copied file is removed. + Args: + helpers (Helpers): Helpers fixture + + Yields: + Generator[Path, None, None]: temporary file path of the copied manifest + """ + original_test_path = helpers.get_data_path("mock_manifests/annotations_test_manifest.csv") + temp_csv_path = helpers.get_data_path("mock_manifests/annotations_test_manifest2.csv") + shutil.copyfile(original_test_path, temp_csv_path) + yield temp_csv_path + if os.path.exists(temp_csv_path): + os.remove(temp_csv_path) diff --git a/tests/test_store.py b/tests/test_store.py index a962667a9..4b1ae6eb3 100644 --- a/tests/test_store.py +++ b/tests/test_store.py @@ -214,16 +214,16 @@ def test_get_file_entityIds(self, helpers, synapse_store, only_new_files): assert len(files_and_Ids["entityId"]) == 2 @pytest.mark.parametrize( - "manifest_path, test_annotations, dataset_id, manifest_record_type", + "manifest, test_annotations, dataset_id, manifest_record_type", [ ( - "mock_manifests/annotations_test_manifest.csv", + "annotations_manifest", {"CheckInt": "7", "CheckList": "valid, list, values"}, "syn34295552", "file_and_entities", ), ( - "mock_manifests/test_BulkRNAseq.csv", + "bulk_rna_seq_manifest", {"FileFormat": "BAM", "GenomeBuild": "GRCh38"}, "syn39241199", "table_and_file", @@ -235,17 +235,20 @@ def test_annotation_submission( self, synapse_store: SynapseStorage, helpers, - manifest_path: str, + manifest: str, test_annotations: dict[str, str], dataset_id: str, manifest_record_type: str, dmge: DataModelGraphExplorer, + test_bulkrnaseq: str, + test_annotations_manifest: str, ): """Test annotation submission""" - # Make copy of manifest file in case columns are added to it - full_manifest_path = helpers.get_data_path(manifest_path) - copy_path = f"{full_manifest_path}.copy" - shutil.copyfile(full_manifest_path, copy_path) + if manifest == "annotations_manifest": + copy_path = test_annotations_manifest + else: + copy_path = test_bulkrnaseq + synapse_store.associateMetadataWithFiles( dmge=dmge, @@ -268,9 +271,9 @@ def test_annotation_submission( assert key in annotations.keys() assert annotations[key] == test_annotations[key] - if manifest_path.endswith("annotations_test_manifest.csv"): + if copy_path.endswith("annotations_test_manifest.csv"): assert "CheckRecommended" not in annotations.keys() - elif manifest_path.endswith("test_BulkRNAseq.csv"): + elif copy_path.endswith("test_BulkRNAseq.csv"): entity = synapse_store.syn.get(entity_id) assert isinstance(entity, File) From acbe03de548ba830d71be8e4f51e02c5e50f5170 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 18 Apr 2024 08:39:37 -0700 Subject: [PATCH 014/110] fix typo in docstring --- tests/test_metadata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_metadata.py b/tests/test_metadata.py index 11da5699c..45e4dd711 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -1,4 +1,4 @@ -"""Tests for Metada class""" +"""Tests for Metadata class""" import logging import os From f85a05ad38b78fb72743377c07ec2dc78a807b30 Mon Sep 17 00:00:00 2001 From: Mialy DeFelice 
<85905780+mialy-defelice@users.noreply.github.com> Date: Thu, 18 Apr 2024 09:25:41 -0700 Subject: [PATCH 015/110] run black --- schematic/schemas/data_model_json_schema.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/schematic/schemas/data_model_json_schema.py b/schematic/schemas/data_model_json_schema.py index 7b12becd9..ed7688f4c 100644 --- a/schematic/schemas/data_model_json_schema.py +++ b/schematic/schemas/data_model_json_schema.py @@ -406,7 +406,7 @@ def get_json_validation_schema( ) else: json_schema_dirname = os.path.dirname(json_schema_log_file) - if json_schema_dirname != '': + if json_schema_dirname != "": os.makedirs(json_schema_dirname, exist_ok=True) with open(json_schema_log_file, "w") as js_f: json.dump(json_schema, js_f, indent=2) From 1705b4eaf5fff8378b869d880679bbaee7166d5e Mon Sep 17 00:00:00 2001 From: Mialy DeFelice <85905780+mialy-defelice@users.noreply.github.com> Date: Thu, 18 Apr 2024 11:25:25 -0700 Subject: [PATCH 016/110] add encoding to satisfy pylinting --- schematic/schemas/data_model_json_schema.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/schematic/schemas/data_model_json_schema.py b/schematic/schemas/data_model_json_schema.py index ed7688f4c..1a1f88977 100644 --- a/schematic/schemas/data_model_json_schema.py +++ b/schematic/schemas/data_model_json_schema.py @@ -408,6 +408,6 @@ def get_json_validation_schema( json_schema_dirname = os.path.dirname(json_schema_log_file) if json_schema_dirname != "": os.makedirs(json_schema_dirname, exist_ok=True) - with open(json_schema_log_file, "w") as js_f: + with open(json_schema_log_file, "w", encoding="UTF-8") as js_f: json.dump(json_schema, js_f, indent=2) return json_schema From 3fb3254080f9448af12be1cefe9e6d968ce3e99e Mon Sep 17 00:00:00 2001 From: Mialy DeFelice <85905780+mialy-defelice@users.noreply.github.com> Date: Mon, 22 Apr 2024 10:28:06 -0700 Subject: [PATCH 017/110] move logic to create JSON Schema log file name to a helper, so it can be used by tests --- schematic/schemas/data_model_json_schema.py | 15 +++++---------- schematic/utils/schema_utils.py | 10 ++++++++++ 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/schematic/schemas/data_model_json_schema.py b/schematic/schemas/data_model_json_schema.py index 1a1f88977..1b6c0e5ad 100644 --- a/schematic/schemas/data_model_json_schema.py +++ b/schematic/schemas/data_model_json_schema.py @@ -10,6 +10,7 @@ from schematic.schemas.data_model_graph import DataModelGraphExplorer from schematic.schemas.data_model_relationships import DataModelRelationships from schematic.utils.validate_utils import rule_in_rule_list +from schematic.utils.schema_utils import get_json_schema_log_file_name logger = logging.getLogger(__name__) @@ -392,22 +393,16 @@ def get_json_validation_schema( # If no config value and SchemaGenerator was initialized with # a JSON-LD path, construct if self.jsonld_path is not None: - self.jsonld_path_root, _ = os.path.splitext(self.jsonld_path) - prefix = self.jsonld_path_root - prefix_root, prefix_ext = os.path.splitext(prefix) - if prefix_ext == ".model": - prefix = prefix_root - json_schema_log_file = f"{prefix}.{source_node}.schema.json" - - if json_schema_log_file is None: + json_schema_log_file_name = get_json_schema_log_file_name(data_model_path=self.jsonld_path, source_node=source_node) + if json_schema_log_file_name is None: logger.info( "The JSON schema file can be inspected by setting the following " "nested key in the configuration: (model > location)." 
) else: - json_schema_dirname = os.path.dirname(json_schema_log_file) + json_schema_dirname = os.path.dirname(json_schema_log_file_name) if json_schema_dirname != "": os.makedirs(json_schema_dirname, exist_ok=True) - with open(json_schema_log_file, "w", encoding="UTF-8") as js_f: + with open(json_schema_log_file_name, "w", encoding="UTF-8") as js_f: json.dump(json_schema, js_f, indent=2) return json_schema diff --git a/schematic/utils/schema_utils.py b/schematic/utils/schema_utils.py index a4565d420..61a857e30 100644 --- a/schematic/utils/schema_utils.py +++ b/schematic/utils/schema_utils.py @@ -4,6 +4,7 @@ import json import logging +import os import string from typing import Literal, Union, Optional @@ -482,3 +483,12 @@ def strip_context(context_value: str) -> tuple[str, str]: elif "@" in context_value: context, value = context_value.split("@") return context, value + +def get_json_schema_log_file_name(data_model_path: str, source_node: str) -> str: + data_model_path_root, _ = os.path.splitext(data_model_path) + prefix = data_model_path_root + prefix_root, prefix_ext = os.path.splitext(prefix) + if prefix_ext == ".model": + prefix = prefix_root + json_schema_log_file_name = f"{prefix}.{source_node}.schema.json" + return json_schema_log_file_name From bff2f9072d6225ef7a943102335ccdffb7509f2b Mon Sep 17 00:00:00 2001 From: Mialy DeFelice <85905780+mialy-defelice@users.noreply.github.com> Date: Mon, 22 Apr 2024 13:52:35 -0700 Subject: [PATCH 018/110] change parameter name to get_json_schema_log_file_path --- schematic/schemas/data_model_json_schema.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/schematic/schemas/data_model_json_schema.py b/schematic/schemas/data_model_json_schema.py index 1b6c0e5ad..55467a08a 100644 --- a/schematic/schemas/data_model_json_schema.py +++ b/schematic/schemas/data_model_json_schema.py @@ -10,7 +10,7 @@ from schematic.schemas.data_model_graph import DataModelGraphExplorer from schematic.schemas.data_model_relationships import DataModelRelationships from schematic.utils.validate_utils import rule_in_rule_list -from schematic.utils.schema_utils import get_json_schema_log_file_name +from schematic.utils.schema_utils import get_json_schema_log_file_path logger = logging.getLogger(__name__) @@ -393,16 +393,16 @@ def get_json_validation_schema( # If no config value and SchemaGenerator was initialized with # a JSON-LD path, construct if self.jsonld_path is not None: - json_schema_log_file_name = get_json_schema_log_file_name(data_model_path=self.jsonld_path, source_node=source_node) - if json_schema_log_file_name is None: + json_schema_log_file_path = get_json_schema_log_file_path(data_model_path=self.jsonld_path, source_node=source_node) + if json_schema_log_file_path is None: logger.info( "The JSON schema file can be inspected by setting the following " "nested key in the configuration: (model > location)." 
) else: - json_schema_dirname = os.path.dirname(json_schema_log_file_name) + json_schema_dirname = os.path.dirname(json_schema_log_file_path) if json_schema_dirname != "": os.makedirs(json_schema_dirname, exist_ok=True) - with open(json_schema_log_file_name, "w", encoding="UTF-8") as js_f: + with open(json_schema_log_file_path, "w", encoding="UTF-8") as js_f: json.dump(json_schema, js_f, indent=2) return json_schema From 0dedc003ca6ac86a2fd2156fe4d43ad7984c1523 Mon Sep 17 00:00:00 2001 From: Mialy DeFelice <85905780+mialy-defelice@users.noreply.github.com> Date: Mon, 22 Apr 2024 13:53:02 -0700 Subject: [PATCH 019/110] update param name in schema_utils too --- schematic/utils/schema_utils.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/schematic/utils/schema_utils.py b/schematic/utils/schema_utils.py index 61a857e30..a83917dbf 100644 --- a/schematic/utils/schema_utils.py +++ b/schematic/utils/schema_utils.py @@ -484,11 +484,18 @@ def strip_context(context_value: str) -> tuple[str, str]: context, value = context_value.split("@") return context, value -def get_json_schema_log_file_name(data_model_path: str, source_node: str) -> str: +def get_json_schema_log_file_path(data_model_path: str, source_node: str) -> str: + """ Get json schema log file name from the data_mdoel_path + Args: + data_model_path: str, path to the data model + source_node: str, root node to create the JSON schema for + Returns: + json_schema_log_file_path: str, file name for the log file + """ data_model_path_root, _ = os.path.splitext(data_model_path) prefix = data_model_path_root prefix_root, prefix_ext = os.path.splitext(prefix) if prefix_ext == ".model": prefix = prefix_root - json_schema_log_file_name = f"{prefix}.{source_node}.schema.json" - return json_schema_log_file_name + json_schema_log_file_path = f"{prefix}.{source_node}.schema.json" + return json_schema_log_file_path From 00d7d68911159d3fdbc3651f9e48a05a59bfb119 Mon Sep 17 00:00:00 2001 From: Mialy DeFelice <85905780+mialy-defelice@users.noreply.github.com> Date: Mon, 22 Apr 2024 13:54:31 -0700 Subject: [PATCH 020/110] test that the file can be made properly --- tests/test_schemas.py | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/tests/test_schemas.py b/tests/test_schemas.py index f54e82351..0b8114da1 100644 --- a/tests/test_schemas.py +++ b/tests/test_schemas.py @@ -18,7 +18,8 @@ get_attribute_display_name_from_label, convert_bool_to_str, parse_validation_rules, - DisplayLabelType + DisplayLabelType, + get_json_schema_log_file_path ) from schematic.utils.io_utils import load_json @@ -877,10 +878,7 @@ def test_generate_node_dict( # Check that the display name matches the label if data_model_labels == "display_label": - try: - assert node_display_name == node_dict["label"] - except: - breakpoint() + assert node_display_name == node_dict["label"] def test_generate_node(self, helpers, data_model): # Test adding a dummy node @@ -1186,6 +1184,7 @@ def test_get_non_blank_schema(self, helpers, data_model, node_name): @pytest.mark.parametrize("blank", [True, False], ids=["True_blank", "False_blank"]) def test_get_range_schema(self, helpers, data_model, node_range, node_name, blank): dmjs = get_data_model_json_schema(helpers=helpers, data_model_name=data_model) + range_schema = dmjs.get_range_schema( node_range=node_range, node_name=node_name, blank=blank ) @@ -1214,6 +1213,16 @@ def test_get_json_validation_schema( ): dmjs = get_data_model_json_schema(helpers=helpers, 
data_model_name=data_model) + data_model_path = helpers.get_data_path(path=data_model) + json_schema_log_file_path = get_json_schema_log_file_path( + data_model_path=data_model_path, + source_node=source_node) + + # Remove json schema log file if it already exists. + if os.path.exists(json_schema_log_file_path): + os.remove(json_schema_log_file_path) + assert os.path.exists(json_schema_log_file_path) == False + try: # Get validation schema json_validation_schema = dmjs.get_json_validation_schema( @@ -1239,6 +1248,13 @@ def test_get_json_validation_schema( # Check contents of validation schema assert "Diagnosis" in json_validation_schema["properties"] assert "Cancer" in json_validation_schema["properties"]["Diagnosis"]["enum"] + + # Check that log file is saved + assert os.path.exists(json_schema_log_file_path) == True + + # Remove the log file that was created. + os.remove(json_schema_log_file_path) + except: # Should only fail if no source node is provided. assert source_node == "" From ac359a2964af533ef1d144e16963872e0f780f69 Mon Sep 17 00:00:00 2001 From: Mialy DeFelice <85905780+mialy-defelice@users.noreply.github.com> Date: Mon, 22 Apr 2024 19:13:22 -0700 Subject: [PATCH 021/110] add test for get_json_schema_log_file_path --- tests/test_utils.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/tests/test_utils.py b/tests/test_utils.py index 2bf23d6fa..a5d14462e 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -65,6 +65,7 @@ parse_validation_rules, extract_component_validation_rules, check_for_duplicate_components, + get_json_schema_log_file_path, ) @@ -161,6 +162,8 @@ "bio_things": {"class": "BioThings", "property": "bioThings"}, } +DATA_MODEL_DICT = {"example.model.csv": "CSV", "example.model.jsonld": "JSONLD"} + test_disk_storage = [ (2, 4000, 16000), (1000, 4000, 16000), @@ -988,6 +991,27 @@ def test_get_label_from_display_name(self, test_dn: str, data_model_labels: str) return return + @pytest.mark.parametrize( + "data_model", + list(DATA_MODEL_DICT.keys()), + ids=list(DATA_MODEL_DICT.values()) + ) + @pytest.mark.parametrize( + "source_node", + ["Biospecimen", "Patient"], + ids=["biospecimen_source", "patient_source"], + ) + def test_get_json_schema_log_file_path(self, helpers, data_model:str, source_node: str): + data_model_path = helpers.get_data_path(path=data_model) + json_schema_log_file_path = get_json_schema_log_file_path( + data_model_path=data_model_path, + source_node=source_node) + + # Check that model is not included in the json_schema_log_file_path + assert '.model' not in "data_model" + + # Check the file suffixs are what is expected. 
+ assert ['schema', 'json'] == json_schema_log_file_path.split('.')[-2:] class TestValidateUtils: def test_validate_schema(self, helpers): From d8947c67eb9235279a886b9fa801c087e3dfc142 Mon Sep 17 00:00:00 2001 From: Mialy DeFelice <85905780+mialy-defelice@users.noreply.github.com> Date: Mon, 22 Apr 2024 19:15:49 -0700 Subject: [PATCH 022/110] run black --- schematic/schemas/data_model_json_schema.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/schematic/schemas/data_model_json_schema.py b/schematic/schemas/data_model_json_schema.py index 55467a08a..1bafd7b90 100644 --- a/schematic/schemas/data_model_json_schema.py +++ b/schematic/schemas/data_model_json_schema.py @@ -393,7 +393,9 @@ def get_json_validation_schema( # If no config value and SchemaGenerator was initialized with # a JSON-LD path, construct if self.jsonld_path is not None: - json_schema_log_file_path = get_json_schema_log_file_path(data_model_path=self.jsonld_path, source_node=source_node) + json_schema_log_file_path = get_json_schema_log_file_path( + data_model_path=self.jsonld_path, source_node=source_node + ) if json_schema_log_file_path is None: logger.info( "The JSON schema file can be inspected by setting the following " From 7e5144ce1311fc43ddbc3255ed74a86761b228a4 Mon Sep 17 00:00:00 2001 From: Mialy DeFelice <85905780+mialy-defelice@users.noreply.github.com> Date: Mon, 22 Apr 2024 19:23:16 -0700 Subject: [PATCH 023/110] run black on schema_utils --- schematic/utils/schema_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/schematic/utils/schema_utils.py b/schematic/utils/schema_utils.py index a83917dbf..bbf34940f 100644 --- a/schematic/utils/schema_utils.py +++ b/schematic/utils/schema_utils.py @@ -484,8 +484,9 @@ def strip_context(context_value: str) -> tuple[str, str]: context, value = context_value.split("@") return context, value + def get_json_schema_log_file_path(data_model_path: str, source_node: str) -> str: - """ Get json schema log file name from the data_mdoel_path + """Get json schema log file name from the data_mdoel_path Args: data_model_path: str, path to the data model source_node: str, root node to create the JSON schema for From 98e8e6b84b49f5b96daa0730519b3009d94d84e4 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 30 Apr 2024 12:04:23 -0700 Subject: [PATCH 024/110] added pydantic to toml file --- poetry.lock | 13 ++++++++++++- pyproject.toml | 1 + 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/poetry.lock b/poetry.lock index 7314c46cb..3d1de5b92 100644 --- a/poetry.lock +++ b/poetry.lock @@ -3295,6 +3295,7 @@ files = [ {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:69b023b2b4daa7548bcfbd4aa3da05b3a74b772db9e23b982788168117739938"}, {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:81e0b275a9ecc9c0c0c07b4b90ba548307583c125f54d5b6946cfee6360c733d"}, {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba336e390cd8e4d1739f42dfe9bb83a3cc2e80f567d8805e11b46f4a943f5515"}, + {file = "PyYAML-6.0.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:326c013efe8048858a6d312ddd31d56e468118ad4cdeda36c719bf5bb6192290"}, {file = "PyYAML-6.0.1-cp310-cp310-win32.whl", hash = "sha256:bd4af7373a854424dabd882decdc5579653d7868b8fb26dc7d0e99f823aa5924"}, {file = "PyYAML-6.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d"}, {file = 
"PyYAML-6.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6965a7bc3cf88e5a1c3bd2e0b5c22f8d677dc88a455344035f03399034eb3007"}, @@ -3302,8 +3303,16 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:42f8152b8dbc4fe7d96729ec2b99c7097d656dc1213a3229ca5383f973a5ed6d"}, {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:062582fca9fabdd2c8b54a3ef1c978d786e0f6b3a1510e0ac93ef59e0ddae2bc"}, {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d2b04aac4d386b172d5b9692e2d2da8de7bfb6c387fa4f801fbf6fb2e6ba4673"}, + {file = "PyYAML-6.0.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e7d73685e87afe9f3b36c799222440d6cf362062f78be1013661b00c5c6f678b"}, {file = "PyYAML-6.0.1-cp311-cp311-win32.whl", hash = "sha256:1635fd110e8d85d55237ab316b5b011de701ea0f29d07611174a1b42f1444741"}, {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, + {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, + {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, + {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"}, + {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, + {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, + {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, + {file = "PyYAML-6.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:0d3304d8c0adc42be59c5f8a4d9e3d7379e6955ad754aa9d6ab7a398b59dd1df"}, {file = "PyYAML-6.0.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:50550eb667afee136e9a77d6dc71ae76a44df8b3e51e41b77f6de2932bfe0f47"}, {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1fe35611261b29bd1de0070f0b2f47cb6ff71fa6595c077e42bd0c419fa27b98"}, {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:704219a11b772aea0d8ecd7058d0082713c3562b4e271b849ad7dc4a5c90c13c"}, @@ -3320,6 +3329,7 @@ files = [ {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a0cd17c15d3bb3fa06978b4e8958dcdc6e0174ccea823003a106c7d4d7899ac5"}, {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:28c119d996beec18c05208a8bd78cbe4007878c6dd15091efb73a30e90539696"}, {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7e07cbde391ba96ab58e532ff4803f79c4129397514e1413a7dc761ccd755735"}, + {file = "PyYAML-6.0.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:49a183be227561de579b4a36efbb21b3eab9651dd81b1858589f796549873dd6"}, {file = "PyYAML-6.0.1-cp38-cp38-win32.whl", hash = "sha256:184c5108a2aca3c5b3d3bf9395d50893a7ab82a38004c8f61c258d4428e80206"}, {file = "PyYAML-6.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:1e2722cc9fbb45d9b87631ac70924c11d3a401b2d7f410cc0e3bbf249f2dca62"}, {file = 
"PyYAML-6.0.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9eb6caa9a297fc2c2fb8862bc5370d0303ddba53ba97e71f08023b6cd73d16a8"}, @@ -3327,6 +3337,7 @@ files = [ {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5773183b6446b2c99bb77e77595dd486303b4faab2b086e7b17bc6bef28865f6"}, {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b786eecbdf8499b9ca1d697215862083bd6d2a99965554781d0d8d1ad31e13a0"}, {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc1bf2925a1ecd43da378f4db9e4f799775d6367bdb94671027b73b393a7c42c"}, + {file = "PyYAML-6.0.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5"}, {file = "PyYAML-6.0.1-cp39-cp39-win32.whl", hash = "sha256:faca3bdcf85b2fc05d06ff3fbc1f83e1391b3e724afa3feba7d13eeab355484c"}, {file = "PyYAML-6.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:510c9deebc5c0225e8c96813043e62b680ba2f9c50a08d3724c7f28a747d1486"}, {file = "PyYAML-6.0.1.tar.gz", hash = "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43"}, @@ -4807,4 +4818,4 @@ aws = ["uWSGI"] [metadata] lock-version = "2.0" python-versions = ">=3.9.0,<3.11" -content-hash = "a5d0f2dd9e8e4346048dd8f2cd798a092d375714a5a3e1c7d86cfaf23d6e96fc" +content-hash = "798fe0f7a2436eea45112b9d14c269388ec7a5d8f0e3cd70def839199321adc2" diff --git a/pyproject.toml b/pyproject.toml index 7c2bb991c..c4ced5a65 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -69,6 +69,7 @@ schematic-db = {version = "0.0.41", extras = ["synapse"]} pyopenssl = {version = "^23.0.0", optional = true} typing-extensions = "<4.6.0" dataclasses-json = "^0.6.1" +pydantic = "^1.10.4" connexion = {extras = ["swagger-ui"], version = "^2.8.0", optional = true} Flask = {version = "2.1.3", optional = true} Flask-Cors = {version = "^3.0.10", optional = true} From 214b9e03b4c0b331dc5ec3acfe2c83f8e5375e14 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 3 May 2024 09:35:14 -0700 Subject: [PATCH 025/110] update pandas and numpy --- poetry.lock | 75 +++++++++++++++++++++++++++++--------------------- pyproject.toml | 4 +-- 2 files changed, 45 insertions(+), 34 deletions(-) diff --git a/poetry.lock b/poetry.lock index 7314c46cb..9d3eb0776 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2605,44 +2605,44 @@ doc = ["mkdocs-material"] [[package]] name = "pandas" -version = "2.2.1" +version = "2.2.2" description = "Powerful data structures for data analysis, time series, and statistics" optional = false python-versions = ">=3.9" files = [ - {file = "pandas-2.2.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:8df8612be9cd1c7797c93e1c5df861b2ddda0b48b08f2c3eaa0702cf88fb5f88"}, - {file = "pandas-2.2.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:0f573ab277252ed9aaf38240f3b54cfc90fff8e5cab70411ee1d03f5d51f3944"}, - {file = "pandas-2.2.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f02a3a6c83df4026e55b63c1f06476c9aa3ed6af3d89b4f04ea656ccdaaaa359"}, - {file = "pandas-2.2.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c38ce92cb22a4bea4e3929429aa1067a454dcc9c335799af93ba9be21b6beb51"}, - {file = "pandas-2.2.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:c2ce852e1cf2509a69e98358e8458775f89599566ac3775e70419b98615f4b06"}, - {file = "pandas-2.2.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:53680dc9b2519cbf609c62db3ed7c0b499077c7fefda564e330286e619ff0dd9"}, - {file 
= "pandas-2.2.1-cp310-cp310-win_amd64.whl", hash = "sha256:94e714a1cca63e4f5939cdce5f29ba8d415d85166be3441165edd427dc9f6bc0"}, - {file = "pandas-2.2.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f821213d48f4ab353d20ebc24e4faf94ba40d76680642fb7ce2ea31a3ad94f9b"}, - {file = "pandas-2.2.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c70e00c2d894cb230e5c15e4b1e1e6b2b478e09cf27cc593a11ef955b9ecc81a"}, - {file = "pandas-2.2.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e97fbb5387c69209f134893abc788a6486dbf2f9e511070ca05eed4b930b1b02"}, - {file = "pandas-2.2.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:101d0eb9c5361aa0146f500773395a03839a5e6ecde4d4b6ced88b7e5a1a6403"}, - {file = "pandas-2.2.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:7d2ed41c319c9fb4fd454fe25372028dfa417aacb9790f68171b2e3f06eae8cd"}, - {file = "pandas-2.2.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:af5d3c00557d657c8773ef9ee702c61dd13b9d7426794c9dfeb1dc4a0bf0ebc7"}, - {file = "pandas-2.2.1-cp311-cp311-win_amd64.whl", hash = "sha256:06cf591dbaefb6da9de8472535b185cba556d0ce2e6ed28e21d919704fef1a9e"}, - {file = "pandas-2.2.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:88ecb5c01bb9ca927ebc4098136038519aa5d66b44671861ffab754cae75102c"}, - {file = "pandas-2.2.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:04f6ec3baec203c13e3f8b139fb0f9f86cd8c0b94603ae3ae8ce9a422e9f5bee"}, - {file = "pandas-2.2.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a935a90a76c44fe170d01e90a3594beef9e9a6220021acfb26053d01426f7dc2"}, - {file = "pandas-2.2.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c391f594aae2fd9f679d419e9a4d5ba4bce5bb13f6a989195656e7dc4b95c8f0"}, - {file = "pandas-2.2.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:9d1265545f579edf3f8f0cb6f89f234f5e44ba725a34d86535b1a1d38decbccc"}, - {file = "pandas-2.2.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:11940e9e3056576ac3244baef2fedade891977bcc1cb7e5cc8f8cc7d603edc89"}, - {file = "pandas-2.2.1-cp312-cp312-win_amd64.whl", hash = "sha256:4acf681325ee1c7f950d058b05a820441075b0dd9a2adf5c4835b9bc056bf4fb"}, - {file = "pandas-2.2.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9bd8a40f47080825af4317d0340c656744f2bfdb6819f818e6ba3cd24c0e1397"}, - {file = "pandas-2.2.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:df0c37ebd19e11d089ceba66eba59a168242fc6b7155cba4ffffa6eccdfb8f16"}, - {file = "pandas-2.2.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:739cc70eaf17d57608639e74d63387b0d8594ce02f69e7a0b046f117974b3019"}, - {file = "pandas-2.2.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f9d3558d263073ed95e46f4650becff0c5e1ffe0fc3a015de3c79283dfbdb3df"}, - {file = "pandas-2.2.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:4aa1d8707812a658debf03824016bf5ea0d516afdea29b7dc14cf687bc4d4ec6"}, - {file = "pandas-2.2.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:76f27a809cda87e07f192f001d11adc2b930e93a2b0c4a236fde5429527423be"}, - {file = "pandas-2.2.1-cp39-cp39-win_amd64.whl", hash = "sha256:1ba21b1d5c0e43416218db63037dbe1a01fc101dc6e6024bcad08123e48004ab"}, - {file = "pandas-2.2.1.tar.gz", hash = "sha256:0ab90f87093c13f3e8fa45b48ba9f39181046e8f3317d3aadb2fffbb1b978572"}, + {file = "pandas-2.2.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:90c6fca2acf139569e74e8781709dccb6fe25940488755716d1d354d6bc58bce"}, + {file = 
"pandas-2.2.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c7adfc142dac335d8c1e0dcbd37eb8617eac386596eb9e1a1b77791cf2498238"}, + {file = "pandas-2.2.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4abfe0be0d7221be4f12552995e58723c7422c80a659da13ca382697de830c08"}, + {file = "pandas-2.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8635c16bf3d99040fdf3ca3db669a7250ddf49c55dc4aa8fe0ae0fa8d6dcc1f0"}, + {file = "pandas-2.2.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:40ae1dffb3967a52203105a077415a86044a2bea011b5f321c6aa64b379a3f51"}, + {file = "pandas-2.2.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8e5a0b00e1e56a842f922e7fae8ae4077aee4af0acb5ae3622bd4b4c30aedf99"}, + {file = "pandas-2.2.2-cp310-cp310-win_amd64.whl", hash = "sha256:ddf818e4e6c7c6f4f7c8a12709696d193976b591cc7dc50588d3d1a6b5dc8772"}, + {file = "pandas-2.2.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:696039430f7a562b74fa45f540aca068ea85fa34c244d0deee539cb6d70aa288"}, + {file = "pandas-2.2.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:8e90497254aacacbc4ea6ae5e7a8cd75629d6ad2b30025a4a8b09aa4faf55151"}, + {file = "pandas-2.2.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:58b84b91b0b9f4bafac2a0ac55002280c094dfc6402402332c0913a59654ab2b"}, + {file = "pandas-2.2.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6d2123dc9ad6a814bcdea0f099885276b31b24f7edf40f6cdbc0912672e22eee"}, + {file = "pandas-2.2.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:2925720037f06e89af896c70bca73459d7e6a4be96f9de79e2d440bd499fe0db"}, + {file = "pandas-2.2.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:0cace394b6ea70c01ca1595f839cf193df35d1575986e484ad35c4aeae7266c1"}, + {file = "pandas-2.2.2-cp311-cp311-win_amd64.whl", hash = "sha256:873d13d177501a28b2756375d59816c365e42ed8417b41665f346289adc68d24"}, + {file = "pandas-2.2.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:9dfde2a0ddef507a631dc9dc4af6a9489d5e2e740e226ad426a05cabfbd7c8ef"}, + {file = "pandas-2.2.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:e9b79011ff7a0f4b1d6da6a61aa1aa604fb312d6647de5bad20013682d1429ce"}, + {file = "pandas-2.2.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1cb51fe389360f3b5a4d57dbd2848a5f033350336ca3b340d1c53a1fad33bcad"}, + {file = "pandas-2.2.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eee3a87076c0756de40b05c5e9a6069c035ba43e8dd71c379e68cab2c20f16ad"}, + {file = "pandas-2.2.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:3e374f59e440d4ab45ca2fffde54b81ac3834cf5ae2cdfa69c90bc03bde04d76"}, + {file = "pandas-2.2.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:43498c0bdb43d55cb162cdc8c06fac328ccb5d2eabe3cadeb3529ae6f0517c32"}, + {file = "pandas-2.2.2-cp312-cp312-win_amd64.whl", hash = "sha256:d187d355ecec3629624fccb01d104da7d7f391db0311145817525281e2804d23"}, + {file = "pandas-2.2.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:0ca6377b8fca51815f382bd0b697a0814c8bda55115678cbc94c30aacbb6eff2"}, + {file = "pandas-2.2.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9057e6aa78a584bc93a13f0a9bf7e753a5e9770a30b4d758b8d5f2a62a9433cd"}, + {file = "pandas-2.2.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:001910ad31abc7bf06f49dcc903755d2f7f3a9186c0c040b827e522e9cef0863"}, + {file = "pandas-2.2.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:66b479b0bd07204e37583c191535505410daa8df638fd8e75ae1b383851fe921"}, + {file = "pandas-2.2.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:a77e9d1c386196879aa5eb712e77461aaee433e54c68cf253053a73b7e49c33a"}, + {file = "pandas-2.2.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:92fd6b027924a7e178ac202cfbe25e53368db90d56872d20ffae94b96c7acc57"}, + {file = "pandas-2.2.2-cp39-cp39-win_amd64.whl", hash = "sha256:640cef9aa381b60e296db324337a554aeeb883ead99dc8f6c18e81a93942f5f4"}, + {file = "pandas-2.2.2.tar.gz", hash = "sha256:9e79019aba43cb4fda9e4d983f8e88ca0373adbb697ae9c6c43093218de28b54"}, ] [package.dependencies] -numpy = {version = ">=1.22.4,<2", markers = "python_version < \"3.11\""} +numpy = {version = ">=1.22.4", markers = "python_version < \"3.11\""} python-dateutil = ">=2.8.2" pytz = ">=2020.1" tzdata = ">=2022.7" @@ -3295,6 +3295,7 @@ files = [ {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:69b023b2b4daa7548bcfbd4aa3da05b3a74b772db9e23b982788168117739938"}, {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:81e0b275a9ecc9c0c0c07b4b90ba548307583c125f54d5b6946cfee6360c733d"}, {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba336e390cd8e4d1739f42dfe9bb83a3cc2e80f567d8805e11b46f4a943f5515"}, + {file = "PyYAML-6.0.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:326c013efe8048858a6d312ddd31d56e468118ad4cdeda36c719bf5bb6192290"}, {file = "PyYAML-6.0.1-cp310-cp310-win32.whl", hash = "sha256:bd4af7373a854424dabd882decdc5579653d7868b8fb26dc7d0e99f823aa5924"}, {file = "PyYAML-6.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d"}, {file = "PyYAML-6.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6965a7bc3cf88e5a1c3bd2e0b5c22f8d677dc88a455344035f03399034eb3007"}, @@ -3302,8 +3303,16 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:42f8152b8dbc4fe7d96729ec2b99c7097d656dc1213a3229ca5383f973a5ed6d"}, {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:062582fca9fabdd2c8b54a3ef1c978d786e0f6b3a1510e0ac93ef59e0ddae2bc"}, {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d2b04aac4d386b172d5b9692e2d2da8de7bfb6c387fa4f801fbf6fb2e6ba4673"}, + {file = "PyYAML-6.0.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e7d73685e87afe9f3b36c799222440d6cf362062f78be1013661b00c5c6f678b"}, {file = "PyYAML-6.0.1-cp311-cp311-win32.whl", hash = "sha256:1635fd110e8d85d55237ab316b5b011de701ea0f29d07611174a1b42f1444741"}, {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, + {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, + {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, + {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"}, + {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, + {file = 
"PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, + {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, + {file = "PyYAML-6.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:0d3304d8c0adc42be59c5f8a4d9e3d7379e6955ad754aa9d6ab7a398b59dd1df"}, {file = "PyYAML-6.0.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:50550eb667afee136e9a77d6dc71ae76a44df8b3e51e41b77f6de2932bfe0f47"}, {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1fe35611261b29bd1de0070f0b2f47cb6ff71fa6595c077e42bd0c419fa27b98"}, {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:704219a11b772aea0d8ecd7058d0082713c3562b4e271b849ad7dc4a5c90c13c"}, @@ -3320,6 +3329,7 @@ files = [ {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a0cd17c15d3bb3fa06978b4e8958dcdc6e0174ccea823003a106c7d4d7899ac5"}, {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:28c119d996beec18c05208a8bd78cbe4007878c6dd15091efb73a30e90539696"}, {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7e07cbde391ba96ab58e532ff4803f79c4129397514e1413a7dc761ccd755735"}, + {file = "PyYAML-6.0.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:49a183be227561de579b4a36efbb21b3eab9651dd81b1858589f796549873dd6"}, {file = "PyYAML-6.0.1-cp38-cp38-win32.whl", hash = "sha256:184c5108a2aca3c5b3d3bf9395d50893a7ab82a38004c8f61c258d4428e80206"}, {file = "PyYAML-6.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:1e2722cc9fbb45d9b87631ac70924c11d3a401b2d7f410cc0e3bbf249f2dca62"}, {file = "PyYAML-6.0.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9eb6caa9a297fc2c2fb8862bc5370d0303ddba53ba97e71f08023b6cd73d16a8"}, @@ -3327,6 +3337,7 @@ files = [ {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5773183b6446b2c99bb77e77595dd486303b4faab2b086e7b17bc6bef28865f6"}, {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b786eecbdf8499b9ca1d697215862083bd6d2a99965554781d0d8d1ad31e13a0"}, {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc1bf2925a1ecd43da378f4db9e4f799775d6367bdb94671027b73b393a7c42c"}, + {file = "PyYAML-6.0.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5"}, {file = "PyYAML-6.0.1-cp39-cp39-win32.whl", hash = "sha256:faca3bdcf85b2fc05d06ff3fbc1f83e1391b3e724afa3feba7d13eeab355484c"}, {file = "PyYAML-6.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:510c9deebc5c0225e8c96813043e62b680ba2f9c50a08d3724c7f28a747d1486"}, {file = "PyYAML-6.0.1.tar.gz", hash = "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43"}, @@ -4807,4 +4818,4 @@ aws = ["uWSGI"] [metadata] lock-version = "2.0" python-versions = ">=3.9.0,<3.11" -content-hash = "a5d0f2dd9e8e4346048dd8f2cd798a092d375714a5a3e1c7d86cfaf23d6e96fc" +content-hash = "2291eef938a4055ec847dccf5e538406c1bd453c49dd6a60e2c81801a3214714" diff --git a/pyproject.toml b/pyproject.toml index 7c2bb991c..41d7a4766 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -47,9 +47,9 @@ graphviz = "^0.20.0" inflection = "^0.5.1" jsonschema = "^4.0.0" networkx = ">=2.2.8" -numpy = "^1.21.1" +numpy = "^1.26.4" oauth2client = "^4.1.0" # Specified 
because of bug in version ^4.0.0 -pandas = "^2.0.0" +pandas = "^2.2.2" pygsheets = "^2.0.4" PyYAML = "^6.0.0" rdflib = "^6.0.0" From 1f7a8c63e65367b032097e1a76ee6b3089fa0d4e Mon Sep 17 00:00:00 2001 From: linglp Date: Wed, 8 May 2024 11:01:33 -0400 Subject: [PATCH 026/110] add jaeger-client and flask open tracing --- poetry.lock | 86 ++++++++++++++++++++++++++++++++++++++++++++++++-- pyproject.toml | 4 ++- 2 files changed, 87 insertions(+), 3 deletions(-) diff --git a/poetry.lock b/poetry.lock index 7314c46cb..4d23ce634 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1033,6 +1033,24 @@ files = [ Flask = ">=0.9" Six = "*" +[[package]] +name = "flask-opentracing" +version = "2.0.0" +description = "OpenTracing support for Flask applications" +optional = true +python-versions = "*" +files = [ + {file = "Flask-OpenTracing-2.0.0.tar.gz", hash = "sha256:4de9db3d4f0d2b506ce3874fc721278d41b2e8b0125ea567164be0100df502fe"}, + {file = "Flask_OpenTracing-2.0.0-py3-none-any.whl", hash = "sha256:e7086ffb3531a518c6e3bf2b365af4a51e56a0922fdd5ebe91c9ddeeda632e70"}, +] + +[package.dependencies] +Flask = "*" +opentracing = ">=2.0,<3" + +[package.extras] +tests = ["flake8", "flake8-quotes", "mock", "pytest", "pytest-cov", "tox"] + [[package]] name = "fqdn" version = "1.5.1" @@ -1634,6 +1652,25 @@ files = [ {file = "itsdangerous-2.1.2.tar.gz", hash = "sha256:5dbbc68b317e5e42f327f9021763545dc3fc3bfe22e6deb96aaf1fc38874156a"}, ] +[[package]] +name = "jaeger-client" +version = "4.8.0" +description = "Jaeger Python OpenTracing Tracer implementation" +optional = true +python-versions = ">=3.7" +files = [ + {file = "jaeger-client-4.8.0.tar.gz", hash = "sha256:3157836edab8e2c209bd2d6ae61113db36f7ee399e66b1dcbb715d87ab49bfe0"}, +] + +[package.dependencies] +opentracing = ">=2.1,<3.0" +threadloop = ">=1,<2" +thrift = "*" +tornado = ">=4.3" + +[package.extras] +tests = ["codecov", "coverage", "flake8", "flake8-quotes", "flake8-typing-imports", "mock", "mypy", "opentracing_instrumentation (>=3,<4)", "prometheus_client (==0.11.0)", "pycurl", "pytest", "pytest-benchmark[histogram]", "pytest-cov", "pytest-localserver", "pytest-timeout", "pytest-tornado", "tchannel (==2.1.0)"] + [[package]] name = "jedi" version = "0.19.1" @@ -2562,6 +2599,19 @@ files = [ {file = "opentelemetry_semantic_conventions-0.42b0.tar.gz", hash = "sha256:44ae67a0a3252a05072877857e5cc1242c98d4cf12870159f1a94bec800d38ec"}, ] +[[package]] +name = "opentracing" +version = "2.4.0" +description = "OpenTracing API for Python. 
See documentation at http://opentracing.io" +optional = true +python-versions = "*" +files = [ + {file = "opentracing-2.4.0.tar.gz", hash = "sha256:a173117e6ef580d55874734d1fa7ecb6f3655160b8b8974a2a1e98e5ec9c840d"}, +] + +[package.extras] +tests = ["Sphinx", "doubles", "flake8", "flake8-quotes", "gevent", "mock", "pytest", "pytest-cov", "pytest-mock", "six (>=1.10.0,<2.0)", "sphinx_rtd_theme", "tornado"] + [[package]] name = "overrides" version = "7.7.0" @@ -4356,6 +4406,38 @@ docs = ["myst-parser", "pydata-sphinx-theme", "sphinx"] test = ["pre-commit", "pytest (>=7.0)", "pytest-timeout"] typing = ["mypy (>=1.6,<2.0)", "traitlets (>=5.11.1)"] +[[package]] +name = "threadloop" +version = "1.0.2" +description = "Tornado IOLoop Backed Concurrent Futures" +optional = true +python-versions = "*" +files = [ + {file = "threadloop-1.0.2-py2-none-any.whl", hash = "sha256:5c90dbefab6ffbdba26afb4829d2a9df8275d13ac7dc58dccb0e279992679599"}, + {file = "threadloop-1.0.2.tar.gz", hash = "sha256:8b180aac31013de13c2ad5c834819771992d350267bddb854613ae77ef571944"}, +] + +[package.dependencies] +tornado = "*" + +[[package]] +name = "thrift" +version = "0.20.0" +description = "Python bindings for the Apache Thrift RPC system" +optional = true +python-versions = "*" +files = [ + {file = "thrift-0.20.0.tar.gz", hash = "sha256:4dd662eadf6b8aebe8a41729527bd69adf6ceaa2a8681cbef64d1273b3e8feba"}, +] + +[package.dependencies] +six = ">=1.7.2" + +[package.extras] +all = ["tornado (>=4.0)", "twisted"] +tornado = ["tornado (>=4.0)"] +twisted = ["twisted"] + [[package]] name = "tinycss2" version = "1.2.1" @@ -4801,10 +4883,10 @@ docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.link testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-ignore-flaky", "pytest-mypy (>=0.9.1)", "pytest-ruff"] [extras] -api = ["Flask", "Flask-Cors", "Jinja2", "connexion", "pyopenssl"] +api = ["Flask", "Flask-Cors", "Jinja2", "connexion", "flask-opentracing", "jaeger-client", "pyopenssl"] aws = ["uWSGI"] [metadata] lock-version = "2.0" python-versions = ">=3.9.0,<3.11" -content-hash = "a5d0f2dd9e8e4346048dd8f2cd798a092d375714a5a3e1c7d86cfaf23d6e96fc" +content-hash = "e26186e694bba121eca9a26243fd2498fe11297ae77d431522e7c9b0d96ba979" diff --git a/pyproject.toml b/pyproject.toml index 7c2bb991c..da4f71e61 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -74,9 +74,11 @@ Flask = {version = "2.1.3", optional = true} Flask-Cors = {version = "^3.0.10", optional = true} uWSGI = {version = "^2.0.21", optional = true} Jinja2 = {version = ">2.11.3", optional = true} +jaeger-client = {version = "^4.8.0", optional = true} +flask-opentracing = {version="^2.0.0", optional = true} [tool.poetry.extras] -api = ["connexion", "Flask", "Flask-Cors", "Jinja2", "pyopenssl"] +api = ["connexion", "Flask", "Flask-Cors", "Jinja2", "pyopenssl", "jaeger-client", "flask-opentracing"] aws = ["uWSGI"] From f955a25bd59fff6595d96ba471631b4d7d6628cf Mon Sep 17 00:00:00 2001 From: linglp Date: Wed, 8 May 2024 11:44:52 -0400 Subject: [PATCH 027/110] add tracer for /manifest/generate --- schematic_api/api/routes.py | 98 +++++++++++++++++++++++-------------- 1 file changed, 62 insertions(+), 36 deletions(-) diff --git a/schematic_api/api/routes.py b/schematic_api/api/routes.py index fbf36fbf5..d5cf417a7 100644 --- a/schematic_api/api/routes.py +++ b/schematic_api/api/routes.py @@ -40,10 +40,24 @@ ) from 
schematic.utils.general import entity_type_mapping from schematic.utils.schema_utils import get_property_label_from_display_name, DisplayLabelType +from opentelemetry import trace +from opentelemetry.sdk.trace import TracerProvider +from opentelemetry.sdk.resources import SERVICE_NAME, Resource +from opentelemetry.sdk.trace.export import BatchSpanProcessor +from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter logger = logging.getLogger(__name__) logging.basicConfig(level=logging.DEBUG) +trace.set_tracer_provider( + TracerProvider( + resource=Resource(attributes={SERVICE_NAME: "schematic"}) + ) +) +trace.get_tracer_provider().add_span_processor(BatchSpanProcessor(OTLPSpanExporter())) +tracer = trace.get_tracer("schematic-api") + + def config_handler(asset_view: str = None): # check if path to config is provided @@ -283,43 +297,55 @@ def get_manifest_route( Returns: Googlesheet URL (if sheet_url is True), or pandas dataframe (if sheet_url is False). """ + with tracer.start_as_current_span('generate-manifest-route') as span: + span.set_attribute('schema_url', schema_url) + span.set_attribute('use_annotations', use_annotations) + span.set_attribute('dataset_id', dataset_id) + span.set_attribute('asset_view', asset_view) + span.set_attribute('output_format', output_format) + span.set_attribute('title', title) + span.set_attribute('strict_validation', strict_validation) + span.set_attribute('data_model_labels', data_model_labels) + span.set_attribute('data_type', data_type) + + + + # Get access token from request header + access_token = get_access_token() + + config_handler(asset_view=asset_view) + + all_results = ManifestGenerator.create_manifests( + path_to_data_model=schema_url, + output_format=output_format, + data_types=data_type, + title=title, + access_token=access_token, + dataset_ids=dataset_id, + strict=strict_validation, + use_annotations=use_annotations, + data_model_labels=data_model_labels, + ) - # Get access token from request header - access_token = get_access_token() - - config_handler(asset_view=asset_view) - - all_results = ManifestGenerator.create_manifests( - path_to_data_model=schema_url, - output_format=output_format, - data_types=data_type, - title=title, - access_token=access_token, - dataset_ids=dataset_id, - strict=strict_validation, - use_annotations=use_annotations, - data_model_labels=data_model_labels, - ) - - # return an excel file if output_format is set to "excel" - if output_format == "excel": - # should only contain one excel spreadsheet path - if len(all_results) > 0: - result = all_results[0] - dir_name = os.path.dirname(result) - file_name = os.path.basename(result) - mimetype = ( - "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" - ) - return send_from_directory( - directory=dir_name, - path=file_name, - as_attachment=True, - mimetype=mimetype, - max_age=0, - ) - - return all_results + # return an excel file if output_format is set to "excel" + if output_format == "excel": + # should only contain one excel spreadsheet path + if len(all_results) > 0: + result = all_results[0] + dir_name = os.path.dirname(result) + file_name = os.path.basename(result) + mimetype = ( + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" + ) + return send_from_directory( + directory=dir_name, + path=file_name, + as_attachment=True, + mimetype=mimetype, + max_age=0, + ) + + return all_results #####profile validate manifest route function From b62748305e5b58a6af8a0a8afe34c0789c85f135 Mon Sep 17 00:00:00 2001 From: 
linglp Date: Thu, 9 May 2024 17:04:29 -0400 Subject: [PATCH 028/110] update tests to use indirect and fixture properly --- tests/conftest.py | 41 ++++++++--------------------------------- tests/test_store.py | 28 +++++++++------------------- 2 files changed, 17 insertions(+), 52 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index e093e3ad9..8d73650ef 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -132,42 +132,17 @@ def synapse_store(request): # end of the test @pytest.fixture(scope="function") -def test_bulkrnaseq(helpers: Helpers) -> Generator[str, None, None]: - """create temporary copy of test_BulkRNAseq.csv - This fixture creates a temporary copy of the original 'test_BulkRNAseq.csv' file - After test, the copied file is removed. - Args: - helpers (Helpers): Helpers fixture - - Yields: - Generator[Path, None, None]: temporary file path of the copied version test_BulkRNAseq.csv - """ - # original bulkrnaseq csv - original_test_path = helpers.get_data_path("mock_manifests/test_BulkRNAseq.csv") +def temporary_file_copy(request, helpers: Helpers) -> Generator[str, None, None]: + file_name = request.param + # original file copy + original_test_path = helpers.get_data_path(f"mock_manifests/{file_name}") + # get filename without extension + file_name_no_extension=file_name.split(".")[0] # Copy the original CSV file to a temporary directory - temp_csv_path = helpers.get_data_path("mock_manifests/test_BulkRNAseq2.csv") + temp_csv_path = helpers.get_data_path(f"mock_manifests/{file_name_no_extension}_copy.csv") + shutil.copyfile(original_test_path, temp_csv_path) yield temp_csv_path # Teardown if os.path.exists(temp_csv_path): os.remove(temp_csv_path) - - -@pytest.fixture(scope="function") -def test_annotations_manifest(helpers: Helpers) -> Generator[str, None, None]: - """ - Create temporary copy of annotations_test_manifest.csv - This fixture creates a temporary copy of the original 'test_BulkRNAseq.csv' file - After test, the copied file is removed. 
- Args: - helpers (Helpers): Helpers fixture - - Yields: - Generator[Path, None, None]: temporary file path of the copied manifest - """ - original_test_path = helpers.get_data_path("mock_manifests/annotations_test_manifest.csv") - temp_csv_path = helpers.get_data_path("mock_manifests/annotations_test_manifest2.csv") - shutil.copyfile(original_test_path, temp_csv_path) - yield temp_csv_path - if os.path.exists(temp_csv_path): - os.remove(temp_csv_path) diff --git a/tests/test_store.py b/tests/test_store.py index 4b1ae6eb3..06fa4bf23 100644 --- a/tests/test_store.py +++ b/tests/test_store.py @@ -214,45 +214,38 @@ def test_get_file_entityIds(self, helpers, synapse_store, only_new_files): assert len(files_and_Ids["entityId"]) == 2 @pytest.mark.parametrize( - "manifest, test_annotations, dataset_id, manifest_record_type", + "test_annotations, dataset_id, manifest_record_type, temporary_file_copy", [ ( - "annotations_manifest", {"CheckInt": "7", "CheckList": "valid, list, values"}, "syn34295552", "file_and_entities", + "annotations_test_manifest.csv" ), ( - "bulk_rna_seq_manifest", {"FileFormat": "BAM", "GenomeBuild": "GRCh38"}, "syn39241199", "table_and_file", + "test_BulkRNAseq.csv" ), ], ids=["non file-based", "file-based"], + indirect=["temporary_file_copy"] ) def test_annotation_submission( self, synapse_store: SynapseStorage, helpers, - manifest: str, test_annotations: dict[str, str], dataset_id: str, manifest_record_type: str, dmge: DataModelGraphExplorer, - test_bulkrnaseq: str, - test_annotations_manifest: str, + temporary_file_copy: Generator[str, None, None], ): """Test annotation submission""" - if manifest == "annotations_manifest": - copy_path = test_annotations_manifest - else: - copy_path = test_bulkrnaseq - - synapse_store.associateMetadataWithFiles( dmge=dmge, - metadataManifestPath=copy_path, + metadataManifestPath=temporary_file_copy, datasetId=dataset_id, manifest_record_type=manifest_record_type, hideBlanks=True, @@ -260,20 +253,17 @@ def test_annotation_submission( ) # Retrive annotations - entity_id = helpers.get_data_frame(copy_path)["entityId"][0] + entity_id = helpers.get_data_frame(temporary_file_copy)["entityId"][0] annotations = synapse_store.getFileAnnotations(entity_id) - # remove file copy - os.remove(copy_path) - # Check annotations of interest for key in test_annotations.keys(): assert key in annotations.keys() assert annotations[key] == test_annotations[key] - if copy_path.endswith("annotations_test_manifest.csv"): + if temporary_file_copy.endswith("annotations_test_manifest_copy.csv"): assert "CheckRecommended" not in annotations.keys() - elif copy_path.endswith("test_BulkRNAseq.csv"): + elif temporary_file_copy.endswith("test_BulkRNAseq_copy.csv"): entity = synapse_store.syn.get(entity_id) assert isinstance(entity, File) From f06b19a9716984d90fd49e891460e366fdc29a22 Mon Sep 17 00:00:00 2001 From: linglp Date: Thu, 9 May 2024 17:29:15 -0400 Subject: [PATCH 029/110] update to use different fixture --- tests/test_metadata.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/test_metadata.py b/tests/test_metadata.py index 45e4dd711..69c1324fd 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -2,7 +2,7 @@ import logging import os -from typing import Optional +from typing import Optional, Generator from pathlib import Path from unittest.mock import patch @@ -109,9 +109,10 @@ def test_populate_manifest(self, helpers, return_excel, data_model_labels): ids=["data_model_labels-display_label", 
"data_model_labels-class_label"], ) @pytest.mark.parametrize("validate_component", [None, "BulkRNA-seqAssay"]) + @pytest.mark.parametrize("temporary_file_copy", ["test_BulkRNAseq.csv"], indirect=True) def test_submit_metadata_manifest( self, - test_bulkrnaseq: Path, + temporary_file_copy: Generator[str, None, None], helpers: Helpers, file_annotations_upload: bool, restrict_rules: bool, @@ -128,7 +129,7 @@ def test_submit_metadata_manifest( "schematic.store.synapse.SynapseStorage.associateMetadataWithFiles", return_value="mock manifest id", ): - mock_manifest_path = test_bulkrnaseq + mock_manifest_path = temporary_file_copy data_model_jsonld = helpers.get_data_path("example.model.jsonld") mock_manifest_id = meta_data_model.submit_metadata_manifest( manifest_path=mock_manifest_path, From add00c009e252ec11babb49cc5397f7052862436 Mon Sep 17 00:00:00 2001 From: linglp Date: Fri, 10 May 2024 12:26:51 -0400 Subject: [PATCH 030/110] add decorator --- schematic_api/api/routes.py | 103 +++++++++++++++++++----------------- 1 file changed, 55 insertions(+), 48 deletions(-) diff --git a/schematic_api/api/routes.py b/schematic_api/api/routes.py index d5cf417a7..0855db7af 100644 --- a/schematic_api/api/routes.py +++ b/schematic_api/api/routes.py @@ -20,6 +20,7 @@ import pandas as pd import json from typing import Optional +from functools import wraps from schematic.configuration.configuration import CONFIG from schematic.visualization.attributes_explorer import AttributesExplorer @@ -46,6 +47,7 @@ from opentelemetry.sdk.trace.export import BatchSpanProcessor from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter + logger = logging.getLogger(__name__) logging.basicConfig(level=logging.DEBUG) @@ -58,6 +60,23 @@ tracer = trace.get_tracer("schematic-api") +def trace_function_params(): + def decorator(func): + @wraps(func) + def wrapper(**kwargs): + tracer = trace.get_tracer(__name__) + # Start a new span with the function's name + with tracer.start_as_current_span(func.__name__) as span: + # Set values of parameters as tags + for name, value in kwargs.items(): + span.set_attribute(name, value) + # Call the actual function + result = func(**kwargs) + return result + return wrapper + return decorator + + def config_handler(asset_view: str = None): # check if path to config is provided @@ -273,6 +292,7 @@ def get_temp_model_path(schema_url): # @before_request +@trace_function_params() def get_manifest_route( schema_url: str, use_annotations: bool, @@ -297,55 +317,42 @@ def get_manifest_route( Returns: Googlesheet URL (if sheet_url is True), or pandas dataframe (if sheet_url is False). 
""" - with tracer.start_as_current_span('generate-manifest-route') as span: - span.set_attribute('schema_url', schema_url) - span.set_attribute('use_annotations', use_annotations) - span.set_attribute('dataset_id', dataset_id) - span.set_attribute('asset_view', asset_view) - span.set_attribute('output_format', output_format) - span.set_attribute('title', title) - span.set_attribute('strict_validation', strict_validation) - span.set_attribute('data_model_labels', data_model_labels) - span.set_attribute('data_type', data_type) - - - - # Get access token from request header - access_token = get_access_token() - - config_handler(asset_view=asset_view) - - all_results = ManifestGenerator.create_manifests( - path_to_data_model=schema_url, - output_format=output_format, - data_types=data_type, - title=title, - access_token=access_token, - dataset_ids=dataset_id, - strict=strict_validation, - use_annotations=use_annotations, - data_model_labels=data_model_labels, - ) + # Get access token from request header + access_token = get_access_token() + + config_handler(asset_view=asset_view) + + all_results = ManifestGenerator.create_manifests( + path_to_data_model=schema_url, + output_format=output_format, + data_types=data_type, + title=title, + access_token=access_token, + dataset_ids=dataset_id, + strict=strict_validation, + use_annotations=use_annotations, + data_model_labels=data_model_labels, + ) - # return an excel file if output_format is set to "excel" - if output_format == "excel": - # should only contain one excel spreadsheet path - if len(all_results) > 0: - result = all_results[0] - dir_name = os.path.dirname(result) - file_name = os.path.basename(result) - mimetype = ( - "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" - ) - return send_from_directory( - directory=dir_name, - path=file_name, - as_attachment=True, - mimetype=mimetype, - max_age=0, - ) - - return all_results + # return an excel file if output_format is set to "excel" + if output_format == "excel": + # should only contain one excel spreadsheet path + if len(all_results) > 0: + result = all_results[0] + dir_name = os.path.dirname(result) + file_name = os.path.basename(result) + mimetype = ( + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" + ) + return send_from_directory( + directory=dir_name, + path=file_name, + as_attachment=True, + mimetype=mimetype, + max_age=0, + ) + + return all_results #####profile validate manifest route function From bee84acd8c06198e74dd862bad066dbbfbf1c296 Mon Sep 17 00:00:00 2001 From: linglp Date: Fri, 10 May 2024 17:22:58 -0400 Subject: [PATCH 031/110] add tracers --- schematic/manifest/generator.py | 10 ++++++++++ schematic/schemas/data_model_parser.py | 8 +++++++- schematic/store/synapse.py | 19 +++++++++++++++++++ 3 files changed, 36 insertions(+), 1 deletion(-) diff --git a/schematic/manifest/generator.py b/schematic/manifest/generator.py index 5aced191e..d29c95f13 100644 --- a/schematic/manifest/generator.py +++ b/schematic/manifest/generator.py @@ -32,8 +32,10 @@ from schematic.configuration.configuration import CONFIG from schematic.utils.google_api_utils import export_manifest_drive_service +from opentelemetry import trace logger = logging.getLogger(__name__) +tracer = trace.get_tracer("generator::ManifestGenerator") class ManifestGenerator(object): @@ -1289,6 +1291,7 @@ def _gather_all_fields(self, fields, json_schema): ) return required_metadata_fields + @tracer.start_as_current_span("ManifestGenerator::get_empty_manifest") def get_empty_manifest( 
self, strict: Optional[bool], @@ -1334,6 +1337,7 @@ def _get_missing_columns(self, headers_1: list, headers_2: list) -> list: """ return set(headers_1) - set(headers_2) + @tracer.start_as_current_span("ManifestGenerator::set_dataframe_by_url") def set_dataframe_by_url( self, manifest_url: str, @@ -1425,6 +1429,7 @@ def map_annotation_names_to_display_names( # Use the above dictionary to rename columns in question return annotations.rename(columns=label_map) + @tracer.start_as_current_span("ManifestGenerator::get_manifest_with_annotations") def get_manifest_with_annotations( self, annotations: pd.DataFrame, strict: Optional[bool] = None ) -> Tuple[ps.Spreadsheet, pd.DataFrame]: @@ -1465,6 +1470,7 @@ def get_manifest_with_annotations( return manifest_url, manifest_df + @tracer.start_as_current_span("ManifestGenerator::export_sheet_to_excel") def export_sheet_to_excel( self, title: str = None, manifest_url: str = None, output_location: str = None ) -> str: @@ -1514,6 +1520,7 @@ def export_sheet_to_excel( return output_excel_file_path + @tracer.start_as_current_span("ManifestGenerator::_handle_output_format_logic") def _handle_output_format_logic( self, output_format: str = None, @@ -1998,6 +2005,9 @@ def _format_new_excel_column(self, worksheet, new_column_index: int, col: str): ) return worksheet + @tracer.start_as_current_span( + "ManifestGenerator::populate_existing_excel_spreadsheet" + ) def populate_existing_excel_spreadsheet( self, existing_excel_path: str = None, additional_df: pd.DataFrame = None ): diff --git a/schematic/schemas/data_model_parser.py b/schematic/schemas/data_model_parser.py index 2ddda42bd..892f87d97 100644 --- a/schematic/schemas/data_model_parser.py +++ b/schematic/schemas/data_model_parser.py @@ -13,8 +13,11 @@ from schematic.schemas.data_model_relationships import DataModelRelationships from schematic import LOADER +from opentelemetry import trace -logger = logging.getLogger("Synapse storage") +logger = logging.getLogger("Schemas") + +tracer = trace.get_tracer("Schemas::DataModelParser") class DataModelParser: @@ -84,6 +87,7 @@ def parse_base_model(self) -> dict: base_model = jsonld_parser.parse_jsonld_model(base_model_path) return base_model + @tracer.start_as_current_span("DataModelParser::parse_model") def parse_model(self) -> dict[str, dict[str, Any]]: """Given a data model type, instantiate and call the appropriate data model parser. 
Returns: @@ -230,6 +234,7 @@ def gather_csv_attributes_relationships( ) return attr_rel_dictionary + @tracer.start_as_current_span("Schemas::DataModelCSVParser::parse_csv_model") def parse_csv_model( self, path_to_data_model: str, @@ -527,6 +532,7 @@ def gather_jsonld_attributes_relationships(self, model_jsonld: list[dict]) -> di ) return attr_rel_dictionary + @tracer.start_as_current_span("Schemas::DataModelJSONLDParser::parse_jsonld_model") def parse_jsonld_model( self, path_to_data_model: str, diff --git a/schematic/store/synapse.py b/schematic/store/synapse.py index a15137ae8..d078d45d4 100644 --- a/schematic/store/synapse.py +++ b/schematic/store/synapse.py @@ -68,9 +68,12 @@ from schematic.store.base import BaseStorage from schematic.exceptions import AccessCredentialsError from schematic.configuration.configuration import CONFIG +from opentelemetry import trace logger = logging.getLogger("Synapse storage") +tracer = trace.get_tracer("store:SynapseStorage") + @dataclass class ManifestDownload(object): @@ -248,6 +251,7 @@ def _purge_synapse_cache( # instead of guessing how much space that we left, print out .synapseCache here logger.info(f"the total size of .synapseCache is: {nbytes} bytes") + @tracer.start_as_current_span("SynapseStorage::_query_fileview") def _query_fileview(self): self._purge_synapse_cache() try: @@ -412,6 +416,7 @@ def getStorageProjects(self, project_scope: List = None) -> list[tuple[str, str] return sorted_projects_list + @tracer.start_as_current_span("SynapseStorage::getStorageDatasetsInProject") def getStorageDatasetsInProject(self, projectId: str) -> list[tuple[str, str]]: """Gets all datasets in folder under a given storage project that the current user has access to. @@ -456,6 +461,7 @@ def getStorageDatasetsInProject(self, projectId: str) -> list[tuple[str, str]]: return sorted_dataset_list + @tracer.start_as_current_span("SynapseStorage::getFilesInStorageDataset") def getFilesInStorageDataset( self, datasetId: str, fileNames: List = None, fullpath: bool = True ) -> List[Tuple[str, str]]: @@ -525,6 +531,7 @@ def _get_manifest_id(self, manifest: pd.DataFrame) -> str: return manifest_syn_id + @tracer.start_as_current_span("SynapseStorage::getDatasetManifest") def getDatasetManifest( self, datasetId: str, @@ -712,6 +719,7 @@ def fill_in_entity_id_filename( manifest = manifest.fillna("") return dataset_files, manifest + @tracer.start_as_current_span("SynapseStorage::updateDatasetManifestFiles") def updateDatasetManifestFiles( self, dmge: DataModelGraphExplorer, datasetId: str, store: bool = True ) -> Union[Tuple[str, pd.DataFrame], None]: @@ -794,6 +802,7 @@ def _get_file_entityIds( return files + @tracer.start_as_current_span("SynapseStorage::getProjectManifests") def getProjectManifests( self, projectId: str ) -> list[tuple[tuple[str, str], tuple[str, str], tuple[str, str]]]: @@ -1100,6 +1109,7 @@ def get_table_info(self, datasetId: str = None, projectId: str = None) -> List[s return {None: None} @missing_entity_handler + @tracer.start_as_current_span("SynapseStorage::uploadDB") def uploadDB( self, dmge: DataModelGraphExplorer, @@ -1147,6 +1157,7 @@ def uploadDB( return manifest_table_id, manifest, table_manifest + @tracer.start_as_current_span("SynapseStorage::formatDB") def formatDB(self, dmge, manifest, table_column_names): """ Method to format a manifest appropriatly for upload as table @@ -1209,6 +1220,7 @@ def formatDB(self, dmge, manifest, table_column_names): return col_schema, table_manifest + @tracer.start_as_current_span("SynapseStorage::buildDB") 
def buildDB( self, datasetId: str, @@ -1284,6 +1296,7 @@ def buildDB( return manifest_table_id + @tracer.start_as_current_span("SynapseStorage::upload_manifest_file") def upload_manifest_file( self, manifest, @@ -1604,6 +1617,7 @@ def _generate_table_name(self, manifest): table_name = "synapse_storage_manifest_table" return table_name, component_name + @tracer.start_as_current_span("SynapseStorage::_add_annotations") def _add_annotations( self, dmge, @@ -1653,6 +1667,7 @@ def _create_entity_id(self, idx, row, manifest, datasetId): manifest.loc[idx, "entityId"] = entityId return manifest, entityId + @tracer.start_as_current_span("SynapseStorage::add_annotations_to_entities_files") def add_annotations_to_entities_files( self, dmge, @@ -1717,6 +1732,7 @@ def add_annotations_to_entities_files( logger.info(f"Added annotations to entity: {entityId}") return manifest + @tracer.start_as_current_span("SynapseStorage::upload_manifest_as_table") def upload_manifest_as_table( self, dmge: DataModelGraphExplorer, @@ -1810,6 +1826,7 @@ def upload_manifest_as_table( self.syn.set_annotations(manifest_annotations) return manifest_synapse_file_id + @tracer.start_as_current_span("SynapseStorage::upload_manifest_as_csv") def upload_manifest_as_csv( self, dmge, @@ -1868,6 +1885,7 @@ def upload_manifest_as_csv( return manifest_synapse_file_id + @tracer.start_as_current_span("SynapseStorage::upload_manifest_combo") def upload_manifest_combo( self, dmge, @@ -1957,6 +1975,7 @@ def upload_manifest_combo( self.syn.set_annotations(manifest_annotations) return manifest_synapse_file_id + @tracer.start_as_current_span("SynapseStorage::associateMetadataWithFiles") def associateMetadataWithFiles( self, dmge: DataModelGraphExplorer, From c554975399208f4077e741175f6028e9ffcbc53b Mon Sep 17 00:00:00 2001 From: linglp Date: Mon, 13 May 2024 12:16:29 -0400 Subject: [PATCH 032/110] add file exporter --- schematic_api/api/routes.py | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/schematic_api/api/routes.py b/schematic_api/api/routes.py index 0855db7af..ebc52a431 100644 --- a/schematic_api/api/routes.py +++ b/schematic_api/api/routes.py @@ -44,22 +44,38 @@ from opentelemetry import trace from opentelemetry.sdk.trace import TracerProvider from opentelemetry.sdk.resources import SERVICE_NAME, Resource -from opentelemetry.sdk.trace.export import BatchSpanProcessor +from opentelemetry.sdk.trace.export import BatchSpanProcessor, ConsoleSpanExporter, SimpleSpanProcessor from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter - logger = logging.getLogger(__name__) logging.basicConfig(level=logging.DEBUG) trace.set_tracer_provider( TracerProvider( - resource=Resource(attributes={SERVICE_NAME: "schematic"}) + resource=Resource(attributes={SERVICE_NAME: "schematic-api"}) ) ) + + +class FileSpanExporter(ConsoleSpanExporter): + """Create an exporter for OTEL data to a file.""" + + def __init__(self, file_path) -> None: + """Init with a path.""" + self.file_path = file_path + + def export(self, spans) -> None: + """Export the spans to the file.""" + with open(self.file_path, "a", encoding="utf-8") as f: + for span in spans: + span_json_one_line = span.to_json().replace("\n", "") + "\n" + f.write(span_json_one_line) + trace.get_tracer_provider().add_span_processor(BatchSpanProcessor(OTLPSpanExporter())) +processor = SimpleSpanProcessor(FileSpanExporter("otel_spans_schemati_api.json")) +trace.get_tracer_provider().add_span_processor(processor) tracer = 
trace.get_tracer("schematic-api") - def trace_function_params(): def decorator(func): @wraps(func) From c93250218c65b652ce04f0704df8af61062e09eb Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 13 May 2024 11:17:31 -0700 Subject: [PATCH 033/110] removed typing extensions lock --- poetry.lock | 2 +- pyproject.toml | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/poetry.lock b/poetry.lock index 711fac94d..49550859a 100644 --- a/poetry.lock +++ b/poetry.lock @@ -4862,4 +4862,4 @@ aws = ["uWSGI"] [metadata] lock-version = "2.0" python-versions = ">=3.9.0,<3.11" -content-hash = "579cc5febc6a29624324ab5845a8ee9a50b9fc8e2f1bdb987b24a7acdb4b7479" +content-hash = "5bf0c831977694ea541db24481181ec1980ec9589a2adbd9f30ed0fe7f2b2742" diff --git a/pyproject.toml b/pyproject.toml index ac6bf38c2..03b88c81e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -67,7 +67,6 @@ dateparser = "^1.1.4" pandarallel = "^1.6.4" schematic-db = {version = "0.0.41", extras = ["synapse"]} pyopenssl = {version = "^23.0.0", optional = true} -typing-extensions = "<4.6.0" dataclasses-json = "^0.6.1" pydantic = "^1.10.4" connexion = {extras = ["swagger-ui"], version = "^2.8.0", optional = true} From 0240dbe6c8010ef73473caf1153e74a5236ff746 Mon Sep 17 00:00:00 2001 From: Mialy DeFelice <85905780+mialy-defelice@users.noreply.github.com> Date: Mon, 13 May 2024 11:55:45 -0700 Subject: [PATCH 034/110] run black --- schematic/schemas/data_model_json_schema.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/schematic/schemas/data_model_json_schema.py b/schematic/schemas/data_model_json_schema.py index b654ce77c..cb58102c7 100644 --- a/schematic/schemas/data_model_json_schema.py +++ b/schematic/schemas/data_model_json_schema.py @@ -407,4 +407,4 @@ def get_json_validation_schema( os.makedirs(json_schema_dirname, exist_ok=True) with open(json_schema_log_file_path, "w", encoding="UTF-8") as js_f: json.dump(json_schema, js_f, indent=2) - return json_schema # type: ignore + return json_schema # type: ignore From 834c2c3461c04e2e8d65a05f8ae6367b1a6b174a Mon Sep 17 00:00:00 2001 From: GiaJordan Date: Fri, 31 May 2024 10:55:41 -0700 Subject: [PATCH 035/110] add version force param --- schematic/store/synapse.py | 5711 ++++++++++++++++++------------------ 1 file changed, 2857 insertions(+), 2854 deletions(-) diff --git a/schematic/store/synapse.py b/schematic/store/synapse.py index a15137ae8..e52788896 100644 --- a/schematic/store/synapse.py +++ b/schematic/store/synapse.py @@ -1,2854 +1,2857 @@ -"""Synapse storage class""" - -import atexit -from copy import deepcopy -from dataclasses import dataclass -import logging -import numpy as np -import pandas as pd -import os -import re -import secrets -import shutil -import synapseclient -import uuid # used to generate unique names for entities - -from tenacity import ( - retry, - stop_after_attempt, - wait_chain, - wait_fixed, - retry_if_exception_type, -) -from time import sleep - -# allows specifying explicit variable types -from typing import Dict, List, Tuple, Sequence, Union, Optional - -from synapseclient import ( - Synapse, - File, - Folder, - Table, - Schema, - EntityViewSchema, - EntityViewType, - Column, - as_table_columns, -) -from synapseclient.entity import File -from synapseclient.table import CsvFileTable, build_table, Schema -from synapseclient.core.exceptions import ( - SynapseHTTPError, - SynapseAuthenticationError, - SynapseUnmetAccessRestrictions, - SynapseHTTPError, -) -import synapseutils - -from schematic_db.rdb.synapse_database import 
SynapseDatabase - -from schematic.schemas.data_model_graph import DataModelGraphExplorer - -from schematic.utils.df_utils import update_df, load_df, col_in_dataframe -from schematic.utils.validate_utils import comma_separated_list_regex, rule_in_rule_list - -# entity_type_mapping, get_dir_size, create_temp_folder, check_synapse_cache_size, and clear_synapse_cache functions are used for AWS deployment -# Please do not remove these import statements -from schematic.utils.general import ( - entity_type_mapping, - get_dir_size, - create_temp_folder, - check_synapse_cache_size, - clear_synapse_cache, -) - -from schematic.utils.schema_utils import get_class_label_from_display_name - -from schematic.store.base import BaseStorage -from schematic.exceptions import AccessCredentialsError -from schematic.configuration.configuration import CONFIG - -logger = logging.getLogger("Synapse storage") - - -@dataclass -class ManifestDownload(object): - """ - syn: an object of type synapseclient. - manifest_id: id of a manifest - """ - - syn: synapseclient.Synapse - manifest_id: str - - def _download_manifest_to_folder(self) -> File: - """ - try downloading a manifest to local cache or a given folder - manifest - Return: - manifest_data: A Synapse file entity of the downloaded manifest - """ - if "SECRETS_MANAGER_SECRETS" in os.environ: - temporary_manifest_storage = "/var/tmp/temp_manifest_download" - # clear out all the existing manifests - if os.path.exists(temporary_manifest_storage): - shutil.rmtree(temporary_manifest_storage) - # create a new directory to store manifest - if not os.path.exists(temporary_manifest_storage): - os.mkdir(temporary_manifest_storage) - # create temporary folders for storing manifests - download_location = create_temp_folder(temporary_manifest_storage) - else: - download_location = CONFIG.manifest_folder - manifest_data = self.syn.get( - self.manifest_id, - downloadLocation=download_location, - ifcollision="overwrite.local", - ) - return manifest_data - - def _entity_type_checking(self) -> str: - """ - check the entity type of the id that needs to be downloaded - Return: - if the entity type is wrong, raise an error - """ - # check the type of entity - entity_type = entity_type_mapping(self.syn, self.manifest_id) - if entity_type != "file": - logger.error( - f"You are using entity type: {entity_type}. Please provide a file ID" - ) - - @staticmethod - def download_manifest( - self, newManifestName: str = "", manifest_df: pd.DataFrame = pd.DataFrame() - ) -> Union[str, File]: - """ - Download a manifest based on a given manifest id. - Args: - newManifestName(optional): new name of a manifest that gets downloaded. 
- manifest_df(optional): a dataframe containing name and id of manifests in a given asset view - Return: - manifest_data: synapse entity file object - """ - - # enables retrying if user does not have access to uncensored manifest - # pass synID to synapseclient.Synapse.get() method to download (and overwrite) file to a location - manifest_data = "" - - # check entity type - self._entity_type_checking() - - # download a manifest - try: - manifest_data = self._download_manifest_to_folder() - except (SynapseUnmetAccessRestrictions, SynapseAuthenticationError): - # if there's an error getting an uncensored manifest, try getting the censored manifest - if not manifest_df.empty: - censored_regex = re.compile(".*censored.*") - censored = manifest_df["name"].str.contains(censored_regex) - new_manifest_id = manifest_df[censored]["id"][0] - self.manifest_id = new_manifest_id - try: - manifest_data = self._download_manifest_to_folder() - except ( - SynapseUnmetAccessRestrictions, - SynapseAuthenticationError, - ) as e: - raise PermissionError( - "You don't have access to censored and uncensored manifests in this dataset." - ) from e - else: - logger.error( - f"You don't have access to the requested resource: {self.manifest_id}" - ) - - if newManifestName and os.path.exists(manifest_data.get("path")): - # Rename the file we just made to the new name - new_manifest_filename = newManifestName + ".csv" - - # get location of existing manifest. The manifest that will be renamed should live in the same folder as existing manifest. - parent_folder = os.path.dirname(manifest_data.get("path")) - - new_manifest_path_name = os.path.join(parent_folder, new_manifest_filename) - os.rename(manifest_data["path"], new_manifest_path_name) - - # Update file names/paths in manifest_data - manifest_data["name"] = new_manifest_filename - manifest_data["filename"] = new_manifest_filename - manifest_data["path"] = new_manifest_path_name - return manifest_data - - -class SynapseStorage(BaseStorage): - """Implementation of Storage interface for datasets/files stored on Synapse. - Provides utilities to list files in a specific project; update files annotations, create fileviews, etc. - - TODO: Need to define the interface and rename and/or refactor some of the methods below. - """ - - def __init__( - self, - token: Optional[str] = None, # optional parameter retrieved from browser cookie - access_token: Optional[str] = None, - project_scope: Optional[list] = None, - synapse_cache_path: Optional[str] = None, - ) -> None: - """Initializes a SynapseStorage object. - - Args: - token (Optional[str], optional): - Optional token parameter as found in browser cookie upon login to synapse. - Defaults to None. - access_token (Optional[list], optional): - Optional access token (personal or oauth). - Defaults to None. - project_scope (Optional[list], optional): Defaults to None. - synapse_cache_path (Optional[str], optional): - Location of synapse cache. - Defaults to None. - """ - self.syn = self.login(synapse_cache_path, token, access_token) - self.project_scope = project_scope - self.storageFileview = CONFIG.synapse_master_fileview_id - self.manifest = CONFIG.synapse_manifest_basename - self.root_synapse_cache = self.syn.cache.cache_root_dir - self._query_fileview() - - def _purge_synapse_cache( - self, maximum_storage_allowed_cache_gb: int = 1, minute_buffer: int = 15 - ) -> None: - """ - Purge synapse cache if it exceeds a certain size. Default to 1GB. 
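        Example (a minimal, illustrative sketch; the thresholds shown are arbitrary
        values chosen for the example, not recommended settings):

            # purge the cache once it grows past 2 GB, deleting cached files
            # that are at least 30 minutes old
            self._purge_synapse_cache(maximum_storage_allowed_cache_gb=2, minute_buffer=30)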
- Args: - maximum_storage_allowed_cache_gb (int): the maximum storage allowed - before purging cache. Default is 1 GB. - minute_buffer (int): All files created this amount of time or older will be deleted - """ - # try clearing the cache - # scan a directory and check size of files - if os.path.exists(self.root_synapse_cache): - maximum_storage_allowed_cache_bytes = maximum_storage_allowed_cache_gb * ( - 1024**3 - ) - nbytes = get_dir_size(self.root_synapse_cache) - dir_size_bytes = check_synapse_cache_size(directory=self.root_synapse_cache) - # if 1 GB has already been taken, purge cache before 15 min - if dir_size_bytes >= maximum_storage_allowed_cache_bytes: - num_of_deleted_files = clear_synapse_cache( - self.syn.cache, minutes=minute_buffer - ) - logger.info( - f"{num_of_deleted_files} files have been deleted from {self.root_synapse_cache}" - ) - else: - # on AWS, OS takes around 14-17% of our ephemeral storage (20GiB) - # instead of guessing how much space that we left, print out .synapseCache here - logger.info(f"the total size of .synapseCache is: {nbytes} bytes") - - def _query_fileview(self): - self._purge_synapse_cache() - try: - self.storageFileview = CONFIG.synapse_master_fileview_id - self.manifest = CONFIG.synapse_manifest_basename - if self.project_scope: - self.storageFileviewTable = self.syn.tableQuery( - f"SELECT * FROM {self.storageFileview} WHERE projectId IN {tuple(self.project_scope + [''])}" - ).asDataFrame() - else: - # get data in administrative fileview for this pipeline - self.storageFileviewTable = self.syn.tableQuery( - "SELECT * FROM " + self.storageFileview - ).asDataFrame() - except SynapseHTTPError: - raise AccessCredentialsError(self.storageFileview) - - @staticmethod - def login( - synapse_cache_path: Optional[str] = None, - token: Optional[str] = None, - access_token: Optional[str] = None, - ) -> synapseclient.Synapse: - """Login to Synapse - - Args: - token (Optional[str], optional): A Synapse token. Defaults to None. - access_token (Optional[str], optional): A synapse access token. Defaults to None. - synapse_cache_path (Optional[str]): location of synapse cache - - Raises: - ValueError: If unable to login with token - ValueError: If unable to loging with access token - - Returns: - synapseclient.Synapse: A Synapse object that is logged in - """ - # If no token is provided, try retrieving access token from environment - if not token and not access_token: - access_token = os.getenv("SYNAPSE_ACCESS_TOKEN") - - # login using a token - if token: - syn = synapseclient.Synapse(cache_root_dir=synapse_cache_path) - try: - syn.login(sessionToken=token, silent=True) - except SynapseHTTPError as exc: - raise ValueError( - "Please make sure you are logged into synapse.org." - ) from exc - elif access_token: - try: - syn = synapseclient.Synapse(cache_root_dir=synapse_cache_path) - syn.default_headers["Authorization"] = f"Bearer {access_token}" - except SynapseHTTPError as exc: - raise ValueError( - "No access to resources. 
Please make sure that your token is correct" - ) from exc - else: - # login using synapse credentials provided by user in .synapseConfig (default) file - syn = synapseclient.Synapse( - configPath=CONFIG.synapse_configuration_path, - cache_root_dir=synapse_cache_path, - ) - syn.login(silent=True) - return syn - - def missing_entity_handler(method): - def wrapper(*args, **kwargs): - try: - return method(*args, **kwargs) - except SynapseHTTPError as ex: - str_message = str(ex).replace("\n", "") - if "trash" in str_message or "does not exist" in str_message: - logging.warning(str_message) - return None - else: - raise ex - - return wrapper - - def getStorageFileviewTable(self): - """Returns the storageFileviewTable obtained during initialization.""" - return self.storageFileviewTable - - def getPaginatedRestResults(self, currentUserId: str) -> Dict[str, str]: - """Gets the paginated results of the REST call to Synapse to check what projects the current user has access to. - - Args: - currentUserId: synapse id for the user whose projects we want to get. - - Returns: - A dictionary with a next page token and the results. - """ - all_results = self.syn.restGET( - "/projects/user/{principalId}".format(principalId=currentUserId) - ) - - while ( - "nextPageToken" in all_results - ): # iterate over next page token in results while there is any - results_token = self.syn.restGET( - "/projects/user/{principalId}?nextPageToken={nextPageToken}".format( - principalId=currentUserId, - nextPageToken=all_results["nextPageToken"], - ) - ) - all_results["results"].extend(results_token["results"]) - - if "nextPageToken" in results_token: - all_results["nextPageToken"] = results_token["nextPageToken"] - else: - del all_results["nextPageToken"] - - return all_results - - def getStorageProjects(self, project_scope: List = None) -> list[tuple[str, str]]: - """Gets all storage projects the current user has access to, within the scope of the 'storageFileview' attribute. - - Returns: - A list of storage projects the current user has access to; the list consists of tuples (projectId, projectName). 
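        Example (illustrative sketch; the project IDs and names below are hypothetical):

            projects = self.getStorageProjects()
            # e.g. [("syn11111111", "Project A"), ("syn22222222", "Project B")]

            # limit the results to a subset of projects
            scoped_projects = self.getStorageProjects(project_scope=["syn11111111"])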
- """ - - # get the set of all storage Synapse project accessible for this pipeline - storageProjects = self.storageFileviewTable["projectId"].unique() - - # get the set of storage Synapse project accessible for this user - - # get current user name and user ID - currentUser = self.syn.getUserProfile() - currentUserName = currentUser.userName - currentUserId = currentUser.ownerId - - # get a list of projects from Synapse - currentUserProjects = self.getPaginatedRestResults(currentUserId) - - # prune results json filtering project id - currentUserProjects = [ - currentUserProject.get("id") - for currentUserProject in currentUserProjects["results"] - ] - - # find set of user projects that are also in this pipeline's storage projects set - storageProjects = list(set(storageProjects) & set(currentUserProjects)) - - # Limit projects to scope if specified - if project_scope: - storageProjects = list(set(storageProjects) & set(project_scope)) - - if not storageProjects: - raise Warning( - f"There are no projects that the user has access to that match the criteria of the specified project scope: {project_scope}" - ) - - # prepare a return list of project IDs and names - projects = [] - for projectId in storageProjects: - projectName = self.syn.get(projectId, downloadFile=False).name - projects.append((projectId, projectName)) - - sorted_projects_list = sorted(projects, key=lambda tup: tup[0]) - - return sorted_projects_list - - def getStorageDatasetsInProject(self, projectId: str) -> list[tuple[str, str]]: - """Gets all datasets in folder under a given storage project that the current user has access to. - - Args: - projectId: synapse ID of a storage project. - - Returns: - A list of datasets within the given storage project; the list consists of tuples (datasetId, datasetName). - None: If the projectId cannot be found on Synapse. - """ - - # select all folders and fetch their names from within the storage project; - # if folder content type is defined, only select folders that contain datasets - areDatasets = False - if "contentType" in self.storageFileviewTable.columns: - foldersTable = self.storageFileviewTable[ - (self.storageFileviewTable["contentType"] == "dataset") - & (self.storageFileviewTable["projectId"] == projectId) - ] - areDatasets = True - else: - foldersTable = self.storageFileviewTable[ - (self.storageFileviewTable["type"] == "folder") - & (self.storageFileviewTable["parentId"] == projectId) - ] - - # get an array of tuples (folderId, folderName) - # some folders are part of datasets; others contain datasets - # each dataset parent is the project; folders part of a dataset have another folder as a parent - # to get folders if and only if they contain datasets for each folder - # check if folder's parent is the project; if so that folder contains a dataset, - # unless the folder list has already been filtered to dataset folders based on contentType attribute above - - datasetList = [] - folderProperties = ["id", "name"] - for folder in list( - foldersTable[folderProperties].itertuples(index=False, name=None) - ): - datasetList.append(folder) - - sorted_dataset_list = sorted(datasetList, key=lambda tup: tup[0]) - - return sorted_dataset_list - - def getFilesInStorageDataset( - self, datasetId: str, fileNames: List = None, fullpath: bool = True - ) -> List[Tuple[str, str]]: - """Gets all files in a given dataset folder. - - Args: - datasetId: synapse ID of a storage dataset. 
- fileNames: get a list of files with particular names; defaults to None in which case all dataset files are returned (except bookkeeping files, e.g. - metadata manifests); if fileNames is not None, all files matching the names in the fileNames list are returned if present. - fullpath: if True return the full path as part of this filename; otherwise return just base filename - - Returns: - A list of files; the list consists of tuples (fileId, fileName). - - Raises: - ValueError: Dataset ID not found. - """ - # select all files within a given storage dataset folder (top level folder in a Synapse storage project or folder marked with contentType = 'dataset') - walked_path = synapseutils.walk( - self.syn, datasetId, includeTypes=["folder", "file"] - ) - - file_list = [] - - # iterate over all results - for dirpath, dirname, filenames in walked_path: - # iterate over all files in a folder - for filename in filenames: - if (not "manifest" in filename[0] and not fileNames) or ( - fileNames and filename[0] in fileNames - ): - # don't add manifest to list of files unless it is specified in the list of specified fileNames; return all found files - # except the manifest if no fileNames have been specified - # TODO: refactor for clarity/maintainability - - if fullpath: - # append directory path to filename - filename = (dirpath[0] + "/" + filename[0], filename[1]) - - # add file name file id tuple, rearranged so that id is first and name follows - file_list.append(filename[::-1]) - - return file_list - - def _get_manifest_id(self, manifest: pd.DataFrame) -> str: - """If both censored and uncensored manifests are present, return uncensored manifest; if only one manifest is present, return manifest id of that manifest; if more than two manifests are present, return the manifest id of the first one. - Args: - manifest: a dataframe contains name and id of manifests in a given asset view - - Return: - manifest_syn_id: id of a given censored or uncensored manifest - """ - censored_regex = re.compile(".*censored.*") - censored = manifest["name"].str.contains(censored_regex) - if any(censored): - # Try to use uncensored manifest first - not_censored = ~censored - if any(not_censored): - manifest_syn_id = manifest[not_censored]["id"].iloc[0] - # if only censored manifests are available, just use the first censored manifest - else: - manifest_syn_id = manifest["id"].iloc[0] - - # otherwise, use the first (implied only) version that exists - else: - manifest_syn_id = manifest["id"].iloc[0] - - return manifest_syn_id - - def getDatasetManifest( - self, - datasetId: str, - downloadFile: bool = False, - newManifestName: str = "", - ) -> Union[str, File]: - """Gets the manifest associated with a given dataset. - - Args: - datasetId: synapse ID of a storage dataset. - downloadFile: boolean argument indicating if manifest file in dataset should be downloaded or not. - newManifestName: new name of a manifest that gets downloaded - - Returns: - manifest_syn_id (String): Synapse ID of exisiting manifest file. - manifest_data (synapseclient.entity.File): Synapse entity if downloadFile is True. - "" (String): No pre-exisiting manifest in dataset. 
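        Example (illustrative sketch; the synIDs are hypothetical):

            # get the Synapse ID of the manifest associated with a dataset
            manifest_syn_id = self.getDatasetManifest("syn12345678")

            # download the manifest and rename the local copy
            manifest_entity = self.getDatasetManifest(
                "syn12345678", downloadFile=True, newManifestName="my_manifest"
            )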
- """ - manifest_data = "" - - # get a list of files containing the manifest for this dataset (if any) - all_files = self.storageFileviewTable - - # construct regex based on manifest basename in the config - manifest_re = re.compile(os.path.basename(self.manifest) + ".*.[tc]sv") - - # search manifest based on given manifest basename regex above - # and return a dataframe containing name and id of manifests in a given asset view - manifest = all_files[ - (all_files["name"].str.contains(manifest_re, regex=True)) - & (all_files["parentId"] == datasetId) - ] - - manifest = manifest[["id", "name"]] - - # if there is no pre-exisiting manifest in the specified dataset - if manifest.empty: - logger.warning( - f"Could not find a manifest that fits basename {self.manifest} in asset view and dataset {datasetId}" - ) - return "" - - # if there is an exisiting manifest - else: - manifest_syn_id = self._get_manifest_id(manifest) - if downloadFile: - md = ManifestDownload(self.syn, manifest_id=manifest_syn_id) - manifest_data = ManifestDownload.download_manifest( - md, newManifestName=newManifestName, manifest_df=manifest - ) - ## TO DO: revisit how downstream code handle manifest_data. If the downstream code would break when manifest_data is an empty string, - ## then we should catch the error here without returning an empty string. - if not manifest_data: - logger.debug( - f"No manifest data returned. Please check if you have successfully downloaded manifest: {manifest_syn_id}" - ) - return manifest_data - return manifest_syn_id - - def getDataTypeFromManifest(self, manifestId: str): - """Fetch a manifest and return data types of all columns - Args: - manifestId: synapse ID of a manifest - """ - # get manifest file path - manifest_filepath = self.syn.get(manifestId).path - - # load manifest dataframe - manifest = load_df( - manifest_filepath, - preserve_raw_input=False, - data_model=False, - ) - - # convert the dataFrame to use best possible dtypes. - manifest_new = manifest.convert_dtypes() - - # get data types of columns - result = manifest_new.dtypes.to_frame("dtypes").reset_index() - - # return the result as a dictionary - result_dict = result.set_index("index")["dtypes"].astype(str).to_dict() - - return result_dict - - def _get_files_metadata_from_dataset( - self, datasetId: str, only_new_files: bool, manifest: pd.DataFrame = None - ) -> Optional[dict]: - """retrieve file ids under a particular datasetId - - Args: - datasetId (str): a dataset id - only_new_files (bool): if only adding new files that are not already exist - manifest (pd.DataFrame): metadata manifest dataframe. Default to None. 
- - Returns: - a dictionary that contains filename and entityid under a given datasetId or None if there is nothing under a given dataset id are not available - """ - dataset_files = self.getFilesInStorageDataset(datasetId) - if dataset_files: - dataset_file_names_id_dict = self._get_file_entityIds( - dataset_files, only_new_files=only_new_files, manifest=manifest - ) - return dataset_file_names_id_dict - else: - return None - - def add_entity_id_and_filename( - self, datasetId: str, manifest: pd.DataFrame - ) -> pd.DataFrame: - """add entityid and filename column to an existing manifest assuming entityId column is not already present - - Args: - datasetId (str): dataset syn id - manifest (pd.DataFrame): existing manifest dataframe, assuming this dataframe does not have an entityId column and Filename column is present but completely empty - - Returns: - pd.DataFrame: returns a pandas dataframe - """ - # get file names and entity ids of a given dataset - dataset_files_dict = self._get_files_metadata_from_dataset( - datasetId, only_new_files=False - ) - - if dataset_files_dict: - # turn manifest dataframe back to a dictionary for operation - manifest_dict = manifest.to_dict("list") - - # update Filename column - # add entityId column to the end - manifest_dict.update(dataset_files_dict) - - # if the component column exists in existing manifest, fill up that column - if "Component" in manifest_dict.keys(): - manifest_dict["Component"] = manifest_dict["Component"] * max( - 1, len(manifest_dict["Filename"]) - ) - - # turn dictionary back to a dataframe - manifest_df_index = pd.DataFrame.from_dict(manifest_dict, orient="index") - manifest_df_updated = manifest_df_index.transpose() - - # fill na with empty string - manifest_df_updated = manifest_df_updated.fillna("") - - # drop index - manifest_df_updated = manifest_df_updated.reset_index(drop=True) - - return manifest_df_updated - else: - return manifest - - def fill_in_entity_id_filename( - self, datasetId: str, manifest: pd.DataFrame - ) -> Tuple[List, pd.DataFrame]: - """fill in Filename column and EntityId column. EntityId column and Filename column will be created if not already present. - - Args: - datasetId (str): dataset syn id - manifest (pd.DataFrame): existing manifest dataframe. - - Returns: - Tuple[List, pd.DataFrame]: a list of synIds that are under a given datasetId folder and updated manifest dataframe - """ - # get dataset file names and entity id as a list of tuple - dataset_files = self.getFilesInStorageDataset(datasetId) - - # update manifest with additional filenames, if any - # note that if there is an existing manifest and there are files in the dataset - # the columns Filename and entityId are assumed to be present in manifest schema - # TODO: use idiomatic panda syntax - if dataset_files: - new_files = self._get_file_entityIds( - dataset_files=dataset_files, only_new_files=True, manifest=manifest - ) - - # update manifest so that it contains new dataset files - new_files = pd.DataFrame(new_files) - manifest = ( - pd.concat([manifest, new_files], sort=False) - .reset_index() - .drop("index", axis=1) - ) - - manifest = manifest.fillna("") - return dataset_files, manifest - - def updateDatasetManifestFiles( - self, dmge: DataModelGraphExplorer, datasetId: str, store: bool = True - ) -> Union[Tuple[str, pd.DataFrame], None]: - """Fetch the names and entity IDs of all current files in dataset in store, if any; update dataset's manifest with new files, if any. 
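        Example (illustrative sketch; the dataset ID is hypothetical):

            result = self.updateDatasetManifestFiles(dmge, datasetId="syn12345678", store=True)
            if result is not None:
                manifest_id, manifest_df = result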
- - Args: - dmge: DataModelGraphExplorer Instance - datasetId: synapse ID of a storage dataset. - store: if set to True store updated manifest in asset store; if set to False - return a Pandas dataframe containing updated manifest but do not store to asset store - - - Returns: - Synapse ID of updated manifest and Pandas dataframe containing the updated manifest. - If there is no existing manifest return None - """ - - # get existing manifest Synapse ID - manifest_id = self.getDatasetManifest(datasetId) - - # if there is no manifest return None - if not manifest_id: - return None - - manifest_filepath = self.syn.get(manifest_id).path - manifest = load_df(manifest_filepath) - - # update manifest with additional filenames, if any - # note that if there is an existing manifest and there are files in the dataset - # the columns Filename and entityId are assumed to be present in manifest schema - # TODO: use idiomatic panda syntax - - dataset_files, manifest = self.fill_in_entity_id_filename(datasetId, manifest) - if dataset_files: - # update the manifest file, so that it contains the relevant entity IDs - if store: - manifest.to_csv(manifest_filepath, index=False) - - # store manifest and update associated metadata with manifest on Synapse - manifest_id = self.associateMetadataWithFiles( - dmge, manifest_filepath, datasetId - ) - - return manifest_id, manifest - - def _get_file_entityIds( - self, - dataset_files: List, - only_new_files: bool = False, - manifest: pd.DataFrame = None, - ): - """ - Get a dictionary of files in a dataset. Either files that are not in the current manifest or all files - - Args: - manifest: metadata manifest - dataset_file: List of all files in a dataset - only_new_files: boolean to control whether only new files are returned or all files in the dataset - Returns: - files: dictionary of file names and entityIDs, with scope as specified by `only_new_files` - """ - files = {"Filename": [], "entityId": []} - - if only_new_files: - if manifest is None: - raise UnboundLocalError( - "No manifest was passed in, a manifest is required when `only_new_files` is True." - ) - - # find new files (that are not in the current manifest) if any - for file_id, file_name in dataset_files: - if not file_id in manifest["entityId"].values: - files["Filename"].append(file_name) - files["entityId"].append(file_id) - else: - # get all files - for file_id, file_name in dataset_files: - files["Filename"].append(file_name) - files["entityId"].append(file_id) - - return files - - def getProjectManifests( - self, projectId: str - ) -> list[tuple[tuple[str, str], tuple[str, str], tuple[str, str]]]: - """Gets all metadata manifest files across all datasets in a specified project. - - Returns: A list of datasets per project; metadata manifest Synapse ID for each dataset; and the corresponding schema component of the manifest - as a list of tuples, one for each manifest: - [ - ( - (datasetId, dataName), - (manifestId, manifestName), - (componentSchemaLabel, componentSchemaLabel) TODO: # get component name from schema - ), - ... 
- ] - - TODO: Return manifest URI instead of Synapse ID for interoperability with other implementations of a store interface - """ - component = None - entity = None - manifests = [] - - datasets = self.getStorageDatasetsInProject(projectId) - - for datasetId, datasetName in datasets: - # encode information about the manifest in a simple list (so that R clients can unpack it) - # eventually can serialize differently - - # Get synID of manifest for a dataset - manifestId = self.getDatasetManifest(datasetId) - - # If a manifest exists, get the annotations for it, else return base 'manifest' tuple - if manifestId: - annotations = self.getFileAnnotations(manifestId) - - # If manifest has annotations specifying component, use that - if annotations and "Component" in annotations: - component = annotations["Component"] - entity = self.syn.get(manifestId, downloadFile=False) - manifest_name = entity["properties"]["name"] - - # otherwise download the manifest and parse for information - elif not annotations or "Component" not in annotations: - logging.debug( - f"No component annotations have been found for manifest {manifestId}. " - "The manifest will be downloaded and parsed instead. " - "For increased speed, add component annotations to manifest." - ) - - manifest_info = self.getDatasetManifest( - datasetId, downloadFile=True - ) - manifest_name = manifest_info["properties"].get("name", "") - - if not manifest_name: - logger.error(f"Failed to download manifests from {datasetId}") - - manifest_path = manifest_info["path"] - - manifest_df = load_df(manifest_path) - - # Get component from component column if it exists - if ( - "Component" in manifest_df - and not manifest_df["Component"].empty - ): - list(set(manifest_df["Component"])) - component = list(set(manifest_df["Component"])) - - # Added to address issues raised during DCA testing - if "" in component: - component.remove("") - - if len(component) == 1: - component = component[0] - elif len(component) > 1: - logging.warning( - f"Manifest {manifestId} is composed of multiple components. Schematic does not support mulit-component manifests at this time." - "Behavior of manifests with multiple components is undefined" - ) - else: - manifest_name = "" - component = None - if component: - manifest = ( - (datasetId, datasetName), - (manifestId, manifest_name), - (component, component), - ) - elif manifestId: - logging.debug( - f"Manifest {manifestId} does not have an associated Component" - ) - manifest = ( - (datasetId, datasetName), - (manifestId, manifest_name), - ("", ""), - ) - else: - manifest = ( - (datasetId, datasetName), - ("", ""), - ("", ""), - ) - - if manifest: - manifests.append(manifest) - - return manifests - - def upload_project_manifests_to_synapse( - self, dmge: DataModelGraphExplorer, projectId: str - ) -> List[str]: - """Upload all metadata manifest files across all datasets in a specified project as tables in Synapse. - - Returns: String of all the manifest_table_ids of all the manifests that have been loaded. 
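        Example (illustrative sketch; the project ID is hypothetical):

            manifest_loaded = self.upload_project_manifests_to_synapse(dmge, "syn12345678")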
- """ - - manifests = [] - manifest_loaded = [] - datasets = self.getStorageDatasetsInProject(projectId) - - for datasetId, datasetName in datasets: - # encode information about the manifest in a simple list (so that R clients can unpack it) - # eventually can serialize differently - - manifest = ((datasetId, datasetName), ("", ""), ("", "")) - - manifest_info = self.getDatasetManifest(datasetId, downloadFile=True) - if manifest_info: - manifest_id = manifest_info["properties"]["id"] - manifest_name = manifest_info["properties"]["name"] - manifest_path = manifest_info["path"] - manifest_df = load_df(manifest_path) - manifest_table_id = uploadDB( - dmge=dmge, - manifest=manifest, - datasetId=datasetId, - table_name=datasetName, - ) - manifest_loaded.append(datasetName) - return manifest_loaded - - def upload_annotated_project_manifests_to_synapse( - self, projectId: str, path_to_json_ld: str, dry_run: bool = False - ) -> List[str]: - """ - Purpose: - For all manifests in a project, upload them as a table and add annotations manifest csv. - Assumes the manifest is already present as a CSV in a dataset in the project. - - """ - # Instantiate DataModelParser - data_model_parser = DataModelParser(path_to_data_model=path_to_json_ld) - # Parse Model - parsed_data_model = data_model_parser.parse_model() - - # Instantiate DataModelGraph - data_model_grapher = DataModelGraph(parsed_data_model) - - # Generate graph - graph_data_model = data_model_grapher.generate_data_model_graph() - - # Instantiate DataModelGraphExplorer - dmge = DataModelGraphExplorer(graph_data_model) - - manifests = [] - manifest_loaded = [] - datasets = self.getStorageDatasetsInProject(projectId) - for datasetId, datasetName in datasets: - # encode information about the manifest in a simple list (so that R clients can unpack it) - # eventually can serialize differently - - manifest = ((datasetId, datasetName), ("", ""), ("", "")) - manifests.append(manifest) - - manifest_info = self.getDatasetManifest(datasetId, downloadFile=True) - - if manifest_info: - manifest_id = manifest_info["properties"]["id"] - manifest_name = manifest_info["properties"]["name"] - manifest_path = manifest_info["path"] - manifest = ( - (datasetId, datasetName), - (manifest_id, manifest_name), - ("", ""), - ) - if not dry_run: - manifest_syn_id = self.associateMetadataWithFiles( - dmge, manifest_path, datasetId, manifest_record_type="table" - ) - manifest_loaded.append(manifest) - - return manifests, manifest_loaded - - def move_entities_to_new_project( - self, - projectId: str, - newProjectId: str, - returnEntities: bool = False, - dry_run: bool = False, - ): - """ - For each manifest csv in a project, look for all the entitiy ids that are associated. - Look up the entitiy in the files, move the entity to new project. 
- """ - - manifests = [] - manifest_loaded = [] - datasets = self.getStorageDatasetsInProject(projectId) - if datasets: - for datasetId, datasetName in datasets: - # encode information about the manifest in a simple list (so that R clients can unpack it) - # eventually can serialize differently - - manifest = ((datasetId, datasetName), ("", ""), ("", "")) - manifests.append(manifest) - - manifest_info = self.getDatasetManifest(datasetId, downloadFile=True) - if manifest_info: - manifest_id = manifest_info["properties"]["id"] - manifest_name = manifest_info["properties"]["name"] - manifest_path = manifest_info["path"] - manifest_df = load_df(manifest_path) - - manifest = ( - (datasetId, datasetName), - (manifest_id, manifest_name), - ("", ""), - ) - manifest_loaded.append(manifest) - - annotation_entities = self.storageFileviewTable[ - (self.storageFileviewTable["id"].isin(manifest_df["entityId"])) - & (self.storageFileviewTable["type"] == "folder") - ]["id"] - - if returnEntities: - for entityId in annotation_entities: - if not dry_run: - self.syn.move(entityId, datasetId) - else: - logging.info( - f"{entityId} will be moved to folder {datasetId}." - ) - else: - # generate project folder - archive_project_folder = Folder( - projectId + "_archive", parent=newProjectId - ) - archive_project_folder = self.syn.store(archive_project_folder) - - # generate dataset folder - dataset_archive_folder = Folder( - "_".join([datasetId, datasetName, "archive"]), - parent=archive_project_folder.id, - ) - dataset_archive_folder = self.syn.store(dataset_archive_folder) - - for entityId in annotation_entities: - # move entities to folder - if not dry_run: - self.syn.move(entityId, dataset_archive_folder.id) - else: - logging.info( - f"{entityId} will be moved to folder {dataset_archive_folder.id}." - ) - else: - raise LookupError( - f"No datasets were found in the specified project: {projectId}. Re-check specified master_fileview in CONFIG and retry." - ) - return manifests, manifest_loaded - - def get_synapse_table(self, synapse_id: str) -> Tuple[pd.DataFrame, CsvFileTable]: - """Download synapse table as a pd dataframe; return table schema and etags as results too - - Args: - synapse_id: synapse ID of the table to query - """ - - results = self.syn.tableQuery("SELECT * FROM {}".format(synapse_id)) - df = results.asDataFrame(rowIdAndVersionInIndex=False) - - return df, results - - def _get_tables(self, datasetId: str = None, projectId: str = None) -> List[Table]: - if projectId: - project = projectId - elif datasetId: - project = self.syn.get(self.getDatasetProject(datasetId)) - - return list(self.syn.getChildren(project, includeTypes=["table"])) - - def get_table_info(self, datasetId: str = None, projectId: str = None) -> List[str]: - """Gets the names of the tables in the schema - Can pass in a synID for a dataset or project - Returns: - list[str]: A list of table names - """ - tables = self._get_tables(datasetId=datasetId, projectId=projectId) - if tables: - return {table["name"]: table["id"] for table in tables} - else: - return {None: None} - - @missing_entity_handler - def uploadDB( - self, - dmge: DataModelGraphExplorer, - manifest: pd.DataFrame, - datasetId: str, - table_name: str, - restrict: bool = False, - table_manipulation: str = "replace", - table_column_names: str = "class_label", - ): - """ - Method to upload a database to an asset store. 
In synapse, this will upload a metadata table - - Args: - dmge: DataModelGraphExplorer object - manifest: pd.Df manifest to upload - datasetId: synID of the dataset for the manifest - table_name: name of the table to be uploaded - restrict: bool, whether or not the manifest contains sensitive data that will need additional access restrictions - existingTableId: str of the synId of the existing table, if one already exists - table_manipulation: str, 'replace' or 'upsert', in the case where a manifest already exists, should the new metadata replace the existing (replace) or be added to it (upsert) - table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display - name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain - display label formatting. - Returns: - manifest_table_id: synID of the uploaded table - manifest: the original manifset - table_manifest: manifest formatted appropriately for the table - - """ - - col_schema, table_manifest = self.formatDB( - dmge=dmge, manifest=manifest, table_column_names=table_column_names - ) - - manifest_table_id = self.buildDB( - datasetId, - table_name, - col_schema, - table_manifest, - table_manipulation, - dmge, - restrict, - ) - - return manifest_table_id, manifest, table_manifest - - def formatDB(self, dmge, manifest, table_column_names): - """ - Method to format a manifest appropriatly for upload as table - - Args: - dmge: DataModelGraphExplorer object - manifest: pd.Df manifest to upload - table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display - name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain - display label formatting. - Returns: - col_schema: schema for table columns: type, size, etc - table_manifest: formatted manifest - - """ - # Rename the manifest columns to display names to match fileview - - blacklist_chars = ["(", ")", ".", " ", "-"] - manifest_columns = manifest.columns.tolist() - - table_manifest = deepcopy(manifest) - - if table_column_names == "display_name": - cols = table_manifest.columns - - elif table_column_names == "display_label": - cols = [ - str(col).translate({ord(x): "" for x in blacklist_chars}) - for col in manifest_columns - ] - - elif table_column_names == "class_label": - cols = [ - get_class_label_from_display_name(str(col)).translate( - {ord(x): "" for x in blacklist_chars} - ) - for col in manifest_columns - ] - else: - ValueError( - f"The provided table_column_name: {table_column_names} is not valid, please resubmit with an allowed value only." - ) - - cols = list(map(lambda x: x.replace("EntityId", "entityId"), cols)) - - # Reset column names in table manifest - table_manifest.columns = cols - - # move entity id to end of df - entity_col = table_manifest.pop("entityId") - table_manifest.insert(len(table_manifest.columns), "entityId", entity_col) - - # Get the column schema - col_schema = as_table_columns(table_manifest) - - # Set Id column length to 64 (for some reason not being auto set.) 
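        # col_schema is a list of dict-like column definitions returned by as_table_columns();
        # the loop below forces maximumSize=64 on the Id column, which holds the
        # 36-character UUID strings generated for each manifest row.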
- for i, col in enumerate(col_schema): - if col["name"].lower() == "id": - col_schema[i]["maximumSize"] = 64 - - return col_schema, table_manifest - - def buildDB( - self, - datasetId: str, - table_name: str, - col_schema: List, - table_manifest: pd.DataFrame, - table_manipulation: str, - dmge: DataModelGraphExplorer, - restrict: bool = False, - ): - """ - Method to construct the table appropriately: create new table, replace existing, or upsert new into existing - Calls TableOperations class to execute - - Args: - datasetId: synID of the dataset for the manifest - table_name: name of the table to be uploaded - col_schema: schema for table columns: type, size, etc from `formatDB` - table_manifest: formatted manifest that can be uploaded as a table - table_manipulation: str, 'replace' or 'upsert', in the case where a manifest already exists, should the new metadata replace the existing (replace) or be added to it (upsert) - restrict: bool, whether or not the manifest contains sensitive data that will need additional access restrictions - - Returns: - manifest_table_id: synID of the uploaded table - - """ - table_info = self.get_table_info(datasetId=datasetId) - # Put table manifest onto synapse - schema = Schema( - name=table_name, - columns=col_schema, - parent=self.getDatasetProject(datasetId), - ) - - if table_name in table_info: - existingTableId = table_info[table_name] - else: - existingTableId = None - - tableOps = TableOperations( - synStore=self, - tableToLoad=table_manifest, - tableName=table_name, - datasetId=datasetId, - existingTableId=existingTableId, - restrict=restrict, - ) - - if not table_manipulation or table_name not in table_info.keys(): - manifest_table_id = tableOps.createTable( - columnTypeDict=col_schema, - specifySchema=True, - ) - elif table_name in table_info.keys() and table_info[table_name]: - if table_manipulation.lower() == "replace": - manifest_table_id = tableOps.replaceTable( - specifySchema=True, - columnTypeDict=col_schema, - ) - elif table_manipulation.lower() == "upsert": - manifest_table_id = tableOps.upsertTable( - dmge=dmge, - ) - elif table_manipulation.lower() == "update": - manifest_table_id = tableOps.updateTable() - - if table_manipulation and table_manipulation.lower() == "upsert": - existing_tables = self.get_table_info(datasetId=datasetId) - tableId = existing_tables[table_name] - annos = self.syn.get_annotations(tableId) - annos["primary_key"] = table_manifest["Component"][0] + "_id" - annos = self.syn.set_annotations(annos) - - return manifest_table_id - - def upload_manifest_file( - self, - manifest, - metadataManifestPath, - datasetId, - restrict_manifest, - component_name="", - ): - # Update manifest to have the new entityId column - manifest.to_csv(metadataManifestPath, index=False) - - # store manifest to Synapse as a CSV - # update file name - file_name_full = metadataManifestPath.split("/")[-1] - file_extension = file_name_full.split(".")[-1] - - # Differentiate "censored" and "uncensored" manifest - if "censored" in file_name_full: - file_name_new = ( - os.path.basename(CONFIG.synapse_manifest_basename) - + "_" - + component_name - + "_censored" - + "." - + file_extension - ) - else: - file_name_new = ( - os.path.basename(CONFIG.synapse_manifest_basename) - + "_" - + component_name - + "." 
- + file_extension - ) - - manifestSynapseFile = File( - metadataManifestPath, - description="Manifest for dataset " + datasetId, - parent=datasetId, - name=file_name_new, - ) - manifest_synapse_file_id = self.syn.store( - manifestSynapseFile, isRestricted=restrict_manifest - ).id - - synapseutils.copy_functions.changeFileMetaData( - syn=self.syn, entity=manifest_synapse_file_id, downloadAs=file_name_new - ) - - return manifest_synapse_file_id - - @missing_entity_handler - def format_row_annotations( - self, dmge, row, entityId: str, hideBlanks: bool, annotation_keys: str - ): - # prepare metadata for Synapse storage (resolve display name into a name that Synapse annotations support (e.g no spaces, parenthesis) - # note: the removal of special characters, will apply only to annotation keys; we are not altering the manifest - # this could create a divergence between manifest column and annotations. this should be ok for most use cases. - # columns with special characters are outside of the schema - metadataSyn = {} - blacklist_chars = ["(", ")", ".", " ", "-"] - - for k, v in row.to_dict().items(): - if annotation_keys == "display_label": - keySyn = str(k).translate({ord(x): "" for x in blacklist_chars}) - elif annotation_keys == "class_label": - keySyn = get_class_label_from_display_name(str(k)).translate( - {ord(x): "" for x in blacklist_chars} - ) - - # Skip `Filename` and `ETag` columns when setting annotations - if keySyn in ["Filename", "ETag", "eTag"]: - continue - - # truncate annotation values to 500 characters if the - # size of values is greater than equal to 500 characters - # add an explicit [truncatedByDataCuratorApp] message at the end - # of every truncated message to indicate that the cell value - # has been truncated - if isinstance(v, str) and len(v) >= 500: - v = v[0:472] + "[truncatedByDataCuratorApp]" - - metadataSyn[keySyn] = v - # set annotation(s) for the various objects/items in a dataset on Synapse - annos = self.syn.get_annotations(entityId) - csv_list_regex = comma_separated_list_regex() - for anno_k, anno_v in metadataSyn.items(): - # Remove keys with nan or empty string values from dict of annotations to be uploaded - # if present on current data annotation - if hideBlanks and ( - anno_v == "" or (isinstance(anno_v, float) and np.isnan(anno_v)) - ): - annos.pop(anno_k) if anno_k in annos.keys() else annos - # Otherwise save annotation as approrpriate - else: - if isinstance(anno_v, float) and np.isnan(anno_v): - annos[anno_k] = "" - elif ( - isinstance(anno_v, str) - and re.fullmatch(csv_list_regex, anno_v) - and rule_in_rule_list( - "list", dmge.get_node_validation_rules(anno_k) - ) - ): - annos[anno_k] = anno_v.split(",") - else: - annos[anno_k] = anno_v - - return annos - - @missing_entity_handler - def format_manifest_annotations(self, manifest, manifest_synapse_id): - """ - Set annotations for the manifest (as a whole) so they can be applied to the manifest table or csv. - For now just getting the Component. - """ - - entity = self.syn.get(manifest_synapse_id, downloadFile=False) - is_file = entity.concreteType.endswith(".FileEntity") - is_table = entity.concreteType.endswith(".TableEntity") - - if is_file: - # Get file metadata - metadata = self.getFileAnnotations(manifest_synapse_id) - - # If there is a defined component add it to the metadata. - if "Component" in manifest.columns: - # Gather component information - component = manifest["Component"].unique() - - # Double check that only a single component is listed, else raise an error. 
- try: - len(component) == 1 - except ValueError as err: - raise ValueError( - f"Manifest has more than one component. Please check manifest and resubmit." - ) from err - - # Add component to metadata - metadata["Component"] = component[0] - - elif is_table: - # Get table metadata - metadata = self.getTableAnnotations(manifest_synapse_id) - - # Get annotations - annos = self.syn.get_annotations(manifest_synapse_id) - - # Add metadata to the annotations - for annos_k, annos_v in metadata.items(): - annos[annos_k] = annos_v - - return annos - - ''' - def annotate_upload_manifest_table(self, manifest, datasetId, metadataManifestPath, - useSchemaLabel: bool = True, hideBlanks: bool = False, restrict_manifest = False): - """ - Purpose: - Works very similarly to associateMetadataWithFiles except takes in the manifest - rather than the manifest path - - """ - - # Add uuid for table updates and fill. - if not "Uuid" in manifest.columns: - manifest["Uuid"] = '' - - for idx,row in manifest.iterrows(): - if not row["Uuid"]: - gen_uuid = uuid.uuid4() - row["Uuid"] = gen_uuid - manifest.loc[idx, 'Uuid'] = gen_uuid - - # add entityId as a column if not already there or - # fill any blanks with an empty string. - if not "entityId" in manifest.columns: - manifest["entityId"] = "" - else: - manifest["entityId"].fillna("", inplace=True) - - # get a DataModelGraphExplorer object to ensure schema attribute names used in manifest are translated to schema labels for synapse annotations - dmge = DataModelGraphExplorer() - - # Create table name here. - if 'Component' in manifest.columns: - table_name = manifest['Component'][0].lower() + '_synapse_storage_manifest_table' - else: - table_name = 'synapse_storage_manifest_table' - - # Upload manifest as a table and get the SynID and manifest - manifest_synapse_table_id, manifest, table_manifest = self.upload_format_manifest_table( - dmge, manifest, datasetId, table_name, restrict = restrict_manifest, useSchemaLabel=useSchemaLabel,) - - # Iterate over manifest rows, create Synapse entities and store corresponding entity IDs in manifest if needed - # also set metadata for each synapse entity as Synapse annotations - for idx, row in manifest.iterrows(): - if not row["entityId"]: - # If not using entityIds, fill with manifest_table_id so - row["entityId"] = manifest_synapse_table_id - entityId = '' - else: - # get the entity id corresponding to this row - entityId = row["entityId"] - - # Load manifest to synapse as a CSV File - manifest_synapse_file_id = self.upload_manifest_file(manifest, metadataManifestPath, datasetId, restrict_manifest) - - # Get annotations for the file manifest. - manifest_annotations = self.format_manifest_annotations(manifest, manifest_synapse_file_id) - - self.syn.set_annotations(manifest_annotations) - - logger.info("Associated manifest file with dataset on Synapse.") - - # Update manifest Synapse table with new entity id column. - self.make_synapse_table( - table_to_load = table_manifest, - dataset_id = datasetId, - existingTableId = manifest_synapse_table_id, - table_name = table_name, - update_col = 'Uuid', - specify_schema = False, - ) - - # Get annotations for the table manifest - manifest_annotations = self.format_manifest_annotations(manifest, manifest_synapse_table_id) - self.syn.set_annotations(manifest_annotations) - return manifest_synapse_table_id - ''' - - def _read_manifest(self, metadataManifestPath: str) -> pd.DataFrame: - """Helper function to read in provided manifest as a pandas DataFrame for subsequent downstream processing. 
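        Example (illustrative; the path is hypothetical):

            manifest = self._read_manifest("manifests/synapse_storage_manifest_patient.csv")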
- Args: - metadataManifestPath (str): path where manifest is stored - Returns: - manifest(pd.DataFrame): Manifest loaded as a pandas dataframe - Raises: - FileNotFoundError: Manifest file does not exist at provided path. - """ - # read new manifest csv - try: - load_args = { - "dtype": "string", - } - manifest = load_df( - metadataManifestPath, - preserve_raw_input=False, - allow_na_values=False, - **load_args, - ) - except FileNotFoundError as err: - raise FileNotFoundError( - f"No manifest file was found at this path: {metadataManifestPath}" - ) from err - return manifest - - def _add_id_columns_to_manifest( - self, manifest: pd.DataFrame, dmge: DataModelGraphExplorer - ): - """Helper function to add id and entityId columns to the manifest if they do not already exist, Fill id values per row. - Args: - Manifest loaded as a pd.Dataframe - Returns (pd.DataFrame): - Manifest df with new Id and EntityId columns (and UUID values) if they were not already present. - """ - - # Add Id for table updates and fill. - if not col_in_dataframe("Id", manifest): - # See if schema has `Uuid` column specified - try: - uuid_col_in_schema = dmge.is_class_in_schema( - "Uuid" - ) or dmge.is_class_in_schema("uuid") - except KeyError: - uuid_col_in_schema = False - - # Rename `Uuid` column if it wasn't specified in the schema - if col_in_dataframe("Uuid", manifest) and not uuid_col_in_schema: - manifest.rename(columns={"Uuid": "Id"}, inplace=True) - # If no `Uuid` column exists or it is specified in the schema, create a new `Id` column - else: - manifest["Id"] = "" - - # Retrieve the ID column name (id, Id and ID) are treated the same. - id_col_name = [col for col in manifest.columns if col.lower() == "id"][0] - - # Check if values have been added to the Id coulumn, if not add a UUID so value in the row is not blank. - for idx, row in manifest.iterrows(): - if not row[id_col_name]: - gen_uuid = str(uuid.uuid4()) - row[id_col_name] = gen_uuid - manifest.loc[idx, id_col_name] = gen_uuid - - # add entityId as a column if not already there or - # fill any blanks with an empty string. - if not col_in_dataframe("entityId", manifest): - manifest["entityId"] = "" - else: - manifest["entityId"].fillna("", inplace=True) - - return manifest - - def _generate_table_name(self, manifest): - """Helper function to generate a table name for upload to synapse. - Args: - Manifest loaded as a pd.Dataframe - Returns: - table_name (str): Name of the table to load - component_name (str): Name of the manifest component (if applicable) - """ - # Create table name here. - if "Component" in manifest.columns: - component_name = manifest["Component"][0].lower() - table_name = component_name + "_synapse_storage_manifest_table" - else: - component_name = "" - table_name = "synapse_storage_manifest_table" - return table_name, component_name - - def _add_annotations( - self, - dmge, - row, - entityId: str, - hideBlanks: bool, - annotation_keys: str, - ): - """Helper function to format and add annotations to entities in Synapse. - Args: - dmge: DataModelGraphExplorer object, - row: current row of manifest being processed - entityId (str): synapseId of entity to add annotations to - hideBlanks: Boolean flag that does not upload annotation keys with blank values when true. Uploads Annotation keys with empty string values when false. - annotation_keys: (str) display_label/class_label(default), Determines labeling syle for annotation keys. 
class_label will format the display - name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain - display label formatting while ensuring the label is formatted properly for Synapse annotations. - Returns: - Annotations are added to entities in Synapse, no return. - """ - # Format annotations for Synapse - annos = self.format_row_annotations( - dmge, row, entityId, hideBlanks, annotation_keys - ) - - if annos: - # Store annotations for an entity folder - self.syn.set_annotations(annos) - return - - def _create_entity_id(self, idx, row, manifest, datasetId): - """Helper function to generate an entityId and add it to the appropriate row in the manifest. - Args: - row: current row of manifest being processed - manifest (pd.DataFrame): loaded df containing user supplied data. - datasetId (str): synapse ID of folder containing the dataset - - Returns: - manifest (pd.DataFrame): manifest with entityId added to the appropriate row - entityId (str): Generated Entity Id. - - """ - rowEntity = Folder(str(uuid.uuid4()), parent=datasetId) - rowEntity = self.syn.store(rowEntity) - entityId = rowEntity["id"] - row["entityId"] = entityId - manifest.loc[idx, "entityId"] = entityId - return manifest, entityId - - def add_annotations_to_entities_files( - self, - dmge, - manifest, - manifest_record_type: str, - datasetId: str, - hideBlanks: bool, - manifest_synapse_table_id="", - annotation_keys: str = "class_label", - ): - """Depending on upload type add Ids to entityId row. Add anotations to connected files. - Args: - dmge: DataModelGraphExplorer Object - manifest (pd.DataFrame): loaded df containing user supplied data. - manifest_record_type: valid values are 'entity', 'table' or 'both'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest or do both. - datasetId (str): synapse ID of folder containing the dataset - hideBlanks (bool): Default is false -Boolean flag that does not upload annotation keys with blank values when true. Uploads Annotation keys with empty string values when false. - manifest_synapse_table_id (str): Default is an empty string ''. - annotation_keys: (str) display_label/class_label(default), Determines labeling syle for annotation keys. class_label will format the display - name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain - display label formatting while ensuring the label is formatted properly for Synapse annotations. 
- Returns: - manifest (pd.DataFrame): modified to add entitiyId as appropriate - - """ - - # Expected behavior is to annotate files if `Filename` is present and if file_annotations_upload is set to True regardless of `-mrt` setting - if "filename" in [col.lower() for col in manifest.columns]: - # get current list of files and store as dataframe - dataset_files = self.getFilesInStorageDataset(datasetId) - files_and_entityIds = self._get_file_entityIds( - dataset_files=dataset_files, only_new_files=False - ) - file_df = pd.DataFrame(files_and_entityIds) - - # Merge dataframes to add entityIds - manifest = manifest.merge( - file_df, how="left", on="Filename", suffixes=["_x", None] - ).drop("entityId_x", axis=1) - - # Fill `entityId` for each row if missing and annotate entity as appropriate - for idx, row in manifest.iterrows(): - if not row["entityId"] and ( - manifest_record_type == "file_and_entities" - or manifest_record_type == "table_file_and_entities" - ): - manifest, entityId = self._create_entity_id( - idx, row, manifest, datasetId - ) - elif not row["entityId"] and manifest_record_type == "table_and_file": - # If not using entityIds, fill with manifest_table_id so - row["entityId"] = manifest_synapse_table_id - manifest.loc[idx, "entityId"] = manifest_synapse_table_id - entityId = "" - else: - # get the file id of the file to annotate, collected in above step. - entityId = row["entityId"] - - # Adding annotations to connected files. - if entityId: - self._add_annotations(dmge, row, entityId, hideBlanks, annotation_keys) - logger.info(f"Added annotations to entity: {entityId}") - return manifest - - def upload_manifest_as_table( - self, - dmge: DataModelGraphExplorer, - manifest: pd.DataFrame, - metadataManifestPath: str, - datasetId: str, - table_name: str, - component_name: str, - restrict: bool, - manifest_record_type: str, - hideBlanks: bool, - table_manipulation: str, - table_column_names: str, - annotation_keys: str, - file_annotations_upload: bool = True, - ): - """Upload manifest to Synapse as a table and csv. - Args: - dmge: DataModelGraphExplorer object - manifest (pd.DataFrame): loaded df containing user supplied data. - metadataManifestPath: path to csv containing a validated metadata manifest. - datasetId (str): synapse ID of folder containing the dataset - table_name (str): Generated to name the table being uploaded. - component_name (str): Name of the component manifest that is currently being uploaded. - restrict (bool): Flag for censored data. - manifest_record_type (str): valid values are 'entity', 'table' or 'both'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest or do both. - hideBlanks (bool): Default is False -Boolean flag that does not upload annotation keys with blank values when true. Uploads Annotation keys with empty string values when false. - table_malnipulation (str): Specify the way the manifest tables should be store as on Synapse when one with the same name already exists. Options are 'replace' and 'upsert'. - table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display - name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain - display label formatting. 
- annotation_keys: (str) display_label/class_label (default), Sets labeling syle for annotation keys. class_label will format the display - name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain - display label formatting while ensuring the label is formatted properly for Synapse annotations. - file_annotations_upload (bool): Default to True. If false, do not add annotations to files. - Return: - manifest_synapse_file_id: SynID of manifest csv uploaded to synapse. - """ - # Upload manifest as a table, get the ID and updated manifest. - manifest_synapse_table_id, manifest, table_manifest = self.uploadDB( - dmge=dmge, - manifest=manifest, - datasetId=datasetId, - table_name=table_name, - restrict=restrict, - table_manipulation=table_manipulation, - table_column_names=table_column_names, - ) - - if file_annotations_upload: - manifest = self.add_annotations_to_entities_files( - dmge, - manifest, - manifest_record_type, - datasetId, - hideBlanks, - manifest_synapse_table_id, - annotation_keys, - ) - # Load manifest to synapse as a CSV File - manifest_synapse_file_id = self.upload_manifest_file( - manifest, - metadataManifestPath, - datasetId, - restrict, - component_name=component_name, - ) - - # Set annotations for the file manifest. - manifest_annotations = self.format_manifest_annotations( - manifest, manifest_synapse_file_id - ) - self.syn.set_annotations(manifest_annotations) - logger.info("Associated manifest file with dataset on Synapse.") - - # Update manifest Synapse table with new entity id column. - manifest_synapse_table_id, manifest, table_manifest = self.uploadDB( - dmge=dmge, - manifest=manifest, - datasetId=datasetId, - table_name=table_name, - restrict=restrict, - table_manipulation="update", - table_column_names=table_column_names, - ) - - # Set annotations for the table manifest - manifest_annotations = self.format_manifest_annotations( - manifest, manifest_synapse_table_id - ) - self.syn.set_annotations(manifest_annotations) - return manifest_synapse_file_id - - def upload_manifest_as_csv( - self, - dmge, - manifest, - metadataManifestPath, - datasetId, - restrict, - manifest_record_type, - hideBlanks, - component_name, - annotation_keys: str, - file_annotations_upload: bool = True, - ): - """Upload manifest to Synapse as a csv only. - Args: - dmge: DataModelGraphExplorer object - manifest (pd.DataFrame): loaded df containing user supplied data. - metadataManifestPath: path to csv containing a validated metadata manifest. - datasetId (str): synapse ID of folder containing the dataset - restrict (bool): Flag for censored data. - manifest_record_type: valid values are 'entity', 'table' or 'both'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest or do both. - hideBlanks (bool): Default is False -Boolean flag that does not upload annotation keys with blank values when true. Uploads Annotation keys with empty string values when false. - annotation_keys: (str) display_label/class_label (default), Sets labeling syle for annotation keys. class_label will format the display - name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain - display label formatting while ensuring the label is formatted properly for Synapse annotations. - file_annotations_upload (bool): Default to True. If false, do not add annotations to files. 
- Return: - manifest_synapse_file_id (str): SynID of manifest csv uploaded to synapse. - """ - if file_annotations_upload: - manifest = self.add_annotations_to_entities_files( - dmge, - manifest, - manifest_record_type, - datasetId, - hideBlanks, - annotation_keys=annotation_keys, - ) - - # Load manifest to synapse as a CSV File - manifest_synapse_file_id = self.upload_manifest_file( - manifest, - metadataManifestPath, - datasetId, - restrict, - component_name=component_name, - ) - - # Set annotations for the file manifest. - manifest_annotations = self.format_manifest_annotations( - manifest, manifest_synapse_file_id - ) - self.syn.set_annotations(manifest_annotations) - - logger.info("Associated manifest file with dataset on Synapse.") - - return manifest_synapse_file_id - - def upload_manifest_combo( - self, - dmge, - manifest, - metadataManifestPath, - datasetId, - table_name, - component_name, - restrict, - manifest_record_type, - hideBlanks, - table_manipulation, - table_column_names: str, - annotation_keys: str, - file_annotations_upload: bool = True, - ): - """Upload manifest to Synapse as a table and CSV with entities. - Args: - dmge: DataModelGraphExplorer object - manifest (pd.DataFrame): loaded df containing user supplied data. - metadataManifestPath: path to csv containing a validated metadata manifest. - datasetId (str): synapse ID of folder containing the dataset - table_name (str): Generated to name the table being uploaded. - component_name (str): Name of the component manifest that is currently being uploaded. - restrict (bool): Flag for censored data. - manifest_record_type: valid values are 'entity', 'table' or 'both'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest or do both. - hideBlanks (bool): Default is False -Boolean flag that does not upload annotation keys with blank values when true. Uploads Annotation keys with empty string values when false. - table_malnipulation (str): Specify the way the manifest tables should be store as on Synapse when one with the same name already exists. Options are 'replace' and 'upsert'. - table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display - name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain - display label formatting. - annotation_keys: (str) display_label/class_label (default), Sets labeling syle for annotation keys. class_label will format the display - name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain - display label formatting while ensuring the label is formatted properly for Synapse annotations. - file_annotations_upload (bool): Default to True. If false, do not add annotations to files. - Return: - manifest_synapse_file_id (str): SynID of manifest csv uploaded to synapse. 
- """ - manifest_synapse_table_id, manifest, table_manifest = self.uploadDB( - dmge=dmge, - manifest=manifest, - datasetId=datasetId, - table_name=table_name, - restrict=restrict, - table_manipulation=table_manipulation, - table_column_names=table_column_names, - ) - - if file_annotations_upload: - manifest = self.add_annotations_to_entities_files( - dmge, - manifest, - manifest_record_type, - datasetId, - hideBlanks, - manifest_synapse_table_id, - annotation_keys=annotation_keys, - ) - - # Load manifest to synapse as a CSV File - manifest_synapse_file_id = self.upload_manifest_file( - manifest, metadataManifestPath, datasetId, restrict, component_name - ) - - # Set annotations for the file manifest. - manifest_annotations = self.format_manifest_annotations( - manifest, manifest_synapse_file_id - ) - self.syn.set_annotations(manifest_annotations) - logger.info("Associated manifest file with dataset on Synapse.") - - # Update manifest Synapse table with new entity id column. - manifest_synapse_table_id, manifest, table_manifest = self.uploadDB( - dmge=dmge, - manifest=manifest, - datasetId=datasetId, - table_name=table_name, - restrict=restrict, - table_manipulation="update", - table_column_names=table_column_names, - ) - - # Set annotations for the table manifest - manifest_annotations = self.format_manifest_annotations( - manifest, manifest_synapse_table_id - ) - self.syn.set_annotations(manifest_annotations) - return manifest_synapse_file_id - - def associateMetadataWithFiles( - self, - dmge: DataModelGraphExplorer, - metadataManifestPath: str, - datasetId: str, - manifest_record_type: str = "table_file_and_entities", - hideBlanks: bool = False, - restrict_manifest=False, - table_manipulation: str = "replace", - table_column_names: str = "class_label", - annotation_keys: str = "class_label", - file_annotations_upload: bool = True, - ) -> str: - """Associate metadata with files in a storage dataset already on Synapse. - Upload metadataManifest in the storage dataset folder on Synapse as well. Return synapseId of the uploaded manifest file. - - If this is a new manifest there could be no Synapse entities associated with the rows of this manifest - this may be due to data type (e.g. clinical data) being tabular - and not requiring files; to utilize uniform interfaces downstream - (i.e. fileviews), a Synapse entity (a folder) is created for each row - and an entity column is added to the manifest containing the resulting - entity IDs; a table is also created at present as an additional interface - for downstream query and interaction with the data. - - Args: - dmge: DataModelGraphExplorer Object - metadataManifestPath: path to csv containing a validated metadata manifest. - The manifest should include a column entityId containing synapse IDs of files/entities to be associated with metadata, if that is applicable to the dataset type. - Some datasets, e.g. clinical data, do not contain file id's, but data is stored in a table: one row per item. - In this case, the system creates a file on Synapse for each row in the table (e.g. patient, biospecimen) and associates the columnset data as metadata/annotations to his file. - datasetId: synapse ID of folder containing the dataset - manifest_record_type: Default value is 'table_file_and_entities'. valid values are 'file_only', 'file_and_entities', 'table_and_file' or 'table_file_and_entities'. 
'file_and_entities' will store the manifest as a csv and create Synapse files for each row in the manifest.'table_and_file' will store the manifest as a table and a csv on Synapse. 'file_only' will store the manifest as a csv only on Synapse. 'table_file_and_entities' will perform the options file_with_entites and table in combination. - hideBlanks: Default is false. Boolean flag that does not upload annotation keys with blank values when true. Uploads Annotation keys with empty string values when false. - restrict_manifest (bool): Default is false. Flag for censored data. - table_malnipulation (str): Default is 'replace'. Specify the way the manifest tables should be store as on Synapse when one with the same name already exists. Options are 'replace' and 'upsert'. - table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display - name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain - display label formatting. - annotation_keys: (str) display_label/class_label (default), Sets labeling syle for annotation keys. class_label will format the display - name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain - display label formatting while ensuring the label is formatted properly for Synapse annotations. - Returns: - manifest_synapse_file_id: SynID of manifest csv uploaded to synapse. - """ - # Read new manifest CSV: - manifest = self._read_manifest(metadataManifestPath) - manifest = self._add_id_columns_to_manifest(manifest, dmge) - - table_name, component_name = self._generate_table_name(manifest) - - # Upload manifest to synapse based on user input (manifest_record_type) - if manifest_record_type == "file_only": - manifest_synapse_file_id = self.upload_manifest_as_csv( - dmge, - manifest, - metadataManifestPath, - datasetId=datasetId, - restrict=restrict_manifest, - hideBlanks=hideBlanks, - manifest_record_type=manifest_record_type, - component_name=component_name, - annotation_keys=annotation_keys, - file_annotations_upload=file_annotations_upload, - ) - elif manifest_record_type == "table_and_file": - manifest_synapse_file_id = self.upload_manifest_as_table( - dmge, - manifest, - metadataManifestPath, - datasetId=datasetId, - table_name=table_name, - component_name=component_name, - restrict=restrict_manifest, - hideBlanks=hideBlanks, - manifest_record_type=manifest_record_type, - table_manipulation=table_manipulation, - table_column_names=table_column_names, - annotation_keys=annotation_keys, - file_annotations_upload=file_annotations_upload, - ) - elif manifest_record_type == "file_and_entities": - manifest_synapse_file_id = self.upload_manifest_as_csv( - dmge, - manifest, - metadataManifestPath, - datasetId=datasetId, - restrict=restrict_manifest, - hideBlanks=hideBlanks, - manifest_record_type=manifest_record_type, - component_name=component_name, - annotation_keys=annotation_keys, - file_annotations_upload=file_annotations_upload, - ) - elif manifest_record_type == "table_file_and_entities": - manifest_synapse_file_id = self.upload_manifest_combo( - dmge, - manifest, - metadataManifestPath, - datasetId=datasetId, - table_name=table_name, - component_name=component_name, - restrict=restrict_manifest, - hideBlanks=hideBlanks, - manifest_record_type=manifest_record_type, - 
table_manipulation=table_manipulation, - table_column_names=table_column_names, - annotation_keys=annotation_keys, - file_annotations_upload=file_annotations_upload, - ) - else: - raise ValueError("Please enter a valid manifest_record_type.") - return manifest_synapse_file_id - - def getTableAnnotations(self, table_id: str): - """Generate dictionary of annotations for the given Synapse file. - Synapse returns all custom annotations as lists since they - can contain multiple values. In all cases, the values will - be converted into strings and concatenated with ", ". - - Args: - fileId (str): Synapse ID for dataset file. - - Returns: - dict: Annotations as comma-separated strings. - """ - try: - entity = self.syn.get(table_id, downloadFile=False) - is_table = entity.concreteType.endswith(".TableEntity") - annotations_raw = entity.annotations - except SynapseHTTPError: - # If an error occurs with retrieving entity, skip it - # This could be caused by a temporary file view that - # was deleted since its ID was retrieved - is_file, is_table = False, False - - # Skip anything that isn't a file or folder - if not (is_table): - return None - - annotations = self.getEntityAnnotations(table_id, entity, annotations_raw) - - return annotations - - def getFileAnnotations(self, fileId: str) -> Dict[str, str]: - """Generate dictionary of annotations for the given Synapse file. - Synapse returns all custom annotations as lists since they - can contain multiple values. In all cases, the values will - be converted into strings and concatenated with ", ". - - Args: - fileId (str): Synapse ID for dataset file. - - Returns: - dict: Annotations as comma-separated strings. - """ - - # Get entity metadata, including annotations - try: - entity = self.syn.get(fileId, downloadFile=False) - is_file = entity.concreteType.endswith(".FileEntity") - is_folder = entity.concreteType.endswith(".Folder") - annotations_raw = entity.annotations - except SynapseHTTPError: - # If an error occurs with retrieving entity, skip it - # This could be caused by a temporary file view that - # was deleted since its ID was retrieved - is_file, is_folder = False, False - - # Skip anything that isn't a file or folder - if not (is_file or is_folder): - return None - - annotations = self.getEntityAnnotations(fileId, entity, annotations_raw) - - return annotations - - def getEntityAnnotations(self, fileId, entity, annotations_raw): - # Extract annotations from their lists and stringify. For example: - # {'YearofBirth': [1980], 'author': ['bruno', 'milen', 'sujay']} - annotations = dict() - for key, vals in annotations_raw.items(): - if isinstance(vals, list) and len(vals) == 1: - annotations[key] = str(vals[0]) - else: - annotations[key] = ", ".join(str(v) for v in vals) - - # Add the file entity ID and eTag, which weren't lists - assert fileId == entity.id, ( - "For some reason, the Synapse ID in the response doesn't match" - "the Synapse ID sent in the request (via synapseclient)." - ) - annotations["entityId"] = fileId - annotations["eTag"] = entity.etag - - return annotations - - def getDatasetAnnotations( - self, datasetId: str, fill_na: bool = True, force_batch: bool = False - ) -> pd.DataFrame: - """Generate table for annotations across all files in given dataset. - - Args: - datasetId (str): Synapse ID for dataset folder. - fill_na (bool): Whether to replace missing values with - blank strings. 
- force_batch (bool): Whether to force the function to use - the batch mode, which uses a file view to retrieve - annotations for a given dataset. Default to False - unless there are more than 50 files in the dataset. - - Returns: - pd.DataFrame: Table of annotations. - """ - # Get all files in given dataset - dataset_files = self.getFilesInStorageDataset(datasetId) - - # if there are no dataset files, there are no annotations - # return None - if not dataset_files: - return pd.DataFrame() - - dataset_files_map = dict(dataset_files) - dataset_file_ids, _ = list(zip(*dataset_files)) - - # Get annotations for each file from Step 1 - # Batch mode - try_batch = len(dataset_files) >= 50 or force_batch - if try_batch: - try: - logger.info("Trying batch mode for retrieving Synapse annotations") - table = self.getDatasetAnnotationsBatch(datasetId, dataset_file_ids) - except (SynapseAuthenticationError, SynapseHTTPError): - logger.info( - f"Unable to create a temporary file view bound to {datasetId}. " - "Defaulting to slower iterative retrieval of annotations." - ) - # Default to the slower non-batch method - logger.info("Batch mode failed (probably due to permission error)") - try_batch = False - - # Non-batch mode - if not try_batch: - logger.info("Using slower (non-batch) sequential mode") - records = [self.getFileAnnotations(i) for i in dataset_file_ids] - # Remove any annotations for non-file/folders (stored as None) - records = filter(None, records) - table = pd.DataFrame.from_records(records) - - # Add filenames for the files that "survived" annotation retrieval - filenames = [dataset_files_map[i] for i in table["entityId"]] - - if "Filename" not in table.columns: - table.insert(0, "Filename", filenames) - - # Ensure that entityId and eTag are at the end - entity_ids = table.pop("entityId") - etags = table.pop("eTag") - table.insert(len(table.columns), "entityId", entity_ids) - table.insert(len(table.columns), "eTag", etags) - - # Missing values are filled in with empty strings for Google Sheets - if fill_na: - table.fillna("", inplace=True) - - # Force all values as strings - return table.astype(str) - - def raise_final_error(retry_state): - return retry_state.outcome.result() - - def checkIfinAssetView(self, syn_id) -> str: - # get data in administrative fileview for this pipeline - assetViewTable = self.getStorageFileviewTable() - all_files = list(assetViewTable["id"]) - if syn_id in all_files: - return True - else: - return False - - @retry( - stop=stop_after_attempt(5), - wait=wait_chain( - *[wait_fixed(10) for i in range(2)] - + [wait_fixed(15) for i in range(2)] - + [wait_fixed(20)] - ), - retry=retry_if_exception_type(LookupError), - retry_error_callback=raise_final_error, - ) - def getDatasetProject(self, datasetId: str) -> str: - """Get parent project for a given dataset ID. - - Args: - datasetId (str): Synapse entity ID (folder or project). - - Raises: - ValueError: Raised if Synapse ID cannot be retrieved - by the user or if it doesn't appear in the file view. - - Returns: - str: The Synapse ID for the parent project. 
- """ - - # Subset main file view - dataset_index = self.storageFileviewTable["id"] == datasetId - dataset_row = self.storageFileviewTable[dataset_index] - - # re-query if no datasets found - if dataset_row.empty: - sleep(5) - self._query_fileview() - # Subset main file view - dataset_index = self.storageFileviewTable["id"] == datasetId - dataset_row = self.storageFileviewTable[dataset_index] - - # Return `projectId` for given row if only one found - if len(dataset_row) == 1: - dataset_project = dataset_row["projectId"].values[0] - return dataset_project - - # Otherwise, check if already project itself - try: - syn_object = self.syn.get(datasetId) - if syn_object.properties["concreteType"].endswith("Project"): - return datasetId - except SynapseHTTPError: - raise PermissionError( - f"The given dataset ({datasetId}) isn't accessible with this " - "user. This might be caused by a typo in the dataset Synapse ID." - ) - - # If not, then assume dataset not in file view - raise LookupError( - f"The given dataset ({datasetId}) doesn't appear in the " - f"configured file view ({self.storageFileview}). This might " - "mean that the file view's scope needs to be updated." - ) - - def getDatasetAnnotationsBatch( - self, datasetId: str, dataset_file_ids: Sequence[str] = None - ) -> pd.DataFrame: - """Generate table for annotations across all files in given dataset. - This function uses a temporary file view to generate a table - instead of iteratively querying for individual entity annotations. - This function is expected to run much faster than - `self.getDatasetAnnotationsBatch` on large datasets. - - Args: - datasetId (str): Synapse ID for dataset folder. - dataset_file_ids (Sequence[str]): List of Synapse IDs - for dataset files/folders used to subset the table. - - Returns: - pd.DataFrame: Table of annotations. - """ - # Create data frame from annotations file view - with DatasetFileView(datasetId, self.syn) as fileview: - table = fileview.query() - - if dataset_file_ids: - table = table.loc[table.index.intersection(dataset_file_ids)] - - table = table.reset_index(drop=True) - - return table - - def _get_table_schema_by_cname(self, table_schema): - # assume no duplicate column names in the table - table_schema_by_cname = {} - - for col_record in table_schema: - # TODO clean up dictionary for compactness (e.g. remove redundant 'name' key) - table_schema_by_cname[col_record["name"]] = col_record - - return table_schema_by_cname - - -class TableOperations: - """ - Object to hold functions for various table operations specific to the Synapse Asset Store. 
- - Currently implement operations are: - createTable: upload a manifest as a new table when none exist - replaceTable: replace a metadata in a table from one manifest with metadata from another manifest - updateTable: add a column to a table that already exists on synapse - - Operations currently in development are: - upsertTable: add metadata from a manifest to an existing table that contains metadata from another manifest - """ - - def __init__( - self, - synStore: SynapseStorage, - tableToLoad: pd.DataFrame = None, - tableName: str = None, - datasetId: str = None, - existingTableId: str = None, - restrict: bool = False, - ): - """ - Class governing table operations (creation, replacement, upserts, updates) in schematic - - tableToLoad: manifest formatted appropriately for the table - tableName: name of the table to be uploaded - datasetId: synID of the dataset for the manifest - existingTableId: synId of the table currently exising on synapse (if there is one) - restrict: bool, whether or not the manifest contains sensitive data that will need additional access restrictions - - """ - self.synStore = synStore - self.tableToLoad = tableToLoad - self.tableName = tableName - self.datasetId = datasetId - self.existingTableId = existingTableId - self.restrict = restrict - - def createTable( - self, - columnTypeDict: dict = None, - specifySchema: bool = True, - ): - """ - Method to create a table from a metadata manifest and upload it to synapse - - Args: - columnTypeDict: dictionary schema for table columns: type, size, etc - specifySchema: to specify a specific schema for the table format - - Returns: - table.schema.id: synID of the newly created table - """ - - datasetEntity = self.synStore.syn.get(self.datasetId, downloadFile=False) - datasetName = datasetEntity.name - table_schema_by_cname = self.synStore._get_table_schema_by_cname(columnTypeDict) - - if not self.tableName: - self.tableName = datasetName + "table" - datasetParentProject = self.synStore.getDatasetProject(self.datasetId) - if specifySchema: - if columnTypeDict == {}: - logger.error("Did not provide a columnTypeDict.") - # create list of columns: - cols = [] - for col in self.tableToLoad.columns: - if col in table_schema_by_cname: - col_type = table_schema_by_cname[col]["columnType"] - max_size = ( - table_schema_by_cname[col]["maximumSize"] - if "maximumSize" in table_schema_by_cname[col].keys() - else 100 - ) - max_list_len = 250 - if max_size and max_list_len: - cols.append( - Column( - name=col, - columnType=col_type, - maximumSize=max_size, - maximumListLength=max_list_len, - ) - ) - elif max_size: - cols.append( - Column(name=col, columnType=col_type, maximumSize=max_size) - ) - else: - cols.append(Column(name=col, columnType=col_type)) - else: - # TODO add warning that the given col was not found and it's max size is set to 100 - cols.append(Column(name=col, columnType="STRING", maximumSize=100)) - schema = Schema( - name=self.tableName, columns=cols, parent=datasetParentProject - ) - table = Table(schema, self.tableToLoad) - table = self.synStore.syn.store(table, isRestricted=self.restrict) - return table.schema.id - else: - # For just uploading the tables to synapse using default - # column types. 
- table = build_table(self.tableName, datasetParentProject, self.tableToLoad) - table = self.synStore.syn.store(table, isRestricted=self.restrict) - return table.schema.id - - def replaceTable( - self, - specifySchema: bool = True, - columnTypeDict: dict = None, - ): - """ - Method to replace an existing table on synapse with metadata from a new manifest - - Args: - specifySchema: to infer a schema for the table format - columnTypeDict: dictionary schema for table columns: type, size, etc - - Returns: - existingTableId: synID of the already existing table that had its metadata replaced - """ - datasetEntity = self.synStore.syn.get(self.datasetId, downloadFile=False) - datasetName = datasetEntity.name - table_schema_by_cname = self.synStore._get_table_schema_by_cname(columnTypeDict) - existing_table, existing_results = self.synStore.get_synapse_table( - self.existingTableId - ) - # remove rows - self.synStore.syn.delete(existing_results) - # wait for row deletion to finish on synapse before getting empty table - sleep(10) - - # removes all current columns - current_table = self.synStore.syn.get(self.existingTableId) - current_columns = self.synStore.syn.getTableColumns(current_table) - for col in current_columns: - current_table.removeColumn(col) - - if not self.tableName: - self.tableName = datasetName + "table" - - # Process columns according to manifest entries - table_schema_by_cname = self.synStore._get_table_schema_by_cname(columnTypeDict) - datasetParentProject = self.synStore.getDatasetProject(self.datasetId) - if specifySchema: - if columnTypeDict == {}: - logger.error("Did not provide a columnTypeDict.") - # create list of columns: - cols = [] - - for col in self.tableToLoad.columns: - if col in table_schema_by_cname: - col_type = table_schema_by_cname[col]["columnType"] - max_size = ( - table_schema_by_cname[col]["maximumSize"] - if "maximumSize" in table_schema_by_cname[col].keys() - else 100 - ) - max_list_len = 250 - if max_size and max_list_len: - cols.append( - Column( - name=col, - columnType=col_type, - maximumSize=max_size, - maximumListLength=max_list_len, - ) - ) - elif max_size: - cols.append( - Column(name=col, columnType=col_type, maximumSize=max_size) - ) - else: - cols.append(Column(name=col, columnType=col_type)) - else: - # TODO add warning that the given col was not found and it's max size is set to 100 - cols.append(Column(name=col, columnType="STRING", maximumSize=100)) - - # adds new columns to schema - for col in cols: - current_table.addColumn(col) - self.synStore.syn.store(current_table, isRestricted=self.restrict) - - # wait for synapse store to finish - sleep(1) - - # build schema and table from columns and store with necessary restrictions - schema = Schema( - name=self.tableName, columns=cols, parent=datasetParentProject - ) - schema.id = self.existingTableId - table = Table(schema, self.tableToLoad, etag=existing_results.etag) - table = self.synStore.syn.store(table, isRestricted=self.restrict) - else: - logging.error("Must specify a schema for table replacements") - - # remove system metadata from manifest - existing_table.drop(columns=["ROW_ID", "ROW_VERSION"], inplace=True) - return self.existingTableId - - def _get_auth_token( - self, - ): - authtoken = None - - # Get access token from environment variable if available - # Primarily useful for testing environments, with other possible usefulness for containers - env_access_token = os.getenv("SYNAPSE_ACCESS_TOKEN") - if env_access_token: - authtoken = env_access_token - return authtoken - - # Get 
token from authorization header - # Primarily useful for API endpoint functionality - if "Authorization" in self.synStore.syn.default_headers: - authtoken = self.synStore.syn.default_headers["Authorization"].split( - "Bearer " - )[-1] - return authtoken - - # retrive credentials from synapse object - # Primarily useful for local users, could only be stored here when a .synapseConfig file is used, but including to be safe - synapse_object_creds = self.synStore.syn.credentials - if hasattr(synapse_object_creds, "_token"): - authtoken = synapse_object_creds.secret - - # Try getting creds from .synapseConfig file if it exists - # Primarily useful for local users. Seems to correlate with credentials stored in synaspe object when logged in - if os.path.exists(CONFIG.synapse_configuration_path): - config = self.synStore.syn.getConfigFile(CONFIG.synapse_configuration_path) - - # check which credentials are provided in file - if config.has_option("authentication", "authtoken"): - authtoken = config.get("authentication", "authtoken") - - # raise error if required credentials are not found - if not authtoken: - raise NameError( - "authtoken credentials could not be found in the environment, synapse object, or the .synapseConfig file" - ) - - return authtoken - - def upsertTable(self, dmge: DataModelGraphExplorer): - """ - Method to upsert rows from a new manifest into an existing table on synapse - For upsert functionality to work, primary keys must follow the naming convention of _id - `-tm upsert` should be used for initial table uploads if users intend to upsert into them at a later time; using 'upsert' at creation will generate the metadata necessary for upsert functionality. - Currently it is required to use -dl/--use_display_label with table upserts. - - - Args: - dmge: DataModelGraphExplorer instance - - Returns: - existingTableId: synID of the already existing table that had its metadata replaced - """ - - authtoken = self._get_auth_token() - - synapseDB = SynapseDatabase( - auth_token=authtoken, - project_id=self.synStore.getDatasetProject(self.datasetId), - ) - - try: - # Try performing upsert - synapseDB.upsert_table_rows( - table_name=self.tableName, data=self.tableToLoad - ) - except SynapseHTTPError as ex: - # If error is raised because Table has old `Uuid` column and not new `Id` column, then handle and re-attempt upload - if "Id is not a valid column name or id" in str(ex): - self._update_table_uuid_column(dmge) - synapseDB.upsert_table_rows( - table_name=self.tableName, data=self.tableToLoad - ) - # Raise if other error - else: - raise ex - - return self.existingTableId - - def _update_table_uuid_column( - self, - dmge: DataModelGraphExplorer, - ) -> None: - """Removes the `Uuid` column when present, and relpaces with an `Id` column - Used to enable backwards compatability for manifests using the old `Uuid` convention - - Args: - dmge: DataModelGraphExplorer instance - - Returns: - None - """ - - # Get the columns of the schema - schema = self.synStore.syn.get(self.existingTableId) - cols = self.synStore.syn.getTableColumns(schema) - - # Iterate through columns until `Uuid` column is found - for col in cols: - if col.name.lower() == "uuid": - # See if schema has `Uuid` column specified - try: - uuid_col_in_schema = dmge.is_class_in_schema(col.name) - except KeyError: - uuid_col_in_schema = False - - # If there is, then create a new `Id` column from scratch - if uuid_col_in_schema: - new_col = Column(columnType="STRING", maximumSize=64, name="Id") - schema.addColumn(new_col) - schema 
= self.synStore.syn.store(schema) - # If there is not, then use the old `Uuid` column as a basis for the new `Id` column - else: - # Build ColumnModel that will be used for new column - id_column = Column( - name="Id", - columnType="STRING", - maximumSize=64, - defaultValue=None, - maximumListLength=1, - ) - new_col_response = self.synStore.syn.store(id_column) - - # Define columnChange body - columnChangeDict = { - "concreteType": "org.sagebionetworks.repo.model.table.TableSchemaChangeRequest", - "entityId": self.existingTableId, - "changes": [ - { - "oldColumnId": col["id"], - "newColumnId": new_col_response["id"], - } - ], - } - - self.synStore.syn._async_table_update( - table=self.existingTableId, - changes=[columnChangeDict], - wait=False, - ) - break - - return - - def updateTable( - self, - update_col: str = "Id", - ): - """ - Method to update an existing table with a new column - - Args: - updateCol: column to index the old and new tables on - - Returns: - existingTableId: synID of the already existing table that had its metadata replaced - """ - existing_table, existing_results = self.synStore.get_synapse_table( - self.existingTableId - ) - - self.tableToLoad = update_df(existing_table, self.tableToLoad, update_col) - # store table with existing etag data and impose restrictions as appropriate - self.synStore.syn.store( - Table(self.existingTableId, self.tableToLoad, etag=existing_results.etag), - isRestricted=self.restrict, - ) - - return self.existingTableId - - -class DatasetFileView: - """Helper class to create temporary dataset file views. - This class can be used in conjunction with a 'with' statement. - This will ensure that the file view is deleted automatically. - See SynapseStorage.getDatasetAnnotationsBatch for example usage. - """ - - def __init__( - self, - datasetId: str, - synapse: Synapse, - name: str = None, - temporary: bool = True, - parentId: str = None, - ) -> None: - """Create a file view scoped to a dataset folder. - - Args: - datasetId (str): Synapse ID for a dataset folder/project. - synapse (Synapse): Used for Synapse requests. - name (str): Name of the file view (temporary or not). - temporary (bool): Whether to delete the file view on exit - of either a 'with' statement or Python entirely. - parentId (str, optional): Synapse ID specifying where to - store the file view. Defaults to datasetId. - """ - - self.datasetId = datasetId - self.synapse = synapse - self.is_temporary = temporary - - if name is None: - self.name = f"schematic annotation file view for {self.datasetId}" - - if self.is_temporary: - uid = secrets.token_urlsafe(5) - self.name = f"{self.name} - UID {uid}" - - # TODO: Allow a DCC admin to configure a "universal parent" - # Such as a Synapse project writeable by everyone. 
- self.parentId = datasetId if parentId is None else parentId - - # TODO: Create local sharing setting to hide from everyone else - view_schema = EntityViewSchema( - name=self.name, - parent=self.parentId, - scopes=self.datasetId, - includeEntityTypes=[EntityViewType.FILE, EntityViewType.FOLDER], - addDefaultViewColumns=False, - addAnnotationColumns=True, - ) - - # TODO: Handle failure due to insufficient permissions by - # creating a temporary new project to store view - self.view_schema = self.synapse.store(view_schema) - - # These are filled in after calling `self.query()` - self.results = None - self.table = None - - # Ensure deletion of the file view (last resort) - if self.is_temporary: - atexit.register(self.delete) - - def __enter__(self): - """Return file view when entering 'with' statement.""" - return self - - def __exit__(self, exc_type, exc_value, traceback): - """Delete file view when exiting 'with' statement.""" - if self.is_temporary: - self.delete() - - def delete(self): - """Delete the file view on Synapse without deleting local table.""" - if self.view_schema is not None: - self.synapse.delete(self.view_schema) - self.view_schema = None - - def query(self, tidy=True, force=False): - """Retrieve file view as a data frame (raw format sans index).""" - if self.table is None or force: - fileview_id = self.view_schema["id"] - self.results = self.synapse.tableQuery(f"select * from {fileview_id}") - self.table = self.results.asDataFrame(rowIdAndVersionInIndex=False) - if tidy: - self.tidy_table() - return self.table - - def tidy_table(self): - """Convert raw file view data frame into more usable format.""" - assert self.table is not None, "Must call `self.query()` first." - self._fix_default_columns() - self._fix_list_columns() - self._fix_int_columns() - return self.table - - def _fix_default_columns(self): - """Rename default columns to match schematic expectations.""" - - # Drop ROW_VERSION column if present - if "ROW_VERSION" in self.table: - del self.table["ROW_VERSION"] - - # Rename id column to entityId and set as data frame index - if "ROW_ID" in self.table: - self.table["entityId"] = "syn" + self.table["ROW_ID"].astype(str) - self.table = self.table.set_index("entityId", drop=False) - del self.table["ROW_ID"] - - # Rename ROW_ETAG column to eTag and place at end of data frame - if "ROW_ETAG" in self.table: - row_etags = self.table.pop("ROW_ETAG") - self.table.insert(len(self.table.columns), "eTag", row_etags) - - return self.table - - def _get_columns_of_type(self, types): - """Helper function to get list of columns of a given type(s).""" - matching_columns = [] - for header in self.results.headers: - if header.columnType in types: - matching_columns.append(header.name) - return matching_columns - - def _fix_list_columns(self): - """Fix formatting of list-columns.""" - list_types = {"STRING_LIST", "INTEGER_LIST", "BOOLEAN_LIST"} - list_columns = self._get_columns_of_type(list_types) - for col in list_columns: - self.table[col] = self.table[col].apply(lambda x: ", ".join(x)) - return self.table - - def _fix_int_columns(self): - """Ensure that integer-columns are actually integers.""" - int_columns = self._get_columns_of_type({"INTEGER"}) - for col in int_columns: - # Coercing to string because NaN is a floating point value - # and cannot exist alongside integers in a column - to_int_fn = lambda x: "" if np.isnan(x) else str(int(x)) - self.table[col] = self.table[col].apply(to_int_fn) - return self.table +"""Synapse storage class""" + +import atexit +from copy import 
deepcopy +from dataclasses import dataclass +import logging +import numpy as np +import pandas as pd +import os +import re +import secrets +import shutil +import synapseclient +import uuid # used to generate unique names for entities + +from tenacity import ( + retry, + stop_after_attempt, + wait_chain, + wait_fixed, + retry_if_exception_type, +) +from time import sleep + +# allows specifying explicit variable types +from typing import Dict, List, Tuple, Sequence, Union, Optional + +from synapseclient import ( + Synapse, + File, + Folder, + Table, + Schema, + EntityViewSchema, + EntityViewType, + Column, + as_table_columns, +) +from synapseclient.entity import File +from synapseclient.table import CsvFileTable, build_table, Schema +from synapseclient.core.exceptions import ( + SynapseHTTPError, + SynapseAuthenticationError, + SynapseUnmetAccessRestrictions, + SynapseHTTPError, +) +import synapseutils + +from schematic_db.rdb.synapse_database import SynapseDatabase + +from schematic.schemas.data_model_graph import DataModelGraphExplorer + +from schematic.utils.df_utils import update_df, load_df, col_in_dataframe +from schematic.utils.validate_utils import comma_separated_list_regex, rule_in_rule_list + +# entity_type_mapping, get_dir_size, create_temp_folder, check_synapse_cache_size, and clear_synapse_cache functions are used for AWS deployment +# Please do not remove these import statements +from schematic.utils.general import ( + entity_type_mapping, + get_dir_size, + create_temp_folder, + check_synapse_cache_size, + clear_synapse_cache, +) + +from schematic.utils.schema_utils import get_class_label_from_display_name + +from schematic.store.base import BaseStorage +from schematic.exceptions import AccessCredentialsError +from schematic.configuration.configuration import CONFIG + +logger = logging.getLogger("Synapse storage") + + +@dataclass +class ManifestDownload(object): + """ + syn: an object of type synapseclient. + manifest_id: id of a manifest + """ + + syn: synapseclient.Synapse + manifest_id: str + + def _download_manifest_to_folder(self) -> File: + """ + try downloading a manifest to local cache or a given folder + manifest + Return: + manifest_data: A Synapse file entity of the downloaded manifest + """ + if "SECRETS_MANAGER_SECRETS" in os.environ: + temporary_manifest_storage = "/var/tmp/temp_manifest_download" + # clear out all the existing manifests + if os.path.exists(temporary_manifest_storage): + shutil.rmtree(temporary_manifest_storage) + # create a new directory to store manifest + if not os.path.exists(temporary_manifest_storage): + os.mkdir(temporary_manifest_storage) + # create temporary folders for storing manifests + download_location = create_temp_folder(temporary_manifest_storage) + else: + download_location = CONFIG.manifest_folder + manifest_data = self.syn.get( + self.manifest_id, + downloadLocation=download_location, + ifcollision="overwrite.local", + ) + return manifest_data + + def _entity_type_checking(self) -> str: + """ + check the entity type of the id that needs to be downloaded + Return: + if the entity type is wrong, raise an error + """ + # check the type of entity + entity_type = entity_type_mapping(self.syn, self.manifest_id) + if entity_type != "file": + logger.error( + f"You are using entity type: {entity_type}. Please provide a file ID" + ) + + @staticmethod + def download_manifest( + self, newManifestName: str = "", manifest_df: pd.DataFrame = pd.DataFrame() + ) -> Union[str, File]: + """ + Download a manifest based on a given manifest id. 
+ Args: + newManifestName(optional): new name of a manifest that gets downloaded. + manifest_df(optional): a dataframe containing name and id of manifests in a given asset view + Return: + manifest_data: synapse entity file object + """ + + # enables retrying if user does not have access to uncensored manifest + # pass synID to synapseclient.Synapse.get() method to download (and overwrite) file to a location + manifest_data = "" + + # check entity type + self._entity_type_checking() + + # download a manifest + try: + manifest_data = self._download_manifest_to_folder() + except (SynapseUnmetAccessRestrictions, SynapseAuthenticationError): + # if there's an error getting an uncensored manifest, try getting the censored manifest + if not manifest_df.empty: + censored_regex = re.compile(".*censored.*") + censored = manifest_df["name"].str.contains(censored_regex) + new_manifest_id = manifest_df[censored]["id"][0] + self.manifest_id = new_manifest_id + try: + manifest_data = self._download_manifest_to_folder() + except ( + SynapseUnmetAccessRestrictions, + SynapseAuthenticationError, + ) as e: + raise PermissionError( + "You don't have access to censored and uncensored manifests in this dataset." + ) from e + else: + logger.error( + f"You don't have access to the requested resource: {self.manifest_id}" + ) + + if newManifestName and os.path.exists(manifest_data.get("path")): + # Rename the file we just made to the new name + new_manifest_filename = newManifestName + ".csv" + + # get location of existing manifest. The manifest that will be renamed should live in the same folder as existing manifest. + parent_folder = os.path.dirname(manifest_data.get("path")) + + new_manifest_path_name = os.path.join(parent_folder, new_manifest_filename) + os.rename(manifest_data["path"], new_manifest_path_name) + + # Update file names/paths in manifest_data + manifest_data["name"] = new_manifest_filename + manifest_data["filename"] = new_manifest_filename + manifest_data["path"] = new_manifest_path_name + return manifest_data + + +class SynapseStorage(BaseStorage): + """Implementation of Storage interface for datasets/files stored on Synapse. + Provides utilities to list files in a specific project; update files annotations, create fileviews, etc. + + TODO: Need to define the interface and rename and/or refactor some of the methods below. + """ + + def __init__( + self, + token: Optional[str] = None, # optional parameter retrieved from browser cookie + access_token: Optional[str] = None, + project_scope: Optional[list] = None, + synapse_cache_path: Optional[str] = None, + ) -> None: + """Initializes a SynapseStorage object. + + Args: + token (Optional[str], optional): + Optional token parameter as found in browser cookie upon login to synapse. + Defaults to None. + access_token (Optional[list], optional): + Optional access token (personal or oauth). + Defaults to None. + project_scope (Optional[list], optional): Defaults to None. + synapse_cache_path (Optional[str], optional): + Location of synapse cache. + Defaults to None. + """ + self.syn = self.login(synapse_cache_path, token, access_token) + self.project_scope = project_scope + self.storageFileview = CONFIG.synapse_master_fileview_id + self.manifest = CONFIG.synapse_manifest_basename + self.root_synapse_cache = self.syn.cache.cache_root_dir + self._query_fileview() + + def _purge_synapse_cache( + self, maximum_storage_allowed_cache_gb: int = 1, minute_buffer: int = 15 + ) -> None: + """ + Purge synapse cache if it exceeds a certain size. Default to 1GB. 
+ Args: + maximum_storage_allowed_cache_gb (int): the maximum storage allowed + before purging cache. Default is 1 GB. + minute_buffer (int): All files created this amount of time or older will be deleted + """ + # try clearing the cache + # scan a directory and check size of files + if os.path.exists(self.root_synapse_cache): + maximum_storage_allowed_cache_bytes = maximum_storage_allowed_cache_gb * ( + 1024**3 + ) + nbytes = get_dir_size(self.root_synapse_cache) + dir_size_bytes = check_synapse_cache_size(directory=self.root_synapse_cache) + # if 1 GB has already been taken, purge cache before 15 min + if dir_size_bytes >= maximum_storage_allowed_cache_bytes: + num_of_deleted_files = clear_synapse_cache( + self.syn.cache, minutes=minute_buffer + ) + logger.info( + f"{num_of_deleted_files} files have been deleted from {self.root_synapse_cache}" + ) + else: + # on AWS, OS takes around 14-17% of our ephemeral storage (20GiB) + # instead of guessing how much space that we left, print out .synapseCache here + logger.info(f"the total size of .synapseCache is: {nbytes} bytes") + + def _query_fileview(self): + self._purge_synapse_cache() + try: + self.storageFileview = CONFIG.synapse_master_fileview_id + self.manifest = CONFIG.synapse_manifest_basename + if self.project_scope: + self.storageFileviewTable = self.syn.tableQuery( + f"SELECT * FROM {self.storageFileview} WHERE projectId IN {tuple(self.project_scope + [''])}" + ).asDataFrame() + else: + # get data in administrative fileview for this pipeline + self.storageFileviewTable = self.syn.tableQuery( + "SELECT * FROM " + self.storageFileview + ).asDataFrame() + except SynapseHTTPError: + raise AccessCredentialsError(self.storageFileview) + + @staticmethod + def login( + synapse_cache_path: Optional[str] = None, + token: Optional[str] = None, + access_token: Optional[str] = None, + ) -> synapseclient.Synapse: + """Login to Synapse + + Args: + token (Optional[str], optional): A Synapse token. Defaults to None. + access_token (Optional[str], optional): A synapse access token. Defaults to None. + synapse_cache_path (Optional[str]): location of synapse cache + + Raises: + ValueError: If unable to login with token + ValueError: If unable to loging with access token + + Returns: + synapseclient.Synapse: A Synapse object that is logged in + """ + # If no token is provided, try retrieving access token from environment + if not token and not access_token: + access_token = os.getenv("SYNAPSE_ACCESS_TOKEN") + + # login using a token + if token: + syn = synapseclient.Synapse(cache_root_dir=synapse_cache_path) + try: + syn.login(sessionToken=token, silent=True) + except SynapseHTTPError as exc: + raise ValueError( + "Please make sure you are logged into synapse.org." + ) from exc + elif access_token: + try: + syn = synapseclient.Synapse(cache_root_dir=synapse_cache_path) + syn.default_headers["Authorization"] = f"Bearer {access_token}" + except SynapseHTTPError as exc: + raise ValueError( + "No access to resources. 
Please make sure that your token is correct" + ) from exc + else: + # login using synapse credentials provided by user in .synapseConfig (default) file + syn = synapseclient.Synapse( + configPath=CONFIG.synapse_configuration_path, + cache_root_dir=synapse_cache_path, + ) + syn.login(silent=True) + return syn + + def missing_entity_handler(method): + def wrapper(*args, **kwargs): + try: + return method(*args, **kwargs) + except SynapseHTTPError as ex: + str_message = str(ex).replace("\n", "") + if "trash" in str_message or "does not exist" in str_message: + logging.warning(str_message) + return None + else: + raise ex + + return wrapper + + def getStorageFileviewTable(self): + """Returns the storageFileviewTable obtained during initialization.""" + return self.storageFileviewTable + + def getPaginatedRestResults(self, currentUserId: str) -> Dict[str, str]: + """Gets the paginated results of the REST call to Synapse to check what projects the current user has access to. + + Args: + currentUserId: synapse id for the user whose projects we want to get. + + Returns: + A dictionary with a next page token and the results. + """ + all_results = self.syn.restGET( + "/projects/user/{principalId}".format(principalId=currentUserId) + ) + + while ( + "nextPageToken" in all_results + ): # iterate over next page token in results while there is any + results_token = self.syn.restGET( + "/projects/user/{principalId}?nextPageToken={nextPageToken}".format( + principalId=currentUserId, + nextPageToken=all_results["nextPageToken"], + ) + ) + all_results["results"].extend(results_token["results"]) + + if "nextPageToken" in results_token: + all_results["nextPageToken"] = results_token["nextPageToken"] + else: + del all_results["nextPageToken"] + + return all_results + + def getStorageProjects(self, project_scope: List = None) -> list[tuple[str, str]]: + """Gets all storage projects the current user has access to, within the scope of the 'storageFileview' attribute. + + Returns: + A list of storage projects the current user has access to; the list consists of tuples (projectId, projectName). 
+ """ + + # get the set of all storage Synapse project accessible for this pipeline + storageProjects = self.storageFileviewTable["projectId"].unique() + + # get the set of storage Synapse project accessible for this user + + # get current user name and user ID + currentUser = self.syn.getUserProfile() + currentUserName = currentUser.userName + currentUserId = currentUser.ownerId + + # get a list of projects from Synapse + currentUserProjects = self.getPaginatedRestResults(currentUserId) + + # prune results json filtering project id + currentUserProjects = [ + currentUserProject.get("id") + for currentUserProject in currentUserProjects["results"] + ] + + # find set of user projects that are also in this pipeline's storage projects set + storageProjects = list(set(storageProjects) & set(currentUserProjects)) + + # Limit projects to scope if specified + if project_scope: + storageProjects = list(set(storageProjects) & set(project_scope)) + + if not storageProjects: + raise Warning( + f"There are no projects that the user has access to that match the criteria of the specified project scope: {project_scope}" + ) + + # prepare a return list of project IDs and names + projects = [] + for projectId in storageProjects: + projectName = self.syn.get(projectId, downloadFile=False).name + projects.append((projectId, projectName)) + + sorted_projects_list = sorted(projects, key=lambda tup: tup[0]) + + return sorted_projects_list + + def getStorageDatasetsInProject(self, projectId: str) -> list[tuple[str, str]]: + """Gets all datasets in folder under a given storage project that the current user has access to. + + Args: + projectId: synapse ID of a storage project. + + Returns: + A list of datasets within the given storage project; the list consists of tuples (datasetId, datasetName). + None: If the projectId cannot be found on Synapse. + """ + + # select all folders and fetch their names from within the storage project; + # if folder content type is defined, only select folders that contain datasets + areDatasets = False + if "contentType" in self.storageFileviewTable.columns: + foldersTable = self.storageFileviewTable[ + (self.storageFileviewTable["contentType"] == "dataset") + & (self.storageFileviewTable["projectId"] == projectId) + ] + areDatasets = True + else: + foldersTable = self.storageFileviewTable[ + (self.storageFileviewTable["type"] == "folder") + & (self.storageFileviewTable["parentId"] == projectId) + ] + + # get an array of tuples (folderId, folderName) + # some folders are part of datasets; others contain datasets + # each dataset parent is the project; folders part of a dataset have another folder as a parent + # to get folders if and only if they contain datasets for each folder + # check if folder's parent is the project; if so that folder contains a dataset, + # unless the folder list has already been filtered to dataset folders based on contentType attribute above + + datasetList = [] + folderProperties = ["id", "name"] + for folder in list( + foldersTable[folderProperties].itertuples(index=False, name=None) + ): + datasetList.append(folder) + + sorted_dataset_list = sorted(datasetList, key=lambda tup: tup[0]) + + return sorted_dataset_list + + def getFilesInStorageDataset( + self, datasetId: str, fileNames: List = None, fullpath: bool = True + ) -> List[Tuple[str, str]]: + """Gets all files in a given dataset folder. + + Args: + datasetId: synapse ID of a storage dataset. 
+            fileNames: get a list of files with particular names; defaults to None in which case all dataset files are returned (except bookkeeping files, e.g.
+            metadata manifests); if fileNames is not None, all files matching the names in the fileNames list are returned if present.
+            fullpath: if True, return the full path as part of the filename; otherwise return just the base filename.
+
+        Returns:
+            A list of files; the list consists of tuples (fileId, fileName).
+
+        Raises:
+            ValueError: Dataset ID not found.
+        """
+        # select all files within a given storage dataset folder (top level folder in a Synapse storage project or folder marked with contentType = 'dataset')
+        walked_path = synapseutils.walk(
+            self.syn, datasetId, includeTypes=["folder", "file"]
+        )
+
+        file_list = []
+
+        # iterate over all results
+        for dirpath, dirname, filenames in walked_path:
+            # iterate over all files in a folder
+            for filename in filenames:
+                if (not "manifest" in filename[0] and not fileNames) or (
+                    fileNames and filename[0] in fileNames
+                ):
+                    # don't add the manifest to the list of files unless it is specified in the list of specified fileNames; return all found files
+                    # except the manifest if no fileNames have been specified
+                    # TODO: refactor for clarity/maintainability
+
+                    if fullpath:
+                        # append directory path to filename
+                        filename = (dirpath[0] + "/" + filename[0], filename[1])
+
+                    # add the (file name, file id) tuple, reversed so that the id comes first and the name follows
+                    file_list.append(filename[::-1])
+
+        return file_list
+
+    def _get_manifest_id(self, manifest: pd.DataFrame) -> str:
+        """If both censored and uncensored manifests are present, return the uncensored manifest; if only one manifest is present, return the manifest id of that manifest; if more than two manifests are present, return the manifest id of the first one.
+        Args:
+            manifest: a dataframe containing the names and ids of manifests in a given asset view
+
+        Return:
+            manifest_syn_id: id of a given censored or uncensored manifest
+        """
+        censored_regex = re.compile(".*censored.*")
+        censored = manifest["name"].str.contains(censored_regex)
+        if any(censored):
+            # Try to use the uncensored manifest first
+            not_censored = ~censored
+            if any(not_censored):
+                manifest_syn_id = manifest[not_censored]["id"].iloc[0]
+            # if only censored manifests are available, just use the first censored manifest
+            else:
+                manifest_syn_id = manifest["id"].iloc[0]
+
+        # otherwise, use the first (implied only) version that exists
+        else:
+            manifest_syn_id = manifest["id"].iloc[0]
+
+        return manifest_syn_id
+
+    def getDatasetManifest(
+        self,
+        datasetId: str,
+        downloadFile: bool = False,
+        newManifestName: str = "",
+    ) -> Union[str, File]:
+        """Gets the manifest associated with a given dataset.
+
+        Args:
+            datasetId: synapse ID of a storage dataset.
+            downloadFile: boolean argument indicating if the manifest file in the dataset should be downloaded or not.
+            newManifestName: new name of a manifest that gets downloaded
+
+        Returns:
+            manifest_syn_id (String): Synapse ID of the existing manifest file.
+            manifest_data (synapseclient.entity.File): Synapse entity if downloadFile is True.
+            "" (String): No pre-existing manifest in dataset.
+ """ + manifest_data = "" + + # get a list of files containing the manifest for this dataset (if any) + all_files = self.storageFileviewTable + + # construct regex based on manifest basename in the config + manifest_re = re.compile(os.path.basename(self.manifest) + ".*.[tc]sv") + + # search manifest based on given manifest basename regex above + # and return a dataframe containing name and id of manifests in a given asset view + manifest = all_files[ + (all_files["name"].str.contains(manifest_re, regex=True)) + & (all_files["parentId"] == datasetId) + ] + + manifest = manifest[["id", "name"]] + + # if there is no pre-exisiting manifest in the specified dataset + if manifest.empty: + logger.warning( + f"Could not find a manifest that fits basename {self.manifest} in asset view and dataset {datasetId}" + ) + return "" + + # if there is an exisiting manifest + else: + manifest_syn_id = self._get_manifest_id(manifest) + if downloadFile: + md = ManifestDownload(self.syn, manifest_id=manifest_syn_id) + manifest_data = ManifestDownload.download_manifest( + md, newManifestName=newManifestName, manifest_df=manifest + ) + ## TO DO: revisit how downstream code handle manifest_data. If the downstream code would break when manifest_data is an empty string, + ## then we should catch the error here without returning an empty string. + if not manifest_data: + logger.debug( + f"No manifest data returned. Please check if you have successfully downloaded manifest: {manifest_syn_id}" + ) + return manifest_data + return manifest_syn_id + + def getDataTypeFromManifest(self, manifestId: str): + """Fetch a manifest and return data types of all columns + Args: + manifestId: synapse ID of a manifest + """ + # get manifest file path + manifest_filepath = self.syn.get(manifestId).path + + # load manifest dataframe + manifest = load_df( + manifest_filepath, + preserve_raw_input=False, + data_model=False, + ) + + # convert the dataFrame to use best possible dtypes. + manifest_new = manifest.convert_dtypes() + + # get data types of columns + result = manifest_new.dtypes.to_frame("dtypes").reset_index() + + # return the result as a dictionary + result_dict = result.set_index("index")["dtypes"].astype(str).to_dict() + + return result_dict + + def _get_files_metadata_from_dataset( + self, datasetId: str, only_new_files: bool, manifest: pd.DataFrame = None + ) -> Optional[dict]: + """retrieve file ids under a particular datasetId + + Args: + datasetId (str): a dataset id + only_new_files (bool): if only adding new files that are not already exist + manifest (pd.DataFrame): metadata manifest dataframe. Default to None. 
+ + Returns: + a dictionary that contains filename and entityid under a given datasetId or None if there is nothing under a given dataset id are not available + """ + dataset_files = self.getFilesInStorageDataset(datasetId) + if dataset_files: + dataset_file_names_id_dict = self._get_file_entityIds( + dataset_files, only_new_files=only_new_files, manifest=manifest + ) + return dataset_file_names_id_dict + else: + return None + + def add_entity_id_and_filename( + self, datasetId: str, manifest: pd.DataFrame + ) -> pd.DataFrame: + """add entityid and filename column to an existing manifest assuming entityId column is not already present + + Args: + datasetId (str): dataset syn id + manifest (pd.DataFrame): existing manifest dataframe, assuming this dataframe does not have an entityId column and Filename column is present but completely empty + + Returns: + pd.DataFrame: returns a pandas dataframe + """ + # get file names and entity ids of a given dataset + dataset_files_dict = self._get_files_metadata_from_dataset( + datasetId, only_new_files=False + ) + + if dataset_files_dict: + # turn manifest dataframe back to a dictionary for operation + manifest_dict = manifest.to_dict("list") + + # update Filename column + # add entityId column to the end + manifest_dict.update(dataset_files_dict) + + # if the component column exists in existing manifest, fill up that column + if "Component" in manifest_dict.keys(): + manifest_dict["Component"] = manifest_dict["Component"] * max( + 1, len(manifest_dict["Filename"]) + ) + + # turn dictionary back to a dataframe + manifest_df_index = pd.DataFrame.from_dict(manifest_dict, orient="index") + manifest_df_updated = manifest_df_index.transpose() + + # fill na with empty string + manifest_df_updated = manifest_df_updated.fillna("") + + # drop index + manifest_df_updated = manifest_df_updated.reset_index(drop=True) + + return manifest_df_updated + else: + return manifest + + def fill_in_entity_id_filename( + self, datasetId: str, manifest: pd.DataFrame + ) -> Tuple[List, pd.DataFrame]: + """fill in Filename column and EntityId column. EntityId column and Filename column will be created if not already present. + + Args: + datasetId (str): dataset syn id + manifest (pd.DataFrame): existing manifest dataframe. + + Returns: + Tuple[List, pd.DataFrame]: a list of synIds that are under a given datasetId folder and updated manifest dataframe + """ + # get dataset file names and entity id as a list of tuple + dataset_files = self.getFilesInStorageDataset(datasetId) + + # update manifest with additional filenames, if any + # note that if there is an existing manifest and there are files in the dataset + # the columns Filename and entityId are assumed to be present in manifest schema + # TODO: use idiomatic panda syntax + if dataset_files: + new_files = self._get_file_entityIds( + dataset_files=dataset_files, only_new_files=True, manifest=manifest + ) + + # update manifest so that it contains new dataset files + new_files = pd.DataFrame(new_files) + manifest = ( + pd.concat([manifest, new_files], sort=False) + .reset_index() + .drop("index", axis=1) + ) + + manifest = manifest.fillna("") + return dataset_files, manifest + + def updateDatasetManifestFiles( + self, dmge: DataModelGraphExplorer, datasetId: str, store: bool = True + ) -> Union[Tuple[str, pd.DataFrame], None]: + """Fetch the names and entity IDs of all current files in dataset in store, if any; update dataset's manifest with new files, if any. 
+ + Args: + dmge: DataModelGraphExplorer Instance + datasetId: synapse ID of a storage dataset. + store: if set to True store updated manifest in asset store; if set to False + return a Pandas dataframe containing updated manifest but do not store to asset store + + + Returns: + Synapse ID of updated manifest and Pandas dataframe containing the updated manifest. + If there is no existing manifest return None + """ + + # get existing manifest Synapse ID + manifest_id = self.getDatasetManifest(datasetId) + + # if there is no manifest return None + if not manifest_id: + return None + + manifest_filepath = self.syn.get(manifest_id).path + manifest = load_df(manifest_filepath) + + # update manifest with additional filenames, if any + # note that if there is an existing manifest and there are files in the dataset + # the columns Filename and entityId are assumed to be present in manifest schema + # TODO: use idiomatic panda syntax + + dataset_files, manifest = self.fill_in_entity_id_filename(datasetId, manifest) + if dataset_files: + # update the manifest file, so that it contains the relevant entity IDs + if store: + manifest.to_csv(manifest_filepath, index=False) + + # store manifest and update associated metadata with manifest on Synapse + manifest_id = self.associateMetadataWithFiles( + dmge, manifest_filepath, datasetId + ) + + return manifest_id, manifest + + def _get_file_entityIds( + self, + dataset_files: List, + only_new_files: bool = False, + manifest: pd.DataFrame = None, + ): + """ + Get a dictionary of files in a dataset. Either files that are not in the current manifest or all files + + Args: + manifest: metadata manifest + dataset_file: List of all files in a dataset + only_new_files: boolean to control whether only new files are returned or all files in the dataset + Returns: + files: dictionary of file names and entityIDs, with scope as specified by `only_new_files` + """ + files = {"Filename": [], "entityId": []} + + if only_new_files: + if manifest is None: + raise UnboundLocalError( + "No manifest was passed in, a manifest is required when `only_new_files` is True." + ) + + # find new files (that are not in the current manifest) if any + for file_id, file_name in dataset_files: + if not file_id in manifest["entityId"].values: + files["Filename"].append(file_name) + files["entityId"].append(file_id) + else: + # get all files + for file_id, file_name in dataset_files: + files["Filename"].append(file_name) + files["entityId"].append(file_id) + + return files + + def getProjectManifests( + self, projectId: str + ) -> list[tuple[tuple[str, str], tuple[str, str], tuple[str, str]]]: + """Gets all metadata manifest files across all datasets in a specified project. + + Returns: A list of datasets per project; metadata manifest Synapse ID for each dataset; and the corresponding schema component of the manifest + as a list of tuples, one for each manifest: + [ + ( + (datasetId, dataName), + (manifestId, manifestName), + (componentSchemaLabel, componentSchemaLabel) TODO: # get component name from schema + ), + ... 
+ ]
+
+ TODO: Return manifest URI instead of Synapse ID for interoperability with other implementations of a store interface
+ """
+ component = None
+ entity = None
+ manifests = []
+
+ datasets = self.getStorageDatasetsInProject(projectId)
+
+ for datasetId, datasetName in datasets:
+ # encode information about the manifest in a simple list (so that R clients can unpack it)
+ # eventually can serialize differently
+
+ # Reset the component for this dataset so a value from a previous iteration does not carry over
+ component = None
+
+ # Get synID of manifest for a dataset
+ manifestId = self.getDatasetManifest(datasetId)
+
+ # If a manifest exists, get the annotations for it, else return base 'manifest' tuple
+ if manifestId:
+ annotations = self.getFileAnnotations(manifestId)
+
+ # If manifest has annotations specifying component, use that
+ if annotations and "Component" in annotations:
+ component = annotations["Component"]
+ entity = self.syn.get(manifestId, downloadFile=False)
+ manifest_name = entity["properties"]["name"]
+
+ # otherwise download the manifest and parse for information
+ elif not annotations or "Component" not in annotations:
+ logging.debug(
+ f"No component annotations have been found for manifest {manifestId}. "
+ "The manifest will be downloaded and parsed instead. "
+ "For increased speed, add component annotations to manifest."
+ )
+
+ manifest_info = self.getDatasetManifest(
+ datasetId, downloadFile=True
+ )
+ manifest_name = manifest_info["properties"].get("name", "")
+
+ if not manifest_name:
+ logger.error(f"Failed to download manifest from {datasetId}")
+
+ manifest_path = manifest_info["path"]
+
+ manifest_df = load_df(manifest_path)
+
+ # Get component from component column if it exists
+ if (
+ "Component" in manifest_df
+ and not manifest_df["Component"].empty
+ ):
+ component = list(set(manifest_df["Component"]))
+
+ # Added to address issues raised during DCA testing
+ if "" in component:
+ component.remove("")
+
+ if len(component) == 1:
+ component = component[0]
+ elif len(component) > 1:
+ logging.warning(
+ f"Manifest {manifestId} is composed of multiple components. Schematic does not support multi-component manifests at this time. "
+ "Behavior of manifests with multiple components is undefined."
+ )
+ else:
+ manifest_name = ""
+ component = None
+ if component:
+ manifest = (
+ (datasetId, datasetName),
+ (manifestId, manifest_name),
+ (component, component),
+ )
+ elif manifestId:
+ logging.debug(
+ f"Manifest {manifestId} does not have an associated Component"
+ )
+ manifest = (
+ (datasetId, datasetName),
+ (manifestId, manifest_name),
+ ("", ""),
+ )
+ else:
+ manifest = (
+ (datasetId, datasetName),
+ ("", ""),
+ ("", ""),
+ )
+
+ if manifest:
+ manifests.append(manifest)
+
+ return manifests
+
+ def upload_project_manifests_to_synapse(
+ self, dmge: DataModelGraphExplorer, projectId: str
+ ) -> List[str]:
+ """Upload all metadata manifest files across all datasets in a specified project as tables in Synapse.
+
+ Returns: A list of the dataset names for which a manifest was uploaded as a table.
+ """
+
+ manifests = []
+ manifest_loaded = []
+ datasets = self.getStorageDatasetsInProject(projectId)
+
+ for datasetId, datasetName in datasets:
+ # encode information about the manifest in a simple list (so that R clients can unpack it)
+ # eventually can serialize differently
+
+ manifest = ((datasetId, datasetName), ("", ""), ("", ""))
+
+ manifest_info = self.getDatasetManifest(datasetId, downloadFile=True)
+ if manifest_info:
+ manifest_id = manifest_info["properties"]["id"]
+ manifest_name = manifest_info["properties"]["name"]
+ manifest_path = manifest_info["path"]
+ manifest_df = load_df(manifest_path)
+ manifest_table_id = self.uploadDB(
+ dmge=dmge,
+ manifest=manifest_df,
+ datasetId=datasetId,
+ table_name=datasetName,
+ )
+ manifest_loaded.append(datasetName)
+ return manifest_loaded
+
+ def upload_annotated_project_manifests_to_synapse(
+ self, projectId: str, path_to_json_ld: str, dry_run: bool = False
+ ) -> Tuple[List, List]:
+ """
+ Purpose:
+ For all manifests in a project, upload each as a table and add annotations from the manifest csv.
+ Assumes the manifest is already present as a CSV in a dataset in the project.
+
+ """
+ # Instantiate DataModelParser
+ data_model_parser = DataModelParser(path_to_data_model=path_to_json_ld)
+ # Parse Model
+ parsed_data_model = data_model_parser.parse_model()
+
+ # Instantiate DataModelGraph
+ data_model_grapher = DataModelGraph(parsed_data_model)
+
+ # Generate graph
+ graph_data_model = data_model_grapher.generate_data_model_graph()
+
+ # Instantiate DataModelGraphExplorer
+ dmge = DataModelGraphExplorer(graph_data_model)
+
+ manifests = []
+ manifest_loaded = []
+ datasets = self.getStorageDatasetsInProject(projectId)
+ for datasetId, datasetName in datasets:
+ # encode information about the manifest in a simple list (so that R clients can unpack it)
+ # eventually can serialize differently
+
+ manifest = ((datasetId, datasetName), ("", ""), ("", ""))
+ manifests.append(manifest)
+
+ manifest_info = self.getDatasetManifest(datasetId, downloadFile=True)
+
+ if manifest_info:
+ manifest_id = manifest_info["properties"]["id"]
+ manifest_name = manifest_info["properties"]["name"]
+ manifest_path = manifest_info["path"]
+ manifest = (
+ (datasetId, datasetName),
+ (manifest_id, manifest_name),
+ ("", ""),
+ )
+ if not dry_run:
+ manifest_syn_id = self.associateMetadataWithFiles(
+ dmge, manifest_path, datasetId, manifest_record_type="table"
+ )
+ manifest_loaded.append(manifest)
+
+ return manifests, manifest_loaded
+
+ def move_entities_to_new_project(
+ self,
+ projectId: str,
+ newProjectId: str,
+ returnEntities: bool = False,
+ dry_run: bool = False,
+ ):
+ """
+ For each manifest csv in a project, look for all the entity ids that are associated with it.
+ Look up each entity in the files and move it to the new project.
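+
+ Example:
+ Illustrative sketch only; the project IDs are placeholders and `store` is assumed to be
+ an instance of this storage class. With dry_run=True, entities are not moved and the
+ intended moves are only logged:
+
+ store.move_entities_to_new_project(
+ projectId="syn11111111",
+ newProjectId="syn22222222",
+ dry_run=True,
+ )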
+ """ + + manifests = [] + manifest_loaded = [] + datasets = self.getStorageDatasetsInProject(projectId) + if datasets: + for datasetId, datasetName in datasets: + # encode information about the manifest in a simple list (so that R clients can unpack it) + # eventually can serialize differently + + manifest = ((datasetId, datasetName), ("", ""), ("", "")) + manifests.append(manifest) + + manifest_info = self.getDatasetManifest(datasetId, downloadFile=True) + if manifest_info: + manifest_id = manifest_info["properties"]["id"] + manifest_name = manifest_info["properties"]["name"] + manifest_path = manifest_info["path"] + manifest_df = load_df(manifest_path) + + manifest = ( + (datasetId, datasetName), + (manifest_id, manifest_name), + ("", ""), + ) + manifest_loaded.append(manifest) + + annotation_entities = self.storageFileviewTable[ + (self.storageFileviewTable["id"].isin(manifest_df["entityId"])) + & (self.storageFileviewTable["type"] == "folder") + ]["id"] + + if returnEntities: + for entityId in annotation_entities: + if not dry_run: + self.syn.move(entityId, datasetId) + else: + logging.info( + f"{entityId} will be moved to folder {datasetId}." + ) + else: + # generate project folder + archive_project_folder = Folder( + projectId + "_archive", parent=newProjectId + ) + archive_project_folder = self.syn.store(archive_project_folder) + + # generate dataset folder + dataset_archive_folder = Folder( + "_".join([datasetId, datasetName, "archive"]), + parent=archive_project_folder.id, + ) + dataset_archive_folder = self.syn.store(dataset_archive_folder) + + for entityId in annotation_entities: + # move entities to folder + if not dry_run: + self.syn.move(entityId, dataset_archive_folder.id) + else: + logging.info( + f"{entityId} will be moved to folder {dataset_archive_folder.id}." + ) + else: + raise LookupError( + f"No datasets were found in the specified project: {projectId}. Re-check specified master_fileview in CONFIG and retry." + ) + return manifests, manifest_loaded + + def get_synapse_table(self, synapse_id: str) -> Tuple[pd.DataFrame, CsvFileTable]: + """Download synapse table as a pd dataframe; return table schema and etags as results too + + Args: + synapse_id: synapse ID of the table to query + """ + + results = self.syn.tableQuery("SELECT * FROM {}".format(synapse_id)) + df = results.asDataFrame(rowIdAndVersionInIndex=False) + + return df, results + + def _get_tables(self, datasetId: str = None, projectId: str = None) -> List[Table]: + if projectId: + project = projectId + elif datasetId: + project = self.syn.get(self.getDatasetProject(datasetId)) + + return list(self.syn.getChildren(project, includeTypes=["table"])) + + def get_table_info(self, datasetId: str = None, projectId: str = None) -> List[str]: + """Gets the names of the tables in the schema + Can pass in a synID for a dataset or project + Returns: + list[str]: A list of table names + """ + tables = self._get_tables(datasetId=datasetId, projectId=projectId) + if tables: + return {table["name"]: table["id"] for table in tables} + else: + return {None: None} + + @missing_entity_handler + def uploadDB( + self, + dmge: DataModelGraphExplorer, + manifest: pd.DataFrame, + datasetId: str, + table_name: str, + restrict: bool = False, + table_manipulation: str = "replace", + table_column_names: str = "class_label", + ): + """ + Method to upload a database to an asset store. 
In Synapse, this will upload a metadata table.
+
+ Args:
+ dmge: DataModelGraphExplorer object
+ manifest: pd.DataFrame manifest to upload
+ datasetId: synID of the dataset for the manifest
+ table_name: name of the table to be uploaded
+ restrict: bool, whether or not the manifest contains sensitive data that will need additional access restrictions
+ table_manipulation: str, 'replace' or 'upsert', in the case where a manifest already exists, should the new metadata replace the existing (replace) or be added to it (upsert)
+ table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display
+ name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain
+ display label formatting.
+ Returns:
+ manifest_table_id: synID of the uploaded table
+ manifest: the original manifest
+ table_manifest: manifest formatted appropriately for the table
+
+ """
+
+ col_schema, table_manifest = self.formatDB(
+ dmge=dmge, manifest=manifest, table_column_names=table_column_names
+ )
+
+ manifest_table_id = self.buildDB(
+ datasetId,
+ table_name,
+ col_schema,
+ table_manifest,
+ table_manipulation,
+ dmge,
+ restrict,
+ )
+
+ return manifest_table_id, manifest, table_manifest
+
+ def formatDB(self, dmge, manifest, table_column_names):
+ """
+ Method to format a manifest appropriately for upload as a table
+
+ Args:
+ dmge: DataModelGraphExplorer object
+ manifest: pd.DataFrame manifest to upload
+ table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display
+ name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain
+ display label formatting.
+ Returns:
+ col_schema: schema for table columns: type, size, etc
+ table_manifest: formatted manifest
+
+ """
+ # Rename the manifest columns to display names to match fileview
+
+ blacklist_chars = ["(", ")", ".", " ", "-"]
+ manifest_columns = manifest.columns.tolist()
+
+ table_manifest = deepcopy(manifest)
+
+ if table_column_names == "display_name":
+ cols = table_manifest.columns
+
+ elif table_column_names == "display_label":
+ cols = [
+ str(col).translate({ord(x): "" for x in blacklist_chars})
+ for col in manifest_columns
+ ]
+
+ elif table_column_names == "class_label":
+ cols = [
+ get_class_label_from_display_name(str(col)).translate(
+ {ord(x): "" for x in blacklist_chars}
+ )
+ for col in manifest_columns
+ ]
+ else:
+ raise ValueError(
+ f"The provided table_column_name: {table_column_names} is not valid, please resubmit with an allowed value only."
+ )
+
+ cols = list(map(lambda x: x.replace("EntityId", "entityId"), cols))
+
+ # Reset column names in table manifest
+ table_manifest.columns = cols
+
+ # move entity id to end of df
+ entity_col = table_manifest.pop("entityId")
+ table_manifest.insert(len(table_manifest.columns), "entityId", entity_col)
+
+ # Get the column schema
+ col_schema = as_table_columns(table_manifest)
+
+ # Set Id column length to 64 (for some reason not being auto set.)
+ for i, col in enumerate(col_schema): + if col["name"].lower() == "id": + col_schema[i]["maximumSize"] = 64 + + return col_schema, table_manifest + + def buildDB( + self, + datasetId: str, + table_name: str, + col_schema: List, + table_manifest: pd.DataFrame, + table_manipulation: str, + dmge: DataModelGraphExplorer, + restrict: bool = False, + ): + """ + Method to construct the table appropriately: create new table, replace existing, or upsert new into existing + Calls TableOperations class to execute + + Args: + datasetId: synID of the dataset for the manifest + table_name: name of the table to be uploaded + col_schema: schema for table columns: type, size, etc from `formatDB` + table_manifest: formatted manifest that can be uploaded as a table + table_manipulation: str, 'replace' or 'upsert', in the case where a manifest already exists, should the new metadata replace the existing (replace) or be added to it (upsert) + restrict: bool, whether or not the manifest contains sensitive data that will need additional access restrictions + + Returns: + manifest_table_id: synID of the uploaded table + + """ + table_info = self.get_table_info(datasetId=datasetId) + # Put table manifest onto synapse + schema = Schema( + name=table_name, + columns=col_schema, + parent=self.getDatasetProject(datasetId), + ) + + if table_name in table_info: + existingTableId = table_info[table_name] + else: + existingTableId = None + + tableOps = TableOperations( + synStore=self, + tableToLoad=table_manifest, + tableName=table_name, + datasetId=datasetId, + existingTableId=existingTableId, + restrict=restrict, + ) + + if not table_manipulation or table_name not in table_info.keys(): + manifest_table_id = tableOps.createTable( + columnTypeDict=col_schema, + specifySchema=True, + ) + elif table_name in table_info.keys() and table_info[table_name]: + if table_manipulation.lower() == "replace": + manifest_table_id = tableOps.replaceTable( + specifySchema=True, + columnTypeDict=col_schema, + ) + elif table_manipulation.lower() == "upsert": + manifest_table_id = tableOps.upsertTable( + dmge=dmge, + ) + elif table_manipulation.lower() == "update": + manifest_table_id = tableOps.updateTable() + + if table_manipulation and table_manipulation.lower() == "upsert": + existing_tables = self.get_table_info(datasetId=datasetId) + tableId = existing_tables[table_name] + annos = self.syn.get_annotations(tableId) + annos["primary_key"] = table_manifest["Component"][0] + "_id" + annos = self.syn.set_annotations(annos) + + return manifest_table_id + + def upload_manifest_file( + self, + manifest, + metadataManifestPath, + datasetId, + restrict_manifest, + component_name="", + ): + # Update manifest to have the new entityId column + manifest.to_csv(metadataManifestPath, index=False) + + # store manifest to Synapse as a CSV + # update file name + file_name_full = metadataManifestPath.split("/")[-1] + file_extension = file_name_full.split(".")[-1] + + # Differentiate "censored" and "uncensored" manifest + if "censored" in file_name_full: + file_name_new = ( + os.path.basename(CONFIG.synapse_manifest_basename) + + "_" + + component_name + + "_censored" + + "." + + file_extension + ) + else: + file_name_new = ( + os.path.basename(CONFIG.synapse_manifest_basename) + + "_" + + component_name + + "." 
+ + file_extension + ) + + manifestSynapseFile = File( + metadataManifestPath, + description="Manifest for dataset " + datasetId, + parent=datasetId, + name=file_name_new, + ) + manifest_synapse_file_id = self.syn.store( + manifestSynapseFile, isRestricted=restrict_manifest + ).id + + synapseutils.copy_functions.changeFileMetaData( + syn=self.syn, + entity=manifest_synapse_file_id, + downloadAs=file_name_new, + forceVersion=False, + ) + + return manifest_synapse_file_id + + @missing_entity_handler + def format_row_annotations( + self, dmge, row, entityId: str, hideBlanks: bool, annotation_keys: str + ): + # prepare metadata for Synapse storage (resolve display name into a name that Synapse annotations support (e.g no spaces, parenthesis) + # note: the removal of special characters, will apply only to annotation keys; we are not altering the manifest + # this could create a divergence between manifest column and annotations. this should be ok for most use cases. + # columns with special characters are outside of the schema + metadataSyn = {} + blacklist_chars = ["(", ")", ".", " ", "-"] + + for k, v in row.to_dict().items(): + if annotation_keys == "display_label": + keySyn = str(k).translate({ord(x): "" for x in blacklist_chars}) + elif annotation_keys == "class_label": + keySyn = get_class_label_from_display_name(str(k)).translate( + {ord(x): "" for x in blacklist_chars} + ) + + # Skip `Filename` and `ETag` columns when setting annotations + if keySyn in ["Filename", "ETag", "eTag"]: + continue + + # truncate annotation values to 500 characters if the + # size of values is greater than equal to 500 characters + # add an explicit [truncatedByDataCuratorApp] message at the end + # of every truncated message to indicate that the cell value + # has been truncated + if isinstance(v, str) and len(v) >= 500: + v = v[0:472] + "[truncatedByDataCuratorApp]" + + metadataSyn[keySyn] = v + # set annotation(s) for the various objects/items in a dataset on Synapse + annos = self.syn.get_annotations(entityId) + csv_list_regex = comma_separated_list_regex() + for anno_k, anno_v in metadataSyn.items(): + # Remove keys with nan or empty string values from dict of annotations to be uploaded + # if present on current data annotation + if hideBlanks and ( + anno_v == "" or (isinstance(anno_v, float) and np.isnan(anno_v)) + ): + annos.pop(anno_k) if anno_k in annos.keys() else annos + # Otherwise save annotation as approrpriate + else: + if isinstance(anno_v, float) and np.isnan(anno_v): + annos[anno_k] = "" + elif ( + isinstance(anno_v, str) + and re.fullmatch(csv_list_regex, anno_v) + and rule_in_rule_list( + "list", dmge.get_node_validation_rules(anno_k) + ) + ): + annos[anno_k] = anno_v.split(",") + else: + annos[anno_k] = anno_v + + return annos + + @missing_entity_handler + def format_manifest_annotations(self, manifest, manifest_synapse_id): + """ + Set annotations for the manifest (as a whole) so they can be applied to the manifest table or csv. + For now just getting the Component. + """ + + entity = self.syn.get(manifest_synapse_id, downloadFile=False) + is_file = entity.concreteType.endswith(".FileEntity") + is_table = entity.concreteType.endswith(".TableEntity") + + if is_file: + # Get file metadata + metadata = self.getFileAnnotations(manifest_synapse_id) + + # If there is a defined component add it to the metadata. 
+ if "Component" in manifest.columns:
+ # Gather component information
+ component = manifest["Component"].unique()
+
+ # Double check that only a single component is listed, else raise an error.
+ if len(component) > 1:
+ raise ValueError(
+ "Manifest has more than one component. Please check manifest and resubmit."
+ )
+
+ # Add component to metadata
+ metadata["Component"] = component[0]
+
+ elif is_table:
+ # Get table metadata
+ metadata = self.getTableAnnotations(manifest_synapse_id)
+
+ # Get annotations
+ annos = self.syn.get_annotations(manifest_synapse_id)
+
+ # Add metadata to the annotations
+ for annos_k, annos_v in metadata.items():
+ annos[annos_k] = annos_v
+
+ return annos
+
+ '''
+ def annotate_upload_manifest_table(self, manifest, datasetId, metadataManifestPath,
+ useSchemaLabel: bool = True, hideBlanks: bool = False, restrict_manifest = False):
+ """
+ Purpose:
+ Works very similarly to associateMetadataWithFiles except takes in the manifest
+ rather than the manifest path
+
+ """
+
+ # Add uuid for table updates and fill.
+ if not "Uuid" in manifest.columns:
+ manifest["Uuid"] = ''
+
+ for idx,row in manifest.iterrows():
+ if not row["Uuid"]:
+ gen_uuid = uuid.uuid4()
+ row["Uuid"] = gen_uuid
+ manifest.loc[idx, 'Uuid'] = gen_uuid
+
+ # add entityId as a column if not already there or
+ # fill any blanks with an empty string.
+ if not "entityId" in manifest.columns:
+ manifest["entityId"] = ""
+ else:
+ manifest["entityId"].fillna("", inplace=True)
+
+ # get a DataModelGraphExplorer object to ensure schema attribute names used in manifest are translated to schema labels for synapse annotations
+ dmge = DataModelGraphExplorer()
+
+ # Create table name here.
+ if 'Component' in manifest.columns:
+ table_name = manifest['Component'][0].lower() + '_synapse_storage_manifest_table'
+ else:
+ table_name = 'synapse_storage_manifest_table'
+
+ # Upload manifest as a table and get the SynID and manifest
+ manifest_synapse_table_id, manifest, table_manifest = self.upload_format_manifest_table(
+ dmge, manifest, datasetId, table_name, restrict = restrict_manifest, useSchemaLabel=useSchemaLabel,)
+
+ # Iterate over manifest rows, create Synapse entities and store corresponding entity IDs in manifest if needed
+ # also set metadata for each synapse entity as Synapse annotations
+ for idx, row in manifest.iterrows():
+ if not row["entityId"]:
+ # If not using entityIds, fill with manifest_table_id so
+ row["entityId"] = manifest_synapse_table_id
+ entityId = ''
+ else:
+ # get the entity id corresponding to this row
+ entityId = row["entityId"]
+
+ # Load manifest to synapse as a CSV File
+ manifest_synapse_file_id = self.upload_manifest_file(manifest, metadataManifestPath, datasetId, restrict_manifest)
+
+ # Get annotations for the file manifest.
+ manifest_annotations = self.format_manifest_annotations(manifest, manifest_synapse_file_id)
+
+ self.syn.set_annotations(manifest_annotations)
+
+ logger.info("Associated manifest file with dataset on Synapse.")
+
+ # Update manifest Synapse table with new entity id column.
+ self.make_synapse_table( + table_to_load = table_manifest, + dataset_id = datasetId, + existingTableId = manifest_synapse_table_id, + table_name = table_name, + update_col = 'Uuid', + specify_schema = False, + ) + + # Get annotations for the table manifest + manifest_annotations = self.format_manifest_annotations(manifest, manifest_synapse_table_id) + self.syn.set_annotations(manifest_annotations) + return manifest_synapse_table_id + ''' + + def _read_manifest(self, metadataManifestPath: str) -> pd.DataFrame: + """Helper function to read in provided manifest as a pandas DataFrame for subsequent downstream processing. + Args: + metadataManifestPath (str): path where manifest is stored + Returns: + manifest(pd.DataFrame): Manifest loaded as a pandas dataframe + Raises: + FileNotFoundError: Manifest file does not exist at provided path. + """ + # read new manifest csv + try: + load_args = { + "dtype": "string", + } + manifest = load_df( + metadataManifestPath, + preserve_raw_input=False, + allow_na_values=False, + **load_args, + ) + except FileNotFoundError as err: + raise FileNotFoundError( + f"No manifest file was found at this path: {metadataManifestPath}" + ) from err + return manifest + + def _add_id_columns_to_manifest( + self, manifest: pd.DataFrame, dmge: DataModelGraphExplorer + ): + """Helper function to add id and entityId columns to the manifest if they do not already exist, Fill id values per row. + Args: + Manifest loaded as a pd.Dataframe + Returns (pd.DataFrame): + Manifest df with new Id and EntityId columns (and UUID values) if they were not already present. + """ + + # Add Id for table updates and fill. + if not col_in_dataframe("Id", manifest): + # See if schema has `Uuid` column specified + try: + uuid_col_in_schema = dmge.is_class_in_schema( + "Uuid" + ) or dmge.is_class_in_schema("uuid") + except KeyError: + uuid_col_in_schema = False + + # Rename `Uuid` column if it wasn't specified in the schema + if col_in_dataframe("Uuid", manifest) and not uuid_col_in_schema: + manifest.rename(columns={"Uuid": "Id"}, inplace=True) + # If no `Uuid` column exists or it is specified in the schema, create a new `Id` column + else: + manifest["Id"] = "" + + # Retrieve the ID column name (id, Id and ID) are treated the same. + id_col_name = [col for col in manifest.columns if col.lower() == "id"][0] + + # Check if values have been added to the Id coulumn, if not add a UUID so value in the row is not blank. + for idx, row in manifest.iterrows(): + if not row[id_col_name]: + gen_uuid = str(uuid.uuid4()) + row[id_col_name] = gen_uuid + manifest.loc[idx, id_col_name] = gen_uuid + + # add entityId as a column if not already there or + # fill any blanks with an empty string. + if not col_in_dataframe("entityId", manifest): + manifest["entityId"] = "" + else: + manifest["entityId"].fillna("", inplace=True) + + return manifest + + def _generate_table_name(self, manifest): + """Helper function to generate a table name for upload to synapse. + Args: + Manifest loaded as a pd.Dataframe + Returns: + table_name (str): Name of the table to load + component_name (str): Name of the manifest component (if applicable) + """ + # Create table name here. 
+ if "Component" in manifest.columns: + component_name = manifest["Component"][0].lower() + table_name = component_name + "_synapse_storage_manifest_table" + else: + component_name = "" + table_name = "synapse_storage_manifest_table" + return table_name, component_name + + def _add_annotations( + self, + dmge, + row, + entityId: str, + hideBlanks: bool, + annotation_keys: str, + ): + """Helper function to format and add annotations to entities in Synapse. + Args: + dmge: DataModelGraphExplorer object, + row: current row of manifest being processed + entityId (str): synapseId of entity to add annotations to + hideBlanks: Boolean flag that does not upload annotation keys with blank values when true. Uploads Annotation keys with empty string values when false. + annotation_keys: (str) display_label/class_label(default), Determines labeling syle for annotation keys. class_label will format the display + name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain + display label formatting while ensuring the label is formatted properly for Synapse annotations. + Returns: + Annotations are added to entities in Synapse, no return. + """ + # Format annotations for Synapse + annos = self.format_row_annotations( + dmge, row, entityId, hideBlanks, annotation_keys + ) + + if annos: + # Store annotations for an entity folder + self.syn.set_annotations(annos) + return + + def _create_entity_id(self, idx, row, manifest, datasetId): + """Helper function to generate an entityId and add it to the appropriate row in the manifest. + Args: + row: current row of manifest being processed + manifest (pd.DataFrame): loaded df containing user supplied data. + datasetId (str): synapse ID of folder containing the dataset + + Returns: + manifest (pd.DataFrame): manifest with entityId added to the appropriate row + entityId (str): Generated Entity Id. + + """ + rowEntity = Folder(str(uuid.uuid4()), parent=datasetId) + rowEntity = self.syn.store(rowEntity) + entityId = rowEntity["id"] + row["entityId"] = entityId + manifest.loc[idx, "entityId"] = entityId + return manifest, entityId + + def add_annotations_to_entities_files( + self, + dmge, + manifest, + manifest_record_type: str, + datasetId: str, + hideBlanks: bool, + manifest_synapse_table_id="", + annotation_keys: str = "class_label", + ): + """Depending on upload type add Ids to entityId row. Add anotations to connected files. + Args: + dmge: DataModelGraphExplorer Object + manifest (pd.DataFrame): loaded df containing user supplied data. + manifest_record_type: valid values are 'entity', 'table' or 'both'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest or do both. + datasetId (str): synapse ID of folder containing the dataset + hideBlanks (bool): Default is false -Boolean flag that does not upload annotation keys with blank values when true. Uploads Annotation keys with empty string values when false. + manifest_synapse_table_id (str): Default is an empty string ''. + annotation_keys: (str) display_label/class_label(default), Determines labeling syle for annotation keys. class_label will format the display + name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain + display label formatting while ensuring the label is formatted properly for Synapse annotations. 
+ Returns: + manifest (pd.DataFrame): modified to add entitiyId as appropriate + + """ + + # Expected behavior is to annotate files if `Filename` is present and if file_annotations_upload is set to True regardless of `-mrt` setting + if "filename" in [col.lower() for col in manifest.columns]: + # get current list of files and store as dataframe + dataset_files = self.getFilesInStorageDataset(datasetId) + files_and_entityIds = self._get_file_entityIds( + dataset_files=dataset_files, only_new_files=False + ) + file_df = pd.DataFrame(files_and_entityIds) + + # Merge dataframes to add entityIds + manifest = manifest.merge( + file_df, how="left", on="Filename", suffixes=["_x", None] + ).drop("entityId_x", axis=1) + + # Fill `entityId` for each row if missing and annotate entity as appropriate + for idx, row in manifest.iterrows(): + if not row["entityId"] and ( + manifest_record_type == "file_and_entities" + or manifest_record_type == "table_file_and_entities" + ): + manifest, entityId = self._create_entity_id( + idx, row, manifest, datasetId + ) + elif not row["entityId"] and manifest_record_type == "table_and_file": + # If not using entityIds, fill with manifest_table_id so + row["entityId"] = manifest_synapse_table_id + manifest.loc[idx, "entityId"] = manifest_synapse_table_id + entityId = "" + else: + # get the file id of the file to annotate, collected in above step. + entityId = row["entityId"] + + # Adding annotations to connected files. + if entityId: + self._add_annotations(dmge, row, entityId, hideBlanks, annotation_keys) + logger.info(f"Added annotations to entity: {entityId}") + return manifest + + def upload_manifest_as_table( + self, + dmge: DataModelGraphExplorer, + manifest: pd.DataFrame, + metadataManifestPath: str, + datasetId: str, + table_name: str, + component_name: str, + restrict: bool, + manifest_record_type: str, + hideBlanks: bool, + table_manipulation: str, + table_column_names: str, + annotation_keys: str, + file_annotations_upload: bool = True, + ): + """Upload manifest to Synapse as a table and csv. + Args: + dmge: DataModelGraphExplorer object + manifest (pd.DataFrame): loaded df containing user supplied data. + metadataManifestPath: path to csv containing a validated metadata manifest. + datasetId (str): synapse ID of folder containing the dataset + table_name (str): Generated to name the table being uploaded. + component_name (str): Name of the component manifest that is currently being uploaded. + restrict (bool): Flag for censored data. + manifest_record_type (str): valid values are 'entity', 'table' or 'both'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest or do both. + hideBlanks (bool): Default is False -Boolean flag that does not upload annotation keys with blank values when true. Uploads Annotation keys with empty string values when false. + table_malnipulation (str): Specify the way the manifest tables should be store as on Synapse when one with the same name already exists. Options are 'replace' and 'upsert'. + table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display + name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain + display label formatting. 
+ annotation_keys: (str) display_label/class_label (default), Sets labeling syle for annotation keys. class_label will format the display + name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain + display label formatting while ensuring the label is formatted properly for Synapse annotations. + file_annotations_upload (bool): Default to True. If false, do not add annotations to files. + Return: + manifest_synapse_file_id: SynID of manifest csv uploaded to synapse. + """ + # Upload manifest as a table, get the ID and updated manifest. + manifest_synapse_table_id, manifest, table_manifest = self.uploadDB( + dmge=dmge, + manifest=manifest, + datasetId=datasetId, + table_name=table_name, + restrict=restrict, + table_manipulation=table_manipulation, + table_column_names=table_column_names, + ) + + if file_annotations_upload: + manifest = self.add_annotations_to_entities_files( + dmge, + manifest, + manifest_record_type, + datasetId, + hideBlanks, + manifest_synapse_table_id, + annotation_keys, + ) + # Load manifest to synapse as a CSV File + manifest_synapse_file_id = self.upload_manifest_file( + manifest, + metadataManifestPath, + datasetId, + restrict, + component_name=component_name, + ) + + # Set annotations for the file manifest. + manifest_annotations = self.format_manifest_annotations( + manifest, manifest_synapse_file_id + ) + self.syn.set_annotations(manifest_annotations) + logger.info("Associated manifest file with dataset on Synapse.") + + # Update manifest Synapse table with new entity id column. + manifest_synapse_table_id, manifest, table_manifest = self.uploadDB( + dmge=dmge, + manifest=manifest, + datasetId=datasetId, + table_name=table_name, + restrict=restrict, + table_manipulation="update", + table_column_names=table_column_names, + ) + + # Set annotations for the table manifest + manifest_annotations = self.format_manifest_annotations( + manifest, manifest_synapse_table_id + ) + self.syn.set_annotations(manifest_annotations) + return manifest_synapse_file_id + + def upload_manifest_as_csv( + self, + dmge, + manifest, + metadataManifestPath, + datasetId, + restrict, + manifest_record_type, + hideBlanks, + component_name, + annotation_keys: str, + file_annotations_upload: bool = True, + ): + """Upload manifest to Synapse as a csv only. + Args: + dmge: DataModelGraphExplorer object + manifest (pd.DataFrame): loaded df containing user supplied data. + metadataManifestPath: path to csv containing a validated metadata manifest. + datasetId (str): synapse ID of folder containing the dataset + restrict (bool): Flag for censored data. + manifest_record_type: valid values are 'entity', 'table' or 'both'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest or do both. + hideBlanks (bool): Default is False -Boolean flag that does not upload annotation keys with blank values when true. Uploads Annotation keys with empty string values when false. + annotation_keys: (str) display_label/class_label (default), Sets labeling syle for annotation keys. class_label will format the display + name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain + display label formatting while ensuring the label is formatted properly for Synapse annotations. + file_annotations_upload (bool): Default to True. If false, do not add annotations to files. 
+ Return: + manifest_synapse_file_id (str): SynID of manifest csv uploaded to synapse. + """ + if file_annotations_upload: + manifest = self.add_annotations_to_entities_files( + dmge, + manifest, + manifest_record_type, + datasetId, + hideBlanks, + annotation_keys=annotation_keys, + ) + + # Load manifest to synapse as a CSV File + manifest_synapse_file_id = self.upload_manifest_file( + manifest, + metadataManifestPath, + datasetId, + restrict, + component_name=component_name, + ) + + # Set annotations for the file manifest. + manifest_annotations = self.format_manifest_annotations( + manifest, manifest_synapse_file_id + ) + self.syn.set_annotations(manifest_annotations) + + logger.info("Associated manifest file with dataset on Synapse.") + + return manifest_synapse_file_id + + def upload_manifest_combo( + self, + dmge, + manifest, + metadataManifestPath, + datasetId, + table_name, + component_name, + restrict, + manifest_record_type, + hideBlanks, + table_manipulation, + table_column_names: str, + annotation_keys: str, + file_annotations_upload: bool = True, + ): + """Upload manifest to Synapse as a table and CSV with entities. + Args: + dmge: DataModelGraphExplorer object + manifest (pd.DataFrame): loaded df containing user supplied data. + metadataManifestPath: path to csv containing a validated metadata manifest. + datasetId (str): synapse ID of folder containing the dataset + table_name (str): Generated to name the table being uploaded. + component_name (str): Name of the component manifest that is currently being uploaded. + restrict (bool): Flag for censored data. + manifest_record_type: valid values are 'entity', 'table' or 'both'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest or do both. + hideBlanks (bool): Default is False -Boolean flag that does not upload annotation keys with blank values when true. Uploads Annotation keys with empty string values when false. + table_malnipulation (str): Specify the way the manifest tables should be store as on Synapse when one with the same name already exists. Options are 'replace' and 'upsert'. + table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display + name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain + display label formatting. + annotation_keys: (str) display_label/class_label (default), Sets labeling syle for annotation keys. class_label will format the display + name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain + display label formatting while ensuring the label is formatted properly for Synapse annotations. + file_annotations_upload (bool): Default to True. If false, do not add annotations to files. + Return: + manifest_synapse_file_id (str): SynID of manifest csv uploaded to synapse. 
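+
+ Note:
+ This is the path taken by associateMetadataWithFiles for its default
+ manifest_record_type ("table_file_and_entities"). A rough sketch of the call order
+ (annotation-setting steps omitted):
+
+ uploadDB(...) # store manifest as a Synapse table
+ add_annotations_to_entities_files(...) # annotate files/row entities (if enabled)
+ upload_manifest_file(...) # store manifest as a csv
+ uploadDB(..., table_manipulation="update") # refresh table with new entityId column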
+ """ + manifest_synapse_table_id, manifest, table_manifest = self.uploadDB( + dmge=dmge, + manifest=manifest, + datasetId=datasetId, + table_name=table_name, + restrict=restrict, + table_manipulation=table_manipulation, + table_column_names=table_column_names, + ) + + if file_annotations_upload: + manifest = self.add_annotations_to_entities_files( + dmge, + manifest, + manifest_record_type, + datasetId, + hideBlanks, + manifest_synapse_table_id, + annotation_keys=annotation_keys, + ) + + # Load manifest to synapse as a CSV File + manifest_synapse_file_id = self.upload_manifest_file( + manifest, metadataManifestPath, datasetId, restrict, component_name + ) + + # Set annotations for the file manifest. + manifest_annotations = self.format_manifest_annotations( + manifest, manifest_synapse_file_id + ) + self.syn.set_annotations(manifest_annotations) + logger.info("Associated manifest file with dataset on Synapse.") + + # Update manifest Synapse table with new entity id column. + manifest_synapse_table_id, manifest, table_manifest = self.uploadDB( + dmge=dmge, + manifest=manifest, + datasetId=datasetId, + table_name=table_name, + restrict=restrict, + table_manipulation="update", + table_column_names=table_column_names, + ) + + # Set annotations for the table manifest + manifest_annotations = self.format_manifest_annotations( + manifest, manifest_synapse_table_id + ) + self.syn.set_annotations(manifest_annotations) + return manifest_synapse_file_id + + def associateMetadataWithFiles( + self, + dmge: DataModelGraphExplorer, + metadataManifestPath: str, + datasetId: str, + manifest_record_type: str = "table_file_and_entities", + hideBlanks: bool = False, + restrict_manifest=False, + table_manipulation: str = "replace", + table_column_names: str = "class_label", + annotation_keys: str = "class_label", + file_annotations_upload: bool = True, + ) -> str: + """Associate metadata with files in a storage dataset already on Synapse. + Upload metadataManifest in the storage dataset folder on Synapse as well. Return synapseId of the uploaded manifest file. + + If this is a new manifest there could be no Synapse entities associated with the rows of this manifest + this may be due to data type (e.g. clinical data) being tabular + and not requiring files; to utilize uniform interfaces downstream + (i.e. fileviews), a Synapse entity (a folder) is created for each row + and an entity column is added to the manifest containing the resulting + entity IDs; a table is also created at present as an additional interface + for downstream query and interaction with the data. + + Args: + dmge: DataModelGraphExplorer Object + metadataManifestPath: path to csv containing a validated metadata manifest. + The manifest should include a column entityId containing synapse IDs of files/entities to be associated with metadata, if that is applicable to the dataset type. + Some datasets, e.g. clinical data, do not contain file id's, but data is stored in a table: one row per item. + In this case, the system creates a file on Synapse for each row in the table (e.g. patient, biospecimen) and associates the columnset data as metadata/annotations to his file. + datasetId: synapse ID of folder containing the dataset + manifest_record_type: Default value is 'table_file_and_entities'. valid values are 'file_only', 'file_and_entities', 'table_and_file' or 'table_file_and_entities'. 
'file_and_entities' will store the manifest as a csv and create Synapse files for each row in the manifest.'table_and_file' will store the manifest as a table and a csv on Synapse. 'file_only' will store the manifest as a csv only on Synapse. 'table_file_and_entities' will perform the options file_with_entites and table in combination. + hideBlanks: Default is false. Boolean flag that does not upload annotation keys with blank values when true. Uploads Annotation keys with empty string values when false. + restrict_manifest (bool): Default is false. Flag for censored data. + table_malnipulation (str): Default is 'replace'. Specify the way the manifest tables should be store as on Synapse when one with the same name already exists. Options are 'replace' and 'upsert'. + table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display + name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain + display label formatting. + annotation_keys: (str) display_label/class_label (default), Sets labeling syle for annotation keys. class_label will format the display + name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain + display label formatting while ensuring the label is formatted properly for Synapse annotations. + Returns: + manifest_synapse_file_id: SynID of manifest csv uploaded to synapse. + """ + # Read new manifest CSV: + manifest = self._read_manifest(metadataManifestPath) + manifest = self._add_id_columns_to_manifest(manifest, dmge) + + table_name, component_name = self._generate_table_name(manifest) + + # Upload manifest to synapse based on user input (manifest_record_type) + if manifest_record_type == "file_only": + manifest_synapse_file_id = self.upload_manifest_as_csv( + dmge, + manifest, + metadataManifestPath, + datasetId=datasetId, + restrict=restrict_manifest, + hideBlanks=hideBlanks, + manifest_record_type=manifest_record_type, + component_name=component_name, + annotation_keys=annotation_keys, + file_annotations_upload=file_annotations_upload, + ) + elif manifest_record_type == "table_and_file": + manifest_synapse_file_id = self.upload_manifest_as_table( + dmge, + manifest, + metadataManifestPath, + datasetId=datasetId, + table_name=table_name, + component_name=component_name, + restrict=restrict_manifest, + hideBlanks=hideBlanks, + manifest_record_type=manifest_record_type, + table_manipulation=table_manipulation, + table_column_names=table_column_names, + annotation_keys=annotation_keys, + file_annotations_upload=file_annotations_upload, + ) + elif manifest_record_type == "file_and_entities": + manifest_synapse_file_id = self.upload_manifest_as_csv( + dmge, + manifest, + metadataManifestPath, + datasetId=datasetId, + restrict=restrict_manifest, + hideBlanks=hideBlanks, + manifest_record_type=manifest_record_type, + component_name=component_name, + annotation_keys=annotation_keys, + file_annotations_upload=file_annotations_upload, + ) + elif manifest_record_type == "table_file_and_entities": + manifest_synapse_file_id = self.upload_manifest_combo( + dmge, + manifest, + metadataManifestPath, + datasetId=datasetId, + table_name=table_name, + component_name=component_name, + restrict=restrict_manifest, + hideBlanks=hideBlanks, + manifest_record_type=manifest_record_type, + 
table_manipulation=table_manipulation, + table_column_names=table_column_names, + annotation_keys=annotation_keys, + file_annotations_upload=file_annotations_upload, + ) + else: + raise ValueError("Please enter a valid manifest_record_type.") + return manifest_synapse_file_id + + def getTableAnnotations(self, table_id: str): + """Generate dictionary of annotations for the given Synapse file. + Synapse returns all custom annotations as lists since they + can contain multiple values. In all cases, the values will + be converted into strings and concatenated with ", ". + + Args: + fileId (str): Synapse ID for dataset file. + + Returns: + dict: Annotations as comma-separated strings. + """ + try: + entity = self.syn.get(table_id, downloadFile=False) + is_table = entity.concreteType.endswith(".TableEntity") + annotations_raw = entity.annotations + except SynapseHTTPError: + # If an error occurs with retrieving entity, skip it + # This could be caused by a temporary file view that + # was deleted since its ID was retrieved + is_file, is_table = False, False + + # Skip anything that isn't a file or folder + if not (is_table): + return None + + annotations = self.getEntityAnnotations(table_id, entity, annotations_raw) + + return annotations + + def getFileAnnotations(self, fileId: str) -> Dict[str, str]: + """Generate dictionary of annotations for the given Synapse file. + Synapse returns all custom annotations as lists since they + can contain multiple values. In all cases, the values will + be converted into strings and concatenated with ", ". + + Args: + fileId (str): Synapse ID for dataset file. + + Returns: + dict: Annotations as comma-separated strings. + """ + + # Get entity metadata, including annotations + try: + entity = self.syn.get(fileId, downloadFile=False) + is_file = entity.concreteType.endswith(".FileEntity") + is_folder = entity.concreteType.endswith(".Folder") + annotations_raw = entity.annotations + except SynapseHTTPError: + # If an error occurs with retrieving entity, skip it + # This could be caused by a temporary file view that + # was deleted since its ID was retrieved + is_file, is_folder = False, False + + # Skip anything that isn't a file or folder + if not (is_file or is_folder): + return None + + annotations = self.getEntityAnnotations(fileId, entity, annotations_raw) + + return annotations + + def getEntityAnnotations(self, fileId, entity, annotations_raw): + # Extract annotations from their lists and stringify. For example: + # {'YearofBirth': [1980], 'author': ['bruno', 'milen', 'sujay']} + annotations = dict() + for key, vals in annotations_raw.items(): + if isinstance(vals, list) and len(vals) == 1: + annotations[key] = str(vals[0]) + else: + annotations[key] = ", ".join(str(v) for v in vals) + + # Add the file entity ID and eTag, which weren't lists + assert fileId == entity.id, ( + "For some reason, the Synapse ID in the response doesn't match" + "the Synapse ID sent in the request (via synapseclient)." + ) + annotations["entityId"] = fileId + annotations["eTag"] = entity.etag + + return annotations + + def getDatasetAnnotations( + self, datasetId: str, fill_na: bool = True, force_batch: bool = False + ) -> pd.DataFrame: + """Generate table for annotations across all files in given dataset. + + Args: + datasetId (str): Synapse ID for dataset folder. + fill_na (bool): Whether to replace missing values with + blank strings. 
+ force_batch (bool): Whether to force the function to use + the batch mode, which uses a file view to retrieve + annotations for a given dataset. Default to False + unless there are more than 50 files in the dataset. + + Returns: + pd.DataFrame: Table of annotations. + """ + # Get all files in given dataset + dataset_files = self.getFilesInStorageDataset(datasetId) + + # if there are no dataset files, there are no annotations + # return None + if not dataset_files: + return pd.DataFrame() + + dataset_files_map = dict(dataset_files) + dataset_file_ids, _ = list(zip(*dataset_files)) + + # Get annotations for each file from Step 1 + # Batch mode + try_batch = len(dataset_files) >= 50 or force_batch + if try_batch: + try: + logger.info("Trying batch mode for retrieving Synapse annotations") + table = self.getDatasetAnnotationsBatch(datasetId, dataset_file_ids) + except (SynapseAuthenticationError, SynapseHTTPError): + logger.info( + f"Unable to create a temporary file view bound to {datasetId}. " + "Defaulting to slower iterative retrieval of annotations." + ) + # Default to the slower non-batch method + logger.info("Batch mode failed (probably due to permission error)") + try_batch = False + + # Non-batch mode + if not try_batch: + logger.info("Using slower (non-batch) sequential mode") + records = [self.getFileAnnotations(i) for i in dataset_file_ids] + # Remove any annotations for non-file/folders (stored as None) + records = filter(None, records) + table = pd.DataFrame.from_records(records) + + # Add filenames for the files that "survived" annotation retrieval + filenames = [dataset_files_map[i] for i in table["entityId"]] + + if "Filename" not in table.columns: + table.insert(0, "Filename", filenames) + + # Ensure that entityId and eTag are at the end + entity_ids = table.pop("entityId") + etags = table.pop("eTag") + table.insert(len(table.columns), "entityId", entity_ids) + table.insert(len(table.columns), "eTag", etags) + + # Missing values are filled in with empty strings for Google Sheets + if fill_na: + table.fillna("", inplace=True) + + # Force all values as strings + return table.astype(str) + + def raise_final_error(retry_state): + return retry_state.outcome.result() + + def checkIfinAssetView(self, syn_id) -> str: + # get data in administrative fileview for this pipeline + assetViewTable = self.getStorageFileviewTable() + all_files = list(assetViewTable["id"]) + if syn_id in all_files: + return True + else: + return False + + @retry( + stop=stop_after_attempt(5), + wait=wait_chain( + *[wait_fixed(10) for i in range(2)] + + [wait_fixed(15) for i in range(2)] + + [wait_fixed(20)] + ), + retry=retry_if_exception_type(LookupError), + retry_error_callback=raise_final_error, + ) + def getDatasetProject(self, datasetId: str) -> str: + """Get parent project for a given dataset ID. + + Args: + datasetId (str): Synapse entity ID (folder or project). + + Raises: + ValueError: Raised if Synapse ID cannot be retrieved + by the user or if it doesn't appear in the file view. + + Returns: + str: The Synapse ID for the parent project. 
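For clarity about the retry policy applied to getDatasetProject above, here is a minimal standalone sketch of the same tenacity configuration (the helper name is hypothetical). With stop_after_attempt(5) there are at most four sleeps, so the effective back-off is 10s, 10s, 15s, 15s; the final 20s step would only be reached if more attempts were allowed, and retries are triggered only by LookupError.

    from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_chain, wait_fixed

    @retry(
        stop=stop_after_attempt(5),
        wait=wait_chain(
            *[wait_fixed(10) for _ in range(2)]
            + [wait_fixed(15) for _ in range(2)]
            + [wait_fixed(20)]
        ),
        retry=retry_if_exception_type(LookupError),
    )
    def resolve_parent_project(dataset_id: str) -> str:
        # Placeholder body: raise LookupError to trigger a retry,
        # return the parent project ID on success.
        ...

The original additionally passes retry_error_callback=raise_final_error, which re-raises the last LookupError itself instead of tenacity's RetryError once all attempts are exhausted.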
+ """ + + # Subset main file view + dataset_index = self.storageFileviewTable["id"] == datasetId + dataset_row = self.storageFileviewTable[dataset_index] + + # re-query if no datasets found + if dataset_row.empty: + sleep(5) + self._query_fileview() + # Subset main file view + dataset_index = self.storageFileviewTable["id"] == datasetId + dataset_row = self.storageFileviewTable[dataset_index] + + # Return `projectId` for given row if only one found + if len(dataset_row) == 1: + dataset_project = dataset_row["projectId"].values[0] + return dataset_project + + # Otherwise, check if already project itself + try: + syn_object = self.syn.get(datasetId) + if syn_object.properties["concreteType"].endswith("Project"): + return datasetId + except SynapseHTTPError: + raise PermissionError( + f"The given dataset ({datasetId}) isn't accessible with this " + "user. This might be caused by a typo in the dataset Synapse ID." + ) + + # If not, then assume dataset not in file view + raise LookupError( + f"The given dataset ({datasetId}) doesn't appear in the " + f"configured file view ({self.storageFileview}). This might " + "mean that the file view's scope needs to be updated." + ) + + def getDatasetAnnotationsBatch( + self, datasetId: str, dataset_file_ids: Sequence[str] = None + ) -> pd.DataFrame: + """Generate table for annotations across all files in given dataset. + This function uses a temporary file view to generate a table + instead of iteratively querying for individual entity annotations. + This function is expected to run much faster than + `self.getDatasetAnnotationsBatch` on large datasets. + + Args: + datasetId (str): Synapse ID for dataset folder. + dataset_file_ids (Sequence[str]): List of Synapse IDs + for dataset files/folders used to subset the table. + + Returns: + pd.DataFrame: Table of annotations. + """ + # Create data frame from annotations file view + with DatasetFileView(datasetId, self.syn) as fileview: + table = fileview.query() + + if dataset_file_ids: + table = table.loc[table.index.intersection(dataset_file_ids)] + + table = table.reset_index(drop=True) + + return table + + def _get_table_schema_by_cname(self, table_schema): + # assume no duplicate column names in the table + table_schema_by_cname = {} + + for col_record in table_schema: + # TODO clean up dictionary for compactness (e.g. remove redundant 'name' key) + table_schema_by_cname[col_record["name"]] = col_record + + return table_schema_by_cname + + +class TableOperations: + """ + Object to hold functions for various table operations specific to the Synapse Asset Store. 
+ + Currently implemented operations are: + createTable: upload a manifest as a new table when none exists + replaceTable: replace the metadata in a table from one manifest with metadata from another manifest + updateTable: add a column to a table that already exists on Synapse + + Operations currently in development are: + upsertTable: add metadata from a manifest to an existing table that contains metadata from another manifest + """ + + def __init__( + self, + synStore: SynapseStorage, + tableToLoad: pd.DataFrame = None, + tableName: str = None, + datasetId: str = None, + existingTableId: str = None, + restrict: bool = False, + ): + """ + Class governing table operations (creation, replacement, upserts, updates) in schematic + + tableToLoad: manifest formatted appropriately for the table + tableName: name of the table to be uploaded + datasetId: synID of the dataset for the manifest + existingTableId: synId of the table currently existing on Synapse (if there is one) + restrict: bool, whether or not the manifest contains sensitive data that will need additional access restrictions + + """ + self.synStore = synStore + self.tableToLoad = tableToLoad + self.tableName = tableName + self.datasetId = datasetId + self.existingTableId = existingTableId + self.restrict = restrict + + def createTable( + self, + columnTypeDict: dict = None, + specifySchema: bool = True, + ): + """ + Method to create a table from a metadata manifest and upload it to Synapse + + Args: + columnTypeDict: dictionary schema for table columns: type, size, etc + specifySchema: to specify a specific schema for the table format + + Returns: + table.schema.id: synID of the newly created table + """ + + datasetEntity = self.synStore.syn.get(self.datasetId, downloadFile=False) + datasetName = datasetEntity.name + table_schema_by_cname = self.synStore._get_table_schema_by_cname(columnTypeDict) + + if not self.tableName: + self.tableName = datasetName + "table" + datasetParentProject = self.synStore.getDatasetProject(self.datasetId) + if specifySchema: + if columnTypeDict == {}: + logger.error("Did not provide a columnTypeDict.") + # create list of columns: + cols = [] + for col in self.tableToLoad.columns: + if col in table_schema_by_cname: + col_type = table_schema_by_cname[col]["columnType"] + max_size = ( + table_schema_by_cname[col]["maximumSize"] + if "maximumSize" in table_schema_by_cname[col].keys() + else 100 + ) + max_list_len = 250 + if max_size and max_list_len: + cols.append( + Column( + name=col, + columnType=col_type, + maximumSize=max_size, + maximumListLength=max_list_len, + ) + ) + elif max_size: + cols.append( + Column(name=col, columnType=col_type, maximumSize=max_size) + ) + else: + cols.append(Column(name=col, columnType=col_type)) + else: + # TODO add warning that the given col was not found and its max size is set to 100 + cols.append(Column(name=col, columnType="STRING", maximumSize=100)) + schema = Schema( + name=self.tableName, columns=cols, parent=datasetParentProject + ) + table = Table(schema, self.tableToLoad) + table = self.synStore.syn.store(table, isRestricted=self.restrict) + return table.schema.id + else: + # For just uploading the tables to synapse using default + # column types.
+ table = build_table(self.tableName, datasetParentProject, self.tableToLoad) + table = self.synStore.syn.store(table, isRestricted=self.restrict) + return table.schema.id + + def replaceTable( + self, + specifySchema: bool = True, + columnTypeDict: dict = None, + ): + """ + Method to replace an existing table on synapse with metadata from a new manifest + + Args: + specifySchema: to infer a schema for the table format + columnTypeDict: dictionary schema for table columns: type, size, etc + + Returns: + existingTableId: synID of the already existing table that had its metadata replaced + """ + datasetEntity = self.synStore.syn.get(self.datasetId, downloadFile=False) + datasetName = datasetEntity.name + table_schema_by_cname = self.synStore._get_table_schema_by_cname(columnTypeDict) + existing_table, existing_results = self.synStore.get_synapse_table( + self.existingTableId + ) + # remove rows + self.synStore.syn.delete(existing_results) + # wait for row deletion to finish on synapse before getting empty table + sleep(10) + + # removes all current columns + current_table = self.synStore.syn.get(self.existingTableId) + current_columns = self.synStore.syn.getTableColumns(current_table) + for col in current_columns: + current_table.removeColumn(col) + + if not self.tableName: + self.tableName = datasetName + "table" + + # Process columns according to manifest entries + table_schema_by_cname = self.synStore._get_table_schema_by_cname(columnTypeDict) + datasetParentProject = self.synStore.getDatasetProject(self.datasetId) + if specifySchema: + if columnTypeDict == {}: + logger.error("Did not provide a columnTypeDict.") + # create list of columns: + cols = [] + + for col in self.tableToLoad.columns: + if col in table_schema_by_cname: + col_type = table_schema_by_cname[col]["columnType"] + max_size = ( + table_schema_by_cname[col]["maximumSize"] + if "maximumSize" in table_schema_by_cname[col].keys() + else 100 + ) + max_list_len = 250 + if max_size and max_list_len: + cols.append( + Column( + name=col, + columnType=col_type, + maximumSize=max_size, + maximumListLength=max_list_len, + ) + ) + elif max_size: + cols.append( + Column(name=col, columnType=col_type, maximumSize=max_size) + ) + else: + cols.append(Column(name=col, columnType=col_type)) + else: + # TODO add warning that the given col was not found and it's max size is set to 100 + cols.append(Column(name=col, columnType="STRING", maximumSize=100)) + + # adds new columns to schema + for col in cols: + current_table.addColumn(col) + self.synStore.syn.store(current_table, isRestricted=self.restrict) + + # wait for synapse store to finish + sleep(1) + + # build schema and table from columns and store with necessary restrictions + schema = Schema( + name=self.tableName, columns=cols, parent=datasetParentProject + ) + schema.id = self.existingTableId + table = Table(schema, self.tableToLoad, etag=existing_results.etag) + table = self.synStore.syn.store(table, isRestricted=self.restrict) + else: + logging.error("Must specify a schema for table replacements") + + # remove system metadata from manifest + existing_table.drop(columns=["ROW_ID", "ROW_VERSION"], inplace=True) + return self.existingTableId + + def _get_auth_token( + self, + ): + authtoken = None + + # Get access token from environment variable if available + # Primarily useful for testing environments, with other possible usefulness for containers + env_access_token = os.getenv("SYNAPSE_ACCESS_TOKEN") + if env_access_token: + authtoken = env_access_token + return authtoken + + # Get 
token from authorization header + # Primarily useful for API endpoint functionality + if "Authorization" in self.synStore.syn.default_headers: + authtoken = self.synStore.syn.default_headers["Authorization"].split( + "Bearer " + )[-1] + return authtoken + + # retrieve credentials from synapse object + # Primarily useful for local users, could only be stored here when a .synapseConfig file is used, but including to be safe + synapse_object_creds = self.synStore.syn.credentials + if hasattr(synapse_object_creds, "_token"): + authtoken = synapse_object_creds.secret + + # Try getting creds from .synapseConfig file if it exists + # Primarily useful for local users. Seems to correlate with credentials stored in synapse object when logged in + if os.path.exists(CONFIG.synapse_configuration_path): + config = self.synStore.syn.getConfigFile(CONFIG.synapse_configuration_path) + + # check which credentials are provided in file + if config.has_option("authentication", "authtoken"): + authtoken = config.get("authentication", "authtoken") + + # raise error if required credentials are not found + if not authtoken: + raise NameError( + "authtoken credentials could not be found in the environment, synapse object, or the .synapseConfig file" + ) + + return authtoken + + def upsertTable(self, dmge: DataModelGraphExplorer): + """ + Method to upsert rows from a new manifest into an existing table on Synapse + For upsert functionality to work, primary keys must follow the naming convention of _id + `-tm upsert` should be used for initial table uploads if users intend to upsert into them at a later time; using 'upsert' at creation will generate the metadata necessary for upsert functionality. + Currently it is required to use -dl/--use_display_label with table upserts. + + + Args: + dmge: DataModelGraphExplorer instance + + Returns: + existingTableId: synID of the already existing table that had its metadata replaced + """ + + authtoken = self._get_auth_token() + + synapseDB = SynapseDatabase( + auth_token=authtoken, + project_id=self.synStore.getDatasetProject(self.datasetId), + ) + + try: + # Try performing upsert + synapseDB.upsert_table_rows( + table_name=self.tableName, data=self.tableToLoad + ) + except SynapseHTTPError as ex: + # If error is raised because Table has old `Uuid` column and not new `Id` column, then handle and re-attempt upload + if "Id is not a valid column name or id" in str(ex): + self._update_table_uuid_column(dmge) + synapseDB.upsert_table_rows( + table_name=self.tableName, data=self.tableToLoad + ) + # Raise if other error + else: + raise ex + + return self.existingTableId + + def _update_table_uuid_column( + self, + dmge: DataModelGraphExplorer, + ) -> None: + """Removes the `Uuid` column when present, and replaces it with an `Id` column + Used to enable backwards compatibility for manifests using the old `Uuid` convention + + Args: + dmge: DataModelGraphExplorer instance + + Returns: + None + """ + + # Get the columns of the schema + schema = self.synStore.syn.get(self.existingTableId) + cols = self.synStore.syn.getTableColumns(schema) + + # Iterate through columns until `Uuid` column is found + for col in cols: + if col.name.lower() == "uuid": + # See if schema has `Uuid` column specified + try: + uuid_col_in_schema = dmge.is_class_in_schema(col.name) + except KeyError: + uuid_col_in_schema = False + + # If there is, then create a new `Id` column from scratch + if uuid_col_in_schema: + new_col = Column(columnType="STRING", maximumSize=64, name="Id") + schema.addColumn(new_col) + schema
= self.synStore.syn.store(schema) + # If there is not, then use the old `Uuid` column as a basis for the new `Id` column + else: + # Build ColumnModel that will be used for new column + id_column = Column( + name="Id", + columnType="STRING", + maximumSize=64, + defaultValue=None, + maximumListLength=1, + ) + new_col_response = self.synStore.syn.store(id_column) + + # Define columnChange body + columnChangeDict = { + "concreteType": "org.sagebionetworks.repo.model.table.TableSchemaChangeRequest", + "entityId": self.existingTableId, + "changes": [ + { + "oldColumnId": col["id"], + "newColumnId": new_col_response["id"], + } + ], + } + + self.synStore.syn._async_table_update( + table=self.existingTableId, + changes=[columnChangeDict], + wait=False, + ) + break + + return + + def updateTable( + self, + update_col: str = "Id", + ): + """ + Method to update an existing table with a new column + + Args: + updateCol: column to index the old and new tables on + + Returns: + existingTableId: synID of the already existing table that had its metadata replaced + """ + existing_table, existing_results = self.synStore.get_synapse_table( + self.existingTableId + ) + + self.tableToLoad = update_df(existing_table, self.tableToLoad, update_col) + # store table with existing etag data and impose restrictions as appropriate + self.synStore.syn.store( + Table(self.existingTableId, self.tableToLoad, etag=existing_results.etag), + isRestricted=self.restrict, + ) + + return self.existingTableId + + +class DatasetFileView: + """Helper class to create temporary dataset file views. + This class can be used in conjunction with a 'with' statement. + This will ensure that the file view is deleted automatically. + See SynapseStorage.getDatasetAnnotationsBatch for example usage. + """ + + def __init__( + self, + datasetId: str, + synapse: Synapse, + name: str = None, + temporary: bool = True, + parentId: str = None, + ) -> None: + """Create a file view scoped to a dataset folder. + + Args: + datasetId (str): Synapse ID for a dataset folder/project. + synapse (Synapse): Used for Synapse requests. + name (str): Name of the file view (temporary or not). + temporary (bool): Whether to delete the file view on exit + of either a 'with' statement or Python entirely. + parentId (str, optional): Synapse ID specifying where to + store the file view. Defaults to datasetId. + """ + + self.datasetId = datasetId + self.synapse = synapse + self.is_temporary = temporary + + if name is None: + self.name = f"schematic annotation file view for {self.datasetId}" + + if self.is_temporary: + uid = secrets.token_urlsafe(5) + self.name = f"{self.name} - UID {uid}" + + # TODO: Allow a DCC admin to configure a "universal parent" + # Such as a Synapse project writeable by everyone. 
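For orientation, the usage pattern DatasetFileView is designed for, and which getDatasetAnnotationsBatch follows, looks roughly like the sketch below; the dataset ID "syn12345678" and the logged-in Synapse client `syn` are placeholders.

    # Entering the 'with' block creates the temporary file view; leaving it
    # (or interpreter exit, via the atexit hook registered in __init__)
    # deletes the view again.
    with DatasetFileView("syn12345678", syn) as fileview:
        annotations_table = fileview.query()  # tidied pandas DataFrame of annotations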
+ self.parentId = datasetId if parentId is None else parentId + + # TODO: Create local sharing setting to hide from everyone else + view_schema = EntityViewSchema( + name=self.name, + parent=self.parentId, + scopes=self.datasetId, + includeEntityTypes=[EntityViewType.FILE, EntityViewType.FOLDER], + addDefaultViewColumns=False, + addAnnotationColumns=True, + ) + + # TODO: Handle failure due to insufficient permissions by + # creating a temporary new project to store view + self.view_schema = self.synapse.store(view_schema) + + # These are filled in after calling `self.query()` + self.results = None + self.table = None + + # Ensure deletion of the file view (last resort) + if self.is_temporary: + atexit.register(self.delete) + + def __enter__(self): + """Return file view when entering 'with' statement.""" + return self + + def __exit__(self, exc_type, exc_value, traceback): + """Delete file view when exiting 'with' statement.""" + if self.is_temporary: + self.delete() + + def delete(self): + """Delete the file view on Synapse without deleting local table.""" + if self.view_schema is not None: + self.synapse.delete(self.view_schema) + self.view_schema = None + + def query(self, tidy=True, force=False): + """Retrieve file view as a data frame (raw format sans index).""" + if self.table is None or force: + fileview_id = self.view_schema["id"] + self.results = self.synapse.tableQuery(f"select * from {fileview_id}") + self.table = self.results.asDataFrame(rowIdAndVersionInIndex=False) + if tidy: + self.tidy_table() + return self.table + + def tidy_table(self): + """Convert raw file view data frame into more usable format.""" + assert self.table is not None, "Must call `self.query()` first." + self._fix_default_columns() + self._fix_list_columns() + self._fix_int_columns() + return self.table + + def _fix_default_columns(self): + """Rename default columns to match schematic expectations.""" + + # Drop ROW_VERSION column if present + if "ROW_VERSION" in self.table: + del self.table["ROW_VERSION"] + + # Rename id column to entityId and set as data frame index + if "ROW_ID" in self.table: + self.table["entityId"] = "syn" + self.table["ROW_ID"].astype(str) + self.table = self.table.set_index("entityId", drop=False) + del self.table["ROW_ID"] + + # Rename ROW_ETAG column to eTag and place at end of data frame + if "ROW_ETAG" in self.table: + row_etags = self.table.pop("ROW_ETAG") + self.table.insert(len(self.table.columns), "eTag", row_etags) + + return self.table + + def _get_columns_of_type(self, types): + """Helper function to get list of columns of a given type(s).""" + matching_columns = [] + for header in self.results.headers: + if header.columnType in types: + matching_columns.append(header.name) + return matching_columns + + def _fix_list_columns(self): + """Fix formatting of list-columns.""" + list_types = {"STRING_LIST", "INTEGER_LIST", "BOOLEAN_LIST"} + list_columns = self._get_columns_of_type(list_types) + for col in list_columns: + self.table[col] = self.table[col].apply(lambda x: ", ".join(x)) + return self.table + + def _fix_int_columns(self): + """Ensure that integer-columns are actually integers.""" + int_columns = self._get_columns_of_type({"INTEGER"}) + for col in int_columns: + # Coercing to string because NaN is a floating point value + # and cannot exist alongside integers in a column + to_int_fn = lambda x: "" if np.isnan(x) else str(int(x)) + self.table[col] = self.table[col].apply(to_int_fn) + return self.table From 297abb1a9b16e26508831f13037de7a0b310c558 Mon Sep 17 00:00:00 
2001 From: linglp Date: Wed, 5 Jun 2024 11:42:39 -0400 Subject: [PATCH 036/110] edit tracing --- schematic/manifest/generator.py | 8 ++++++-- schematic/schemas/data_model_graph.py | 5 +++++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/schematic/manifest/generator.py b/schematic/manifest/generator.py index d29c95f13..77ddc085a 100644 --- a/schematic/manifest/generator.py +++ b/schematic/manifest/generator.py @@ -1575,8 +1575,10 @@ def _handle_output_format_logic( # Default return a DataFrame else: return dataframe - + + @staticmethod + @tracer.start_as_current_span("ManifestGenerator::create_single_manifest") def create_single_manifest( path_to_data_model: str, graph_data_model: nx.MultiDiGraph, @@ -1630,6 +1632,7 @@ def create_single_manifest( return result @staticmethod + @tracer.start_as_current_span("ManifestGenerator::create_manifests") def create_manifests( path_to_data_model: str, data_types: list, @@ -1757,7 +1760,8 @@ def create_manifests( return result return all_results - + + @tracer.start_as_current_span("ManifestGenerator::get_manifest") def get_manifest( self, dataset_id: str = None, diff --git a/schematic/schemas/data_model_graph.py b/schematic/schemas/data_model_graph.py index 59bc5e96e..cc3b7dd94 100644 --- a/schematic/schemas/data_model_graph.py +++ b/schematic/schemas/data_model_graph.py @@ -19,9 +19,13 @@ from schematic.utils.general import unlist from schematic.utils.viz_utils import visualize from schematic.utils.validate_utils import rule_in_rule_list +from opentelemetry import trace + +logger = logging.getLogger(__name__) logger = logging.getLogger(__name__) +tracer = trace.get_tracer("schemas::DataModelGraph") class DataModelGraphMeta: # pylint: disable=too-few-public-methods @@ -85,6 +89,7 @@ def __init__( ) self.graph = self.generate_data_model_graph() + @tracer.start_as_current_span("DataModelGraph::generate_data_model_graph") def generate_data_model_graph(self) -> nx.MultiDiGraph: """ Generate NetworkX Graph from the Relationships/attributes dictionary, the graph is built From 0185d13482775e5b0d5365cf89477b3b6c78f6d7 Mon Sep 17 00:00:00 2001 From: linglp Date: Wed, 5 Jun 2024 11:44:22 -0400 Subject: [PATCH 037/110] update config --- config_example.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/config_example.yml b/config_example.yml index 245b8fefe..9125cb6bb 100644 --- a/config_example.yml +++ b/config_example.yml @@ -35,8 +35,6 @@ model: # This section is for using google sheets with Schematic google_sheets: - # The Synapse id of the Google service account credentials. - service_acct_creds_synapse_id: "syn25171627" # Path to the synapse config file, either absolute or relative to this file service_acct_creds: "schematic_service_account_creds.json" # When doing google sheet validation (regex match) with the validation rules. From c428afd47a80eee9ecb0dc339b1e2cab5133b6f6 Mon Sep 17 00:00:00 2001 From: linglp Date: Wed, 5 Jun 2024 11:45:06 -0400 Subject: [PATCH 038/110] update config in readme --- README.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/README.md b/README.md index 228e0e779..3d0bf04ca 100644 --- a/README.md +++ b/README.md @@ -141,8 +141,6 @@ model: # This section is for using google sheets with Schematic google_sheets: - # The Synapse id of the Google service account credentials. 
- service_acct_creds_synapse_id: "syn25171627" # Path to the synapse config file, either absolute or relative to this file service_acct_creds: "schematic_service_account_creds.json" # When doing google sheet validation (regex match) with the validation rules. From d33ec25be3806ce90664db8cd9e9c1008257cf2a Mon Sep 17 00:00:00 2001 From: linglp Date: Wed, 5 Jun 2024 11:51:06 -0400 Subject: [PATCH 039/110] remove service_acc_creds_synapse_id --- schematic/configuration/configuration.py | 8 -------- schematic/configuration/dataclasses.py | 6 +----- tests/data/test_configs/default_config.yml | 1 - tests/data/test_configs/valid_config.yml | 1 - tests/data/test_configs/valid_config2.yml | 1 - tests/test_configuration.py | 4 ---- 6 files changed, 1 insertion(+), 20 deletions(-) diff --git a/schematic/configuration/configuration.py b/schematic/configuration/configuration.py index 1bd3f1c40..63bf55313 100644 --- a/schematic/configuration/configuration.py +++ b/schematic/configuration/configuration.py @@ -164,14 +164,6 @@ def model_location(self) -> str: """ return self._model_config.location - @property - def service_account_credentials_synapse_id(self) -> str: - """ - Returns: - str: The Synapse id of the Google service account credentials. - """ - return self._google_sheets_config.service_acct_creds_synapse_id - @property def service_account_credentials_path(self) -> str: """ diff --git a/schematic/configuration/dataclasses.py b/schematic/configuration/dataclasses.py index 7fbc7df57..1ffe226b3 100644 --- a/schematic/configuration/dataclasses.py +++ b/schematic/configuration/dataclasses.py @@ -124,12 +124,9 @@ class GoogleSheetsConfig: strict_validation: When doing google sheet validation (regex match) with the validation rules. True is alerting the user and not allowing entry of bad values. False is warning but allowing the entry on to the sheet. - service_acct_creds_synapse_id: The Synapse id of the Google service account credentials. 
service_acct_creds: Path to the Google service account credentials, either absolute or relative to this file """ - - service_acct_creds_synapse_id: str = "syn25171627" service_acct_creds: str = "schematic_service_account_creds.json" strict_validation: bool = True @@ -150,8 +147,7 @@ def validate_string_is_not_empty(cls, value: str) -> str: if not value: raise ValueError(f"{value} is an empty string") return value - - @validator("service_acct_creds_synapse_id") + @classmethod def validate_synapse_id(cls, value: str) -> str: """Check if string is a valid synapse id diff --git a/tests/data/test_configs/default_config.yml b/tests/data/test_configs/default_config.yml index 6775b569a..5a1785dc2 100644 --- a/tests/data/test_configs/default_config.yml +++ b/tests/data/test_configs/default_config.yml @@ -16,6 +16,5 @@ model: location: 'tests/data/example.model.jsonld' google_sheets: - service_acct_creds_synapse_id: 'syn25171627' service_acct_creds: "schematic_service_account_creds.json" strict_validation: true diff --git a/tests/data/test_configs/valid_config.yml b/tests/data/test_configs/valid_config.yml index 3e340721c..456c3ccd7 100644 --- a/tests/data/test_configs/valid_config.yml +++ b/tests/data/test_configs/valid_config.yml @@ -16,6 +16,5 @@ model: location: "model.jsonld" google_sheets: - service_acct_creds_synapse_id: "syn1" service_acct_creds: "creds.json" strict_validation: false diff --git a/tests/data/test_configs/valid_config2.yml b/tests/data/test_configs/valid_config2.yml index 78306ee18..e1c85ab4a 100644 --- a/tests/data/test_configs/valid_config2.yml +++ b/tests/data/test_configs/valid_config2.yml @@ -10,6 +10,5 @@ model: location: "model.jsonld" google_sheets: - service_acct_creds_synapse_id: "syn1" service_acct_creds: "creds.json" strict_validation: false diff --git a/tests/test_configuration.py b/tests/test_configuration.py index 8845a9b48..3140148f3 100644 --- a/tests/test_configuration.py +++ b/tests/test_configuration.py @@ -80,7 +80,6 @@ def test_google_sheets_config(self) -> None: assert isinstance( GoogleSheetsConfig( service_acct_creds="file_name", - service_acct_creds_synapse_id="syn1", strict_validation=True, ), GoogleSheetsConfig, @@ -88,19 +87,16 @@ def test_google_sheets_config(self) -> None: with pytest.raises(ValidationError): GoogleSheetsConfig( service_acct_creds="file_name", - service_acct_creds_synapse_id="syn1", strict_validation="tru", ) with pytest.raises(ValidationError): GoogleSheetsConfig( service_acct_creds="", - service_acct_creds_synapse_id="syn1", strict_validation=True, ) with pytest.raises(ValidationError): GoogleSheetsConfig( service_acct_creds="file_name", - service_acct_creds_synapse_id="syn", strict_validation=True, ) From a1edc5a71583f6e35aa80625c6f540f30fbe5d7b Mon Sep 17 00:00:00 2001 From: linglp Date: Wed, 5 Jun 2024 12:14:00 -0400 Subject: [PATCH 040/110] remove test that is no longer relevant --- tests/test_configuration.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/tests/test_configuration.py b/tests/test_configuration.py index 3140148f3..7a27c7a34 100644 --- a/tests/test_configuration.py +++ b/tests/test_configuration.py @@ -94,11 +94,6 @@ def test_google_sheets_config(self) -> None: service_acct_creds="", strict_validation=True, ) - with pytest.raises(ValidationError): - GoogleSheetsConfig( - service_acct_creds="file_name", - strict_validation=True, - ) class TestConfiguration: @@ -116,7 +111,6 @@ def test_init(self) -> None: assert config.manifest_title == "example" assert config.manifest_data_type == ["Biospecimen", 
"Patient"] assert config.model_location == "tests/data/example.model.jsonld" - assert config.service_account_credentials_synapse_id assert ( config.service_account_credentials_path != "schematic_service_account_creds.json" @@ -154,7 +148,6 @@ def test_load_config1(self) -> None: assert config.manifest_title == "example" assert config.manifest_data_type == ["Biospecimen", "Patient"] assert config.model_location == "tests/data/example.model.jsonld" - assert config.service_account_credentials_synapse_id assert ( config.service_account_credentials_path != "schematic_service_account_creds.json" @@ -184,7 +177,6 @@ def test_load_config2(self) -> None: assert config.manifest_title == "title" assert config.manifest_data_type == ["data_type"] assert config.model_location == "model.jsonld" - assert config.service_account_credentials_synapse_id assert os.path.basename(config.service_account_credentials_path) == "creds.json" assert config.google_sheets_master_template_id == ( "1LYS5qE4nV9jzcYw5sXwCza25slDfRA1CIg3cs-hCdpU" From c31425226fabf7aee9d0c0302acdd8107ffca114 Mon Sep 17 00:00:00 2001 From: linglp Date: Wed, 5 Jun 2024 12:14:43 -0400 Subject: [PATCH 041/110] remove download creds function --- schematic/utils/google_api_utils.py | 34 ----------------------------- 1 file changed, 34 deletions(-) diff --git a/schematic/utils/google_api_utils.py b/schematic/utils/google_api_utils.py index 9a5d870ca..abf12bfd0 100644 --- a/schematic/utils/google_api_utils.py +++ b/schematic/utils/google_api_utils.py @@ -66,40 +66,6 @@ def build_service_account_creds() -> GoogleServiceAcountCreds: } return creds - -def download_creds_file() -> None: - """Download google credentials file""" - syn = SynapseStorage.login() - - # if file path of service_account does not exist - # and if an environment variable related to service account is not found - # regenerate service_account credentials - if ( - not os.path.exists(CONFIG.service_account_credentials_path) - and "SERVICE_ACCOUNT_CREDS" not in os.environ - ): - # synapse ID of the 'schematic_service_account_creds.json' file - api_creds = CONFIG.service_account_credentials_synapse_id - - # Download in parent directory of SERVICE_ACCT_CREDS to - # ensure same file system for os.rename() - creds_dir = os.path.dirname(CONFIG.service_account_credentials_path) - - creds_file = syn.get(api_creds, downloadLocation=creds_dir) - os.rename(creds_file.path, CONFIG.service_account_credentials_path) - - logger.info( - "The credentials file has been downloaded " - f"to '{CONFIG.service_account_credentials_path}'" - ) - - elif "SERVICE_ACCOUNT_CREDS" in os.environ: - # remind users that "SERVICE_ACCOUNT_CREDS" as an environment variable is being used - logger.info( - "Using environment variable SERVICE_ACCOUNT_CREDS as the credential file." 
- ) - - @no_type_check def execute_google_api_requests(service, requests_body, **kwargs) -> Any: """ From caf267f1a256b1138f169aff0021d42bd1bfbed2 Mon Sep 17 00:00:00 2001 From: linglp Date: Wed, 5 Jun 2024 12:26:00 -0400 Subject: [PATCH 042/110] reformat code --- schematic/configuration/dataclasses.py | 3 ++- schematic/utils/google_api_utils.py | 6 ++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/schematic/configuration/dataclasses.py b/schematic/configuration/dataclasses.py index 1ffe226b3..4b3d1560f 100644 --- a/schematic/configuration/dataclasses.py +++ b/schematic/configuration/dataclasses.py @@ -127,6 +127,7 @@ class GoogleSheetsConfig: service_acct_creds: Path to the Google service account credentials, either absolute or relative to this file """ + service_acct_creds: str = "schematic_service_account_creds.json" strict_validation: bool = True @@ -147,7 +148,7 @@ def validate_string_is_not_empty(cls, value: str) -> str: if not value: raise ValueError(f"{value} is an empty string") return value - + @classmethod def validate_synapse_id(cls, value: str) -> str: """Check if string is a valid synapse id diff --git a/schematic/utils/google_api_utils.py b/schematic/utils/google_api_utils.py index abf12bfd0..400372c11 100644 --- a/schematic/utils/google_api_utils.py +++ b/schematic/utils/google_api_utils.py @@ -36,7 +36,12 @@ def build_service_account_creds() -> GoogleServiceAcountCreds: Returns: GoogleServiceAcountCreds: The credentials """ + # del os.environ['SERVICE_ACCOUNT_CREDS'] if "SERVICE_ACCOUNT_CREDS" in os.environ: + del os.environ["SERVICE_ACCOUNT_CREDS"] + + if "SERVICE_ACCOUNT_CREDS" in os.environ: + breakpoint() dict_creds = json.loads(os.environ["SERVICE_ACCOUNT_CREDS"]) credentials = service_account.Credentials.from_service_account_info( dict_creds, scopes=SCOPES @@ -66,6 +71,7 @@ def build_service_account_creds() -> GoogleServiceAcountCreds: } return creds + @no_type_check def execute_google_api_requests(service, requests_body, **kwargs) -> Any: """ From 71392bf366589894988f0f851454511ae6e20734 Mon Sep 17 00:00:00 2001 From: linglp Date: Wed, 5 Jun 2024 12:33:13 -0400 Subject: [PATCH 043/110] remove breakpoint --- schematic/utils/google_api_utils.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/schematic/utils/google_api_utils.py b/schematic/utils/google_api_utils.py index 400372c11..e1ed13905 100644 --- a/schematic/utils/google_api_utils.py +++ b/schematic/utils/google_api_utils.py @@ -36,12 +36,7 @@ def build_service_account_creds() -> GoogleServiceAcountCreds: Returns: GoogleServiceAcountCreds: The credentials """ - # del os.environ['SERVICE_ACCOUNT_CREDS'] if "SERVICE_ACCOUNT_CREDS" in os.environ: - del os.environ["SERVICE_ACCOUNT_CREDS"] - - if "SERVICE_ACCOUNT_CREDS" in os.environ: - breakpoint() dict_creds = json.loads(os.environ["SERVICE_ACCOUNT_CREDS"]) credentials = service_account.Credentials.from_service_account_info( dict_creds, scopes=SCOPES From c33d1bbd73e5cb7152a0a53fa118024d8105c3cd Mon Sep 17 00:00:00 2001 From: linglp Date: Wed, 5 Jun 2024 12:34:22 -0400 Subject: [PATCH 044/110] remove unused import --- schematic/utils/google_api_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/schematic/utils/google_api_utils.py b/schematic/utils/google_api_utils.py index e1ed13905..b705e0419 100644 --- a/schematic/utils/google_api_utils.py +++ b/schematic/utils/google_api_utils.py @@ -11,7 +11,6 @@ from googleapiclient.discovery import build, Resource # type: ignore from google.oauth2 import service_account # type: ignore from 
schematic.configuration.configuration import CONFIG -from schematic.store.synapse import SynapseStorage logger = logging.getLogger(__name__) From 9c25e90c1a5093a9d11f84cd7157162780bd58cb Mon Sep 17 00:00:00 2001 From: GiaJordan Date: Wed, 5 Jun 2024 10:17:08 -0700 Subject: [PATCH 045/110] add maintainers to Dockerfile --- schematic_api/Dockerfile | 248 ++++++++++++++++++++------------------- 1 file changed, 128 insertions(+), 120 deletions(-) diff --git a/schematic_api/Dockerfile b/schematic_api/Dockerfile index 53f63533f..316fcff67 100644 --- a/schematic_api/Dockerfile +++ b/schematic_api/Dockerfile @@ -1,120 +1,128 @@ -FROM tiangolo/uwsgi-nginx-flask:python3.10 - -# add version tag as a build argument -ARG TAG - -# the environment variables defined here are the default -# and can be overwritten by docker run -e VARIABLE = XX -# or can be overwritten by .env when using docker compose -ENV PYTHONFAULTHANDLER=1 \ - PYTHONUNBUFFERED=1 \ - PYTHONHASHSEED=random \ - PIP_NO_CACHE_DIR=off \ - PIP_DISABLE_PIP_VERSION_CHECK=on \ - PIP_DEFAULT_TIMEOUT=200 \ - POETRY_VERSION=1.3.0 \ - APP_PARENT_DIR=/app \ - NGINX_CONFIG=/etc/nginx/conf.d \ - APP_DIR=/app/app \ - ROOT=/ \ - UWSGI_INI=/app/uwsgi.ini \ - NGINX_WORKER_PROCESSES=1 \ - VERSION=$TAG - -# Note: -# The starting number of uWSGI processes is controlled by the variable UWSGI_CHEAPER, by default set to 2. -# The maximum number of uWSGI processes is controlled by the variable UWSGI_PROCESSES, by default set to 16 -# By default, the image starts with 2 uWSGI processes running. When the server is experiencing a high load, it creates up to 16 uWSGI processes to handle it on demand. -# NGINX_MAX_UPLOAD is set to 0 by default that allows unlimited upload file sizes -# NGINX_WORKER_CONNECTIONS is set to 1024 by default that allows a maximum limit of 1024 connections per worker. 
-# NGINX_WORKER_OPEN_FILES is set to 2048 by default that allows 2048 open files - -# run open ssl and generate certificate -RUN apt update && \ - apt-get install openssl && \ - openssl req -x509 -nodes -days 365 \ - -subj "/C=CA/ST=QC/O=Company" \ - -newkey rsa:2048 -keyout /etc/ssl/private/localhost.key \ - -out /etc/ssl/certs/localhost.crt; - -# add dhparam.pem -# this step takes quite some time -RUN openssl dhparam -out /etc/ssl/dhparam.pem 4096 - -# copy config files that handle encryption to docker -WORKDIR ${NGINX_CONFIG} -COPY ./self-signed.conf ./ssl-params.conf ./certificate.conf ./ - - -# use custom uwsgi-nginx-entrypoint.sh -# this uwsgi-nginx-entrypoint.sh file is derived from: https://github.com/tiangolo/uwsgi-nginx-flask-docker/blob/master/docker-images/entrypoint.sh -# we have to modify it so that we could generate a different /etc/nginx/conf.d/nginx.conf file -WORKDIR ${ROOT} -COPY ./uwsgi-nginx-entrypoint.sh ./entrypoint2.sh -COPY ./uwsgi-nginx-entrypoint.sh ./uwsgi-nginx-entrypoint2.sh -RUN chmod +x uwsgi-nginx-entrypoint2.sh -RUN chmod +x entrypoint2.sh -RUN chown -R nginx /uwsgi-nginx-entrypoint2.sh -RUN chown -R nginx /entrypoint2.sh - -# install poetry -RUN pip install --no-cache-dir "poetry==$POETRY_VERSION" - -# set work directory -WORKDIR ${APP_PARENT_DIR} -RUN chown www-data:www-data ${APP_PARENT_DIR} - -# remove the old uwsgi.ini and main.py from the original image -RUN rm -rf ${APP_PARENT_DIR}/main.py -RUN rm -rf ${APP_PARENT_DIR}/uwsgi.ini - -# copy to use custom uwsgi.ini -COPY ./uwsgi.ini ./ - -# create a separate folder called app -RUN mkdir app -WORKDIR ${APP_DIR} - -# copy other files to app/app -# Note: run_api.py is not needed - -COPY ./pyproject.toml ./poetry.lock ./main.py ./ -COPY ./config_example.yml ./config.yml -RUN poetry config virtualenvs.create false -RUN poetry install --no-interaction --all-extras --no-root - -# copy schematic_api folder -COPY schematic_api ./schematic_api - -# copy great_expectations folder -COPY great_expectations ./great_expectations - -# copy tests folder because some endpoints by default download to the tests folder -COPY tests ./tests - -# change permission -RUN chown -R www-data:www-data ${APP_DIR} - -# allow downloading to synapse cache -RUN chown -R www-data:www-data /root - -# copy schematic -COPY schematic ./schematic - -# change permission -WORKDIR /var/www/ -#The -R option: make the command recursive, so it will change the owner of all files and subdirectories within a given folder. 
-RUN chown -R www-data:www-data /var/www/ - -RUN chown -R www-data:www-data /var/tmp/ - -# change work directory back -WORKDIR ${APP_DIR} - -# specify entrypoint again to generate config -# have to respecify CMD too -ENTRYPOINT ["/entrypoint2.sh"] -CMD ["/start.sh"] - -# Expose ports -EXPOSE 443 +FROM tiangolo/uwsgi-nginx-flask:python3.10 + +# add version tag as a build argument +ARG TAG + +# the environment variables defined here are the default +# and can be overwritten by docker run -e VARIABLE = XX +# or can be overwritten by .env when using docker compose +ENV PYTHONFAULTHANDLER=1 \ + PYTHONUNBUFFERED=1 \ + PYTHONHASHSEED=random \ + PIP_NO_CACHE_DIR=off \ + PIP_DISABLE_PIP_VERSION_CHECK=on \ + PIP_DEFAULT_TIMEOUT=200 \ + POETRY_VERSION=1.3.0 \ + APP_PARENT_DIR=/app \ + NGINX_CONFIG=/etc/nginx/conf.d \ + APP_DIR=/app/app \ + ROOT=/ \ + UWSGI_INI=/app/uwsgi.ini \ + NGINX_WORKER_PROCESSES=1 \ + VERSION=$TAG + + + +LABEL maintainer="Lingling Peng \ + Andrew Lamb \ + Gianna Jordan " + + + +# Note: +# The starting number of uWSGI processes is controlled by the variable UWSGI_CHEAPER, by default set to 2. +# The maximum number of uWSGI processes is controlled by the variable UWSGI_PROCESSES, by default set to 16 +# By default, the image starts with 2 uWSGI processes running. When the server is experiencing a high load, it creates up to 16 uWSGI processes to handle it on demand. +# NGINX_MAX_UPLOAD is set to 0 by default that allows unlimited upload file sizes +# NGINX_WORKER_CONNECTIONS is set to 1024 by default that allows a maximum limit of 1024 connections per worker. +# NGINX_WORKER_OPEN_FILES is set to 2048 by default that allows 2048 open files + +# run open ssl and generate certificate +RUN apt update && \ + apt-get install openssl && \ + openssl req -x509 -nodes -days 365 \ + -subj "/C=CA/ST=QC/O=Company" \ + -newkey rsa:2048 -keyout /etc/ssl/private/localhost.key \ + -out /etc/ssl/certs/localhost.crt; + +# add dhparam.pem +# this step takes quite some time +RUN openssl dhparam -out /etc/ssl/dhparam.pem 4096 + +# copy config files that handle encryption to docker +WORKDIR ${NGINX_CONFIG} +COPY ./self-signed.conf ./ssl-params.conf ./certificate.conf ./ + + +# use custom uwsgi-nginx-entrypoint.sh +# this uwsgi-nginx-entrypoint.sh file is derived from: https://github.com/tiangolo/uwsgi-nginx-flask-docker/blob/master/docker-images/entrypoint.sh +# we have to modify it so that we could generate a different /etc/nginx/conf.d/nginx.conf file +WORKDIR ${ROOT} +COPY ./uwsgi-nginx-entrypoint.sh ./entrypoint2.sh +COPY ./uwsgi-nginx-entrypoint.sh ./uwsgi-nginx-entrypoint2.sh +RUN chmod +x uwsgi-nginx-entrypoint2.sh +RUN chmod +x entrypoint2.sh +RUN chown -R nginx /uwsgi-nginx-entrypoint2.sh +RUN chown -R nginx /entrypoint2.sh + +# install poetry +RUN pip install --no-cache-dir "poetry==$POETRY_VERSION" + +# set work directory +WORKDIR ${APP_PARENT_DIR} +RUN chown www-data:www-data ${APP_PARENT_DIR} + +# remove the old uwsgi.ini and main.py from the original image +RUN rm -rf ${APP_PARENT_DIR}/main.py +RUN rm -rf ${APP_PARENT_DIR}/uwsgi.ini + +# copy to use custom uwsgi.ini +COPY ./uwsgi.ini ./ + +# create a separate folder called app +RUN mkdir app +WORKDIR ${APP_DIR} + +# copy other files to app/app +# Note: run_api.py is not needed + +COPY ./pyproject.toml ./poetry.lock ./main.py ./ +COPY ./config_example.yml ./config.yml +RUN poetry config virtualenvs.create false +RUN poetry install --no-interaction --all-extras --no-root + +# copy schematic_api folder +COPY schematic_api ./schematic_api + 
+# copy great_expectations folder +COPY great_expectations ./great_expectations + +# copy tests folder because some endpoints by default download to the tests folder +COPY tests ./tests + +# change permission +RUN chown -R www-data:www-data ${APP_DIR} + +# allow downloading to synapse cache +RUN chown -R www-data:www-data /root + +# copy schematic +COPY schematic ./schematic + +# change permission +WORKDIR /var/www/ +#The -R option: make the command recursive, so it will change the owner of all files and subdirectories within a given folder. +RUN chown -R www-data:www-data /var/www/ + +RUN chown -R www-data:www-data /var/tmp/ + +# change work directory back +WORKDIR ${APP_DIR} + +# specify entrypoint again to generate config +# have to respecify CMD too +ENTRYPOINT ["/entrypoint2.sh"] +CMD ["/start.sh"] + +# Expose ports +EXPOSE 443 From 2fdc075f9b418d6c99eaad1f9ed3d6e9332b17e9 Mon Sep 17 00:00:00 2001 From: GiaJordan Date: Wed, 5 Jun 2024 10:31:42 -0700 Subject: [PATCH 046/110] add version tag to labels --- schematic_api/Dockerfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/schematic_api/Dockerfile b/schematic_api/Dockerfile index 316fcff67..02983eae0 100644 --- a/schematic_api/Dockerfile +++ b/schematic_api/Dockerfile @@ -26,7 +26,8 @@ ENV PYTHONFAULTHANDLER=1 \ LABEL maintainer="Lingling Peng \ Andrew Lamb \ Gianna Jordan " - + +LABEL version=$TAG # Note: From 0668cb77ecca355a786f41a5febc95dd4a72ea43 Mon Sep 17 00:00:00 2001 From: GiaJordan Date: Wed, 5 Jun 2024 11:11:51 -0700 Subject: [PATCH 047/110] update maintainer format --- schematic_api/Dockerfile | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/schematic_api/Dockerfile b/schematic_api/Dockerfile index 02983eae0..a564b61ec 100644 --- a/schematic_api/Dockerfile +++ b/schematic_api/Dockerfile @@ -23,9 +23,7 @@ ENV PYTHONFAULTHANDLER=1 \ -LABEL maintainer="Lingling Peng \ - Andrew Lamb \ - Gianna Jordan " +LABEL maintainer="Lingling Peng Andrew Lamb " LABEL version=$TAG From d1aafac1b0f13de8ffbde21ffe4ea10bc0e2724d Mon Sep 17 00:00:00 2001 From: GiaJordan Date: Thu, 6 Jun 2024 10:05:27 -0700 Subject: [PATCH 048/110] readd email, formatting --- schematic_api/Dockerfile | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/schematic_api/Dockerfile b/schematic_api/Dockerfile index a564b61ec..15e979dee 100644 --- a/schematic_api/Dockerfile +++ b/schematic_api/Dockerfile @@ -23,8 +23,7 @@ ENV PYTHONFAULTHANDLER=1 \ -LABEL maintainer="Lingling Peng Andrew Lamb " - +LABEL maintainer="Lingling Peng Andrew Lamb Gianna Jordan " LABEL version=$TAG From 71b474e3fcb0cc43e6178c69cdb00475c9cbe180 Mon Sep 17 00:00:00 2001 From: linglp Date: Thu, 6 Jun 2024 14:30:21 -0400 Subject: [PATCH 049/110] add tracing related to manifest submission --- schematic/models/metadata.py | 3 +++ schematic/store/synapse.py | 1 + schematic_api/api/routes.py | 2 ++ 3 files changed, 6 insertions(+) diff --git a/schematic/models/metadata.py b/schematic/models/metadata.py index 4bb00e37b..76e2ee991 100644 --- a/schematic/models/metadata.py +++ b/schematic/models/metadata.py @@ -19,9 +19,11 @@ from schematic.utils.df_utils import load_df from schematic.models.validate_manifest import validate_all +from opentelemetry import trace logger = logging.getLogger(__name__) +tracer = trace.get_tracer("metadata::MetadataModel") class MetadataModel(object): """Metadata model wrapper around schema.org specification graph. 
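For context, the tracing added in this patch wraps whole functions in OpenTelemetry spans by using `start_as_current_span` as a decorator; a minimal sketch of the same pattern, with placeholder tracer and span names, is:

    from opentelemetry import trace

    tracer = trace.get_tracer("example::Tracer")

    @tracer.start_as_current_span("Example::do_work")
    def do_work() -> None:
        # Everything executed in the function body is recorded under this span.
        ...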
@@ -317,6 +319,7 @@ def populateModelManifest( manifestPath, emptyManifestURL, return_excel=return_excel, title=title ) + @tracer.start_as_current_span("MetadataModel::submit_metadata_manifest") def submit_metadata_manifest( # pylint: disable=too-many-arguments, too-many-locals self, manifest_path: str, diff --git a/schematic/store/synapse.py b/schematic/store/synapse.py index d078d45d4..cc8d45b89 100644 --- a/schematic/store/synapse.py +++ b/schematic/store/synapse.py @@ -1408,6 +1408,7 @@ def format_row_annotations( return annos @missing_entity_handler + @tracer.start_as_current_span("SynapseStorage::format_manifest_annotations") def format_manifest_annotations(self, manifest, manifest_synapse_id): """ Set annotations for the manifest (as a whole) so they can be applied to the manifest table or csv. diff --git a/schematic_api/api/routes.py b/schematic_api/api/routes.py index ebc52a431..74d009c19 100644 --- a/schematic_api/api/routes.py +++ b/schematic_api/api/routes.py @@ -259,6 +259,7 @@ def save_file(file_key="csv_file"): return temp_path +@tracer.start_as_current_span("routes:initalize_metadata_model") def initalize_metadata_model(schema_url, data_model_labels): # get path to temp data model file (csv or jsonld) as appropriate data_model = get_temp_model_path(schema_url) @@ -430,6 +431,7 @@ def validate_manifest_route( #####profile validate manifest route function # @profile(sort_by='cumulative', strip_dirs=True) +@trace_function_params() def submit_manifest_route( schema_url, data_model_labels: str, From 301ca0c9db0204793f1a8cc47baaa38f598f4e8a Mon Sep 17 00:00:00 2001 From: linglp Date: Thu, 6 Jun 2024 14:49:53 -0400 Subject: [PATCH 050/110] add jsonify --- poetry.lock | 23 +++++++++++------------ pyproject.toml | 1 + 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/poetry.lock b/poetry.lock index 49550859a..76c5e25eb 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1707,6 +1707,16 @@ files = [ {file = "json5-0.9.25.tar.gz", hash = "sha256:548e41b9be043f9426776f05df8635a00fe06104ea51ed24b67f908856e151ae"}, ] +[[package]] +name = "jsonify" +version = "0.5" +description = "A csv to json converter" +optional = false +python-versions = "*" +files = [ + {file = "jsonify-0.5.tar.gz", hash = "sha256:f340032753577575e9777835809b283fdc9b251867d5d5600389131647f8bfe1"}, +] + [[package]] name = "jsonpatch" version = "1.33" @@ -3350,7 +3360,6 @@ files = [ {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:69b023b2b4daa7548bcfbd4aa3da05b3a74b772db9e23b982788168117739938"}, {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:81e0b275a9ecc9c0c0c07b4b90ba548307583c125f54d5b6946cfee6360c733d"}, {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba336e390cd8e4d1739f42dfe9bb83a3cc2e80f567d8805e11b46f4a943f5515"}, - {file = "PyYAML-6.0.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:326c013efe8048858a6d312ddd31d56e468118ad4cdeda36c719bf5bb6192290"}, {file = "PyYAML-6.0.1-cp310-cp310-win32.whl", hash = "sha256:bd4af7373a854424dabd882decdc5579653d7868b8fb26dc7d0e99f823aa5924"}, {file = "PyYAML-6.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d"}, {file = "PyYAML-6.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6965a7bc3cf88e5a1c3bd2e0b5c22f8d677dc88a455344035f03399034eb3007"}, @@ -3358,16 +3367,8 @@ files = [ {file = 
"PyYAML-6.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:42f8152b8dbc4fe7d96729ec2b99c7097d656dc1213a3229ca5383f973a5ed6d"}, {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:062582fca9fabdd2c8b54a3ef1c978d786e0f6b3a1510e0ac93ef59e0ddae2bc"}, {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d2b04aac4d386b172d5b9692e2d2da8de7bfb6c387fa4f801fbf6fb2e6ba4673"}, - {file = "PyYAML-6.0.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e7d73685e87afe9f3b36c799222440d6cf362062f78be1013661b00c5c6f678b"}, {file = "PyYAML-6.0.1-cp311-cp311-win32.whl", hash = "sha256:1635fd110e8d85d55237ab316b5b011de701ea0f29d07611174a1b42f1444741"}, {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, - {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, - {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, - {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"}, - {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, - {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, - {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, - {file = "PyYAML-6.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:0d3304d8c0adc42be59c5f8a4d9e3d7379e6955ad754aa9d6ab7a398b59dd1df"}, {file = "PyYAML-6.0.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:50550eb667afee136e9a77d6dc71ae76a44df8b3e51e41b77f6de2932bfe0f47"}, {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1fe35611261b29bd1de0070f0b2f47cb6ff71fa6595c077e42bd0c419fa27b98"}, {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:704219a11b772aea0d8ecd7058d0082713c3562b4e271b849ad7dc4a5c90c13c"}, @@ -3384,7 +3385,6 @@ files = [ {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a0cd17c15d3bb3fa06978b4e8958dcdc6e0174ccea823003a106c7d4d7899ac5"}, {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:28c119d996beec18c05208a8bd78cbe4007878c6dd15091efb73a30e90539696"}, {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7e07cbde391ba96ab58e532ff4803f79c4129397514e1413a7dc761ccd755735"}, - {file = "PyYAML-6.0.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:49a183be227561de579b4a36efbb21b3eab9651dd81b1858589f796549873dd6"}, {file = "PyYAML-6.0.1-cp38-cp38-win32.whl", hash = "sha256:184c5108a2aca3c5b3d3bf9395d50893a7ab82a38004c8f61c258d4428e80206"}, {file = "PyYAML-6.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:1e2722cc9fbb45d9b87631ac70924c11d3a401b2d7f410cc0e3bbf249f2dca62"}, {file = "PyYAML-6.0.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9eb6caa9a297fc2c2fb8862bc5370d0303ddba53ba97e71f08023b6cd73d16a8"}, @@ -3392,7 +3392,6 @@ files = [ {file = 
"PyYAML-6.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5773183b6446b2c99bb77e77595dd486303b4faab2b086e7b17bc6bef28865f6"}, {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b786eecbdf8499b9ca1d697215862083bd6d2a99965554781d0d8d1ad31e13a0"}, {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc1bf2925a1ecd43da378f4db9e4f799775d6367bdb94671027b73b393a7c42c"}, - {file = "PyYAML-6.0.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5"}, {file = "PyYAML-6.0.1-cp39-cp39-win32.whl", hash = "sha256:faca3bdcf85b2fc05d06ff3fbc1f83e1391b3e724afa3feba7d13eeab355484c"}, {file = "PyYAML-6.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:510c9deebc5c0225e8c96813043e62b680ba2f9c50a08d3724c7f28a747d1486"}, {file = "PyYAML-6.0.1.tar.gz", hash = "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43"}, @@ -4862,4 +4861,4 @@ aws = ["uWSGI"] [metadata] lock-version = "2.0" python-versions = ">=3.9.0,<3.11" -content-hash = "5bf0c831977694ea541db24481181ec1980ec9589a2adbd9f30ed0fe7f2b2742" +content-hash = "98bab32c3d26e455ed16a6ab56a14af398b9516c4de6616e1bf7de3281ffb839" diff --git a/pyproject.toml b/pyproject.toml index 03b88c81e..48f2b13da 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -74,6 +74,7 @@ Flask = {version = "2.1.3", optional = true} Flask-Cors = {version = "^3.0.10", optional = true} uWSGI = {version = "^2.0.21", optional = true} Jinja2 = {version = ">2.11.3", optional = true} +jsonify = "^0.5" [tool.poetry.extras] api = ["connexion", "Flask", "Flask-Cors", "Jinja2", "pyopenssl"] From f94e29763296a7ac327ddc0eddc2859d8b896a57 Mon Sep 17 00:00:00 2001 From: linglp Date: Thu, 6 Jun 2024 14:50:33 -0400 Subject: [PATCH 051/110] modify init to add tracing --- schematic_api/api/__init__.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/schematic_api/api/__init__.py b/schematic_api/api/__init__.py index 82bad7e9b..b04ffada0 100644 --- a/schematic_api/api/__init__.py +++ b/schematic_api/api/__init__.py @@ -3,6 +3,24 @@ import connexion from schematic import CONFIG +from jaeger_client import Config +from flask_opentracing import FlaskTracer +import traceback +import jsonify + +config = Config( + config={ + 'enabled': True, + 'sampler': { + 'type': 'const', + 'param': 1 + }, + 'logging': True, + }, + service_name="schema-api", +) +jaeger_tracer = config.initialize_tracer + def create_app(): connexionapp = connexion.FlaskApp(__name__, specification_dir="openapi/") @@ -34,8 +52,10 @@ def create_app(): app = create_app() +flask_tracer = FlaskTracer(jaeger_tracer, True, app, ['url', 'url_rule', 'environ.HTTP_X_REAL_IP', 'path']) + # def route_code(): # import flask_schematic as sc # sc.method1() -# \ No newline at end of file +#] From 7d81c197056e771d63b2acf16389ef5bd6e3c9eb Mon Sep 17 00:00:00 2001 From: linglp Date: Thu, 6 Jun 2024 15:07:50 -0400 Subject: [PATCH 052/110] remove jsonify --- poetry.lock | 12 +----------- pyproject.toml | 7 ------- 2 files changed, 1 insertion(+), 18 deletions(-) diff --git a/poetry.lock b/poetry.lock index 76c5e25eb..a82fc529c 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1707,16 +1707,6 @@ files = [ {file = "json5-0.9.25.tar.gz", hash = "sha256:548e41b9be043f9426776f05df8635a00fe06104ea51ed24b67f908856e151ae"}, ] -[[package]] -name = "jsonify" -version = "0.5" -description = "A csv to json converter" -optional = false 
-python-versions = "*" -files = [ - {file = "jsonify-0.5.tar.gz", hash = "sha256:f340032753577575e9777835809b283fdc9b251867d5d5600389131647f8bfe1"}, -] - [[package]] name = "jsonpatch" version = "1.33" @@ -4861,4 +4851,4 @@ aws = ["uWSGI"] [metadata] lock-version = "2.0" python-versions = ">=3.9.0,<3.11" -content-hash = "98bab32c3d26e455ed16a6ab56a14af398b9516c4de6616e1bf7de3281ffb839" +content-hash = "c380beeada202005657c611f9035bb7e7f7ffba721ff2d893d0f890a6d0b225e" diff --git a/pyproject.toml b/pyproject.toml index 48f2b13da..1e00f3730 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -74,7 +74,6 @@ Flask = {version = "2.1.3", optional = true} Flask-Cors = {version = "^3.0.10", optional = true} uWSGI = {version = "^2.0.21", optional = true} Jinja2 = {version = ">2.11.3", optional = true} -jsonify = "^0.5" [tool.poetry.extras] api = ["connexion", "Flask", "Flask-Cors", "Jinja2", "pyopenssl"] @@ -94,12 +93,6 @@ pylint = "^2.16.1" pytest-xdist = "^3.5.0" pre-commit = "^3.6.2" -[tool.poetry.group.aws] -optional = true - -[tool.poetry.group.aws.dependencies] - - [tool.black] line-length = 88 include = '\.pyi?$' From a84521d96c0bd3c2b3657320c6a58d8a74c60074 Mon Sep 17 00:00:00 2001 From: linglp Date: Thu, 6 Jun 2024 15:08:10 -0400 Subject: [PATCH 053/110] handle exception in schematic api --- schematic_api/api/__init__.py | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/schematic_api/api/__init__.py b/schematic_api/api/__init__.py index 82bad7e9b..8a15312f3 100644 --- a/schematic_api/api/__init__.py +++ b/schematic_api/api/__init__.py @@ -3,6 +3,8 @@ import connexion from schematic import CONFIG +import jsonify +import traceback def create_app(): connexionapp = connexion.FlaskApp(__name__, specification_dir="openapi/") @@ -20,15 +22,19 @@ def create_app(): app.config["SCHEMATIC_CONFIG"] = schematic_config app.config["SCHEMATIC_CONFIG_CONTENT"] = schematic_config_content - # Configure flask app - # app.config[] = schematic[] - # app.config[] = schematic[] - # app.config[] = schematic[] + # handle exceptions in schematic when an exception gets raised + @app.errorhandler(Exception) + def handle_exception(e): + # Ensure the application context is available + with app.app_context(): + # Get the last line of error from the traceback + last_line = traceback.format_exc().strip().split('\n')[-1] - # Initialize extension schematic - # import MyExtension - # myext = MyExtension() - # myext.init_app(app) + # Log the full trace + app.logger.error(traceback.format_exc()) + + # Return a JSON response with the last line of the error + return last_line, 500 return app From c05a1a6d295e6c9f6596fc6941b0ce25be469a16 Mon Sep 17 00:00:00 2001 From: linglp Date: Thu, 6 Jun 2024 15:15:28 -0400 Subject: [PATCH 054/110] add docstring --- schematic_api/api/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/schematic_api/api/__init__.py b/schematic_api/api/__init__.py index 8a15312f3..4c228f772 100644 --- a/schematic_api/api/__init__.py +++ b/schematic_api/api/__init__.py @@ -25,12 +25,14 @@ def create_app(): # handle exceptions in schematic when an exception gets raised @app.errorhandler(Exception) def handle_exception(e): + """handle exceptions in schematic APIs + """ # Ensure the application context is available with app.app_context(): # Get the last line of error from the traceback last_line = traceback.format_exc().strip().split('\n')[-1] - # Log the full trace + # Log the full trace app.logger.error(traceback.format_exc()) # Return a JSON 
response with the last line of the error From 36687c20f1ae2c87dd9f068dfa26e79bcd7e2307 Mon Sep 17 00:00:00 2001 From: linglp Date: Thu, 6 Jun 2024 15:20:56 -0400 Subject: [PATCH 055/110] remove unused import --- schematic_api/api/__init__.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/schematic_api/api/__init__.py b/schematic_api/api/__init__.py index 4c228f772..91009a7d2 100644 --- a/schematic_api/api/__init__.py +++ b/schematic_api/api/__init__.py @@ -2,8 +2,6 @@ import connexion -from schematic import CONFIG -import jsonify import traceback def create_app(): From 1d910f7adc3be257f1b00e40a5696f2187875cef Mon Sep 17 00:00:00 2001 From: linglp Date: Thu, 6 Jun 2024 15:33:11 -0400 Subject: [PATCH 056/110] raise synapse authentication errors correctly --- schematic_api/api/__init__.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/schematic_api/api/__init__.py b/schematic_api/api/__init__.py index 91009a7d2..d6dd593d1 100644 --- a/schematic_api/api/__init__.py +++ b/schematic_api/api/__init__.py @@ -3,6 +3,9 @@ import connexion import traceback +from synapseclient.core.exceptions import ( + SynapseAuthenticationError, +) def create_app(): connexionapp = connexion.FlaskApp(__name__, specification_dir="openapi/") @@ -29,12 +32,16 @@ def handle_exception(e): with app.app_context(): # Get the last line of error from the traceback last_line = traceback.format_exc().strip().split('\n')[-1] - + # Log the full trace app.logger.error(traceback.format_exc()) # Return a JSON response with the last line of the error return last_line, 500 + + @app.errorhandler(SynapseAuthenticationError) + def handle_synapse_auth_error(e): + return str(e), 401 return app From 63e1bb211192a1fc126ba442679e751b9e3d9f0a Mon Sep 17 00:00:00 2001 From: linglp Date: Thu, 6 Jun 2024 15:41:29 -0400 Subject: [PATCH 057/110] handle synapse access error --- schematic_api/api/__init__.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/schematic_api/api/__init__.py b/schematic_api/api/__init__.py index d6dd593d1..dde18372f 100644 --- a/schematic_api/api/__init__.py +++ b/schematic_api/api/__init__.py @@ -6,6 +6,7 @@ from synapseclient.core.exceptions import ( SynapseAuthenticationError, ) +from schematic.exceptions import AccessCredentialsError def create_app(): connexionapp = connexion.FlaskApp(__name__, specification_dir="openapi/") @@ -32,7 +33,7 @@ def handle_exception(e): with app.app_context(): # Get the last line of error from the traceback last_line = traceback.format_exc().strip().split('\n')[-1] - + # Log the full trace app.logger.error(traceback.format_exc()) @@ -42,6 +43,10 @@ def handle_exception(e): @app.errorhandler(SynapseAuthenticationError) def handle_synapse_auth_error(e): return str(e), 401 + + @app.errorhandler(AccessCredentialsError) + def handle_synapse_access_error(e): + return str(e), 403 return app From 093a48af287455ba964ebcd1747bd9128909d716 Mon Sep 17 00:00:00 2001 From: linglp Date: Thu, 6 Jun 2024 15:42:06 -0400 Subject: [PATCH 058/110] run black --- schematic_api/api/__init__.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/schematic_api/api/__init__.py b/schematic_api/api/__init__.py index dde18372f..3b6116c99 100644 --- a/schematic_api/api/__init__.py +++ b/schematic_api/api/__init__.py @@ -8,10 +8,12 @@ ) from schematic.exceptions import AccessCredentialsError + def create_app(): connexionapp = connexion.FlaskApp(__name__, specification_dir="openapi/") - connexionapp.add_api("api.yaml", 
arguments={"title": "Schematic REST API"}, pythonic_params=True) - + connexionapp.add_api( + "api.yaml", arguments={"title": "Schematic REST API"}, pythonic_params=True + ) # get the underlying Flask app instance app = connexionapp.app @@ -27,33 +29,33 @@ def create_app(): # handle exceptions in schematic when an exception gets raised @app.errorhandler(Exception) def handle_exception(e): - """handle exceptions in schematic APIs - """ + """handle exceptions in schematic APIs""" # Ensure the application context is available with app.app_context(): # Get the last line of error from the traceback - last_line = traceback.format_exc().strip().split('\n')[-1] + last_line = traceback.format_exc().strip().split("\n")[-1] # Log the full trace app.logger.error(traceback.format_exc()) # Return a JSON response with the last line of the error return last_line, 500 - + @app.errorhandler(SynapseAuthenticationError) def handle_synapse_auth_error(e): - return str(e), 401 - + return str(e), 401 + @app.errorhandler(AccessCredentialsError) def handle_synapse_access_error(e): - return str(e), 403 + return str(e), 403 return app + app = create_app() # def route_code(): # import flask_schematic as sc # sc.method1() -# \ No newline at end of file +# From a869b16fc5b92d66d891c694f8851ef8c8e3c451 Mon Sep 17 00:00:00 2001 From: linglp Date: Thu, 6 Jun 2024 15:46:06 -0400 Subject: [PATCH 059/110] add docstring --- schematic_api/api/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/schematic_api/api/__init__.py b/schematic_api/api/__init__.py index 3b6116c99..33355a27d 100644 --- a/schematic_api/api/__init__.py +++ b/schematic_api/api/__init__.py @@ -43,10 +43,12 @@ def handle_exception(e): @app.errorhandler(SynapseAuthenticationError) def handle_synapse_auth_error(e): + """handle synapse authentication error""" return str(e), 401 @app.errorhandler(AccessCredentialsError) def handle_synapse_access_error(e): + """handle synapse access error""" return str(e), 403 return app From 9a9d372fcd7110d7b96f0fa63b87eff2f4c82e8b Mon Sep 17 00:00:00 2001 From: linglp Date: Thu, 6 Jun 2024 15:51:45 -0400 Subject: [PATCH 060/110] add type hinting --- schematic_api/api/__init__.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/schematic_api/api/__init__.py b/schematic_api/api/__init__.py index 33355a27d..baefba212 100644 --- a/schematic_api/api/__init__.py +++ b/schematic_api/api/__init__.py @@ -1,6 +1,7 @@ import os import connexion +from typing import Tuple import traceback from synapseclient.core.exceptions import ( @@ -42,12 +43,12 @@ def handle_exception(e): return last_line, 500 @app.errorhandler(SynapseAuthenticationError) - def handle_synapse_auth_error(e): + def handle_synapse_auth_error(e: Exception) -> Tuple[str, int]: """handle synapse authentication error""" return str(e), 401 @app.errorhandler(AccessCredentialsError) - def handle_synapse_access_error(e): + def handle_synapse_access_error(e: Exception) -> Tuple[str, int]: """handle synapse access error""" return str(e), 403 From c61bc7d4a22cfc26abf5307cea9ff991bc7a8a24 Mon Sep 17 00:00:00 2001 From: linglp Date: Thu, 6 Jun 2024 15:55:27 -0400 Subject: [PATCH 061/110] add type hinting --- schematic_api/api/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/schematic_api/api/__init__.py b/schematic_api/api/__init__.py index baefba212..342e33abc 100644 --- a/schematic_api/api/__init__.py +++ b/schematic_api/api/__init__.py @@ -29,7 +29,7 @@ def create_app(): # handle exceptions in schematic when an exception 
gets raised @app.errorhandler(Exception) - def handle_exception(e): + def handle_exception(e: Exception) -> Tuple[str, int]: """handle exceptions in schematic APIs""" # Ensure the application context is available with app.app_context(): From 7c432e1f75347978d1c089d0aed94d386ba57cd2 Mon Sep 17 00:00:00 2001 From: linglp Date: Thu, 6 Jun 2024 16:45:54 -0400 Subject: [PATCH 062/110] add test --- tests/test_api.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tests/test_api.py b/tests/test_api.py index a070297ab..d99911462 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -791,6 +791,19 @@ def test_generate_manifest_not_file_based_with_annotations( ] ) + def test_generate_manifest_data_type_not_found(self, client, data_model_jsonld): + params = { + "schema_url": data_model_jsonld, + "data_type": "wrong data type", + "use_annotations": False, + } + response = client.get( + "http://localhost:3001/v1/manifest/generate", query_string=params + ) + + assert response.status_code == 500 + assert "LookupError" in str(response.data) + def test_populate_manifest(self, client, data_model_jsonld, test_manifest_csv): # test manifest test_manifest_data = open(test_manifest_csv, "rb") From 9f6c52636ad8ed9d81e06fa3db7f17915b4d44d7 Mon Sep 17 00:00:00 2001 From: linglp Date: Thu, 6 Jun 2024 16:56:39 -0400 Subject: [PATCH 063/110] add tests --- tests/test_api.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/tests/test_api.py b/tests/test_api.py index d99911462..15c6786e7 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -132,8 +132,30 @@ def request_headers(syn_token): yield headers +@pytest.fixture +def request_invalid_headers(): + headers = {"Authorization": "Bearer invalid headers"} + yield headers + + @pytest.mark.schematic_api class TestSynapseStorage: + def test_invalid_authentication(self, client, request_invalid_headers): + response = client.get( + "http://localhost:3001/v1/storage/assets/tables", + query_string = {"asset_view": "syn23643253", "return_type": "csv"}, + headers=request_invalid_headers, + ) + assert response.status_code == 401 + + def test_insufficent_auth(self, client, request_headers): + response = client.get( + "http://localhost:3001/v1/storage/assets/tables", + query_string = {"asset_view": "syn23643252", "return_type": "csv"}, + headers=request_headers, + ) + assert response.status_code == 403 + @pytest.mark.synapse_credentials_needed @pytest.mark.parametrize("return_type", ["json", "csv"]) def test_get_storage_assets_tables(self, client, return_type, request_headers): From eddaf6ddec077dc7405f9567ae519ee9d6436083 Mon Sep 17 00:00:00 2001 From: linglp Date: Thu, 6 Jun 2024 18:01:19 -0400 Subject: [PATCH 064/110] revert changes in toml --- pyproject.toml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 1e00f3730..03b88c81e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -93,6 +93,12 @@ pylint = "^2.16.1" pytest-xdist = "^3.5.0" pre-commit = "^3.6.2" +[tool.poetry.group.aws] +optional = true + +[tool.poetry.group.aws.dependencies] + + [tool.black] line-length = 88 include = '\.pyi?$' From f07256f2cae83a52934a8717b9db0d68a3366e38 Mon Sep 17 00:00:00 2001 From: linglp Date: Thu, 6 Jun 2024 18:02:58 -0400 Subject: [PATCH 065/110] revert changes of lock file --- poetry.lock | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/poetry.lock b/poetry.lock index a82fc529c..49550859a 100644 --- a/poetry.lock +++ b/poetry.lock @@ -3350,6 +3350,7 @@ files = [ 
{file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:69b023b2b4daa7548bcfbd4aa3da05b3a74b772db9e23b982788168117739938"}, {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:81e0b275a9ecc9c0c0c07b4b90ba548307583c125f54d5b6946cfee6360c733d"}, {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba336e390cd8e4d1739f42dfe9bb83a3cc2e80f567d8805e11b46f4a943f5515"}, + {file = "PyYAML-6.0.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:326c013efe8048858a6d312ddd31d56e468118ad4cdeda36c719bf5bb6192290"}, {file = "PyYAML-6.0.1-cp310-cp310-win32.whl", hash = "sha256:bd4af7373a854424dabd882decdc5579653d7868b8fb26dc7d0e99f823aa5924"}, {file = "PyYAML-6.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d"}, {file = "PyYAML-6.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6965a7bc3cf88e5a1c3bd2e0b5c22f8d677dc88a455344035f03399034eb3007"}, @@ -3357,8 +3358,16 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:42f8152b8dbc4fe7d96729ec2b99c7097d656dc1213a3229ca5383f973a5ed6d"}, {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:062582fca9fabdd2c8b54a3ef1c978d786e0f6b3a1510e0ac93ef59e0ddae2bc"}, {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d2b04aac4d386b172d5b9692e2d2da8de7bfb6c387fa4f801fbf6fb2e6ba4673"}, + {file = "PyYAML-6.0.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e7d73685e87afe9f3b36c799222440d6cf362062f78be1013661b00c5c6f678b"}, {file = "PyYAML-6.0.1-cp311-cp311-win32.whl", hash = "sha256:1635fd110e8d85d55237ab316b5b011de701ea0f29d07611174a1b42f1444741"}, {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, + {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, + {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, + {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"}, + {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, + {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, + {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, + {file = "PyYAML-6.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:0d3304d8c0adc42be59c5f8a4d9e3d7379e6955ad754aa9d6ab7a398b59dd1df"}, {file = "PyYAML-6.0.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:50550eb667afee136e9a77d6dc71ae76a44df8b3e51e41b77f6de2932bfe0f47"}, {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1fe35611261b29bd1de0070f0b2f47cb6ff71fa6595c077e42bd0c419fa27b98"}, {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:704219a11b772aea0d8ecd7058d0082713c3562b4e271b849ad7dc4a5c90c13c"}, @@ -3375,6 +3384,7 @@ files = [ {file = 
"PyYAML-6.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a0cd17c15d3bb3fa06978b4e8958dcdc6e0174ccea823003a106c7d4d7899ac5"}, {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:28c119d996beec18c05208a8bd78cbe4007878c6dd15091efb73a30e90539696"}, {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7e07cbde391ba96ab58e532ff4803f79c4129397514e1413a7dc761ccd755735"}, + {file = "PyYAML-6.0.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:49a183be227561de579b4a36efbb21b3eab9651dd81b1858589f796549873dd6"}, {file = "PyYAML-6.0.1-cp38-cp38-win32.whl", hash = "sha256:184c5108a2aca3c5b3d3bf9395d50893a7ab82a38004c8f61c258d4428e80206"}, {file = "PyYAML-6.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:1e2722cc9fbb45d9b87631ac70924c11d3a401b2d7f410cc0e3bbf249f2dca62"}, {file = "PyYAML-6.0.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9eb6caa9a297fc2c2fb8862bc5370d0303ddba53ba97e71f08023b6cd73d16a8"}, @@ -3382,6 +3392,7 @@ files = [ {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5773183b6446b2c99bb77e77595dd486303b4faab2b086e7b17bc6bef28865f6"}, {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b786eecbdf8499b9ca1d697215862083bd6d2a99965554781d0d8d1ad31e13a0"}, {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc1bf2925a1ecd43da378f4db9e4f799775d6367bdb94671027b73b393a7c42c"}, + {file = "PyYAML-6.0.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5"}, {file = "PyYAML-6.0.1-cp39-cp39-win32.whl", hash = "sha256:faca3bdcf85b2fc05d06ff3fbc1f83e1391b3e724afa3feba7d13eeab355484c"}, {file = "PyYAML-6.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:510c9deebc5c0225e8c96813043e62b680ba2f9c50a08d3724c7f28a747d1486"}, {file = "PyYAML-6.0.1.tar.gz", hash = "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43"}, @@ -4851,4 +4862,4 @@ aws = ["uWSGI"] [metadata] lock-version = "2.0" python-versions = ">=3.9.0,<3.11" -content-hash = "c380beeada202005657c611f9035bb7e7f7ffba721ff2d893d0f890a6d0b225e" +content-hash = "5bf0c831977694ea541db24481181ec1980ec9589a2adbd9f30ed0fe7f2b2742" From 6fed6808814ff2c5b82ecf4ab1e56db060aad7a8 Mon Sep 17 00:00:00 2001 From: linglp Date: Fri, 7 Jun 2024 17:02:05 -0400 Subject: [PATCH 066/110] add asyncio --- poetry.lock | 26 ++++++++++---------- pyproject.toml | 1 + schematic/store/synapse.py | 49 +++++++++++++++++++++++++++++++------- 3 files changed, 56 insertions(+), 20 deletions(-) diff --git a/poetry.lock b/poetry.lock index 49550859a..46ff137d6 100644 --- a/poetry.lock +++ b/poetry.lock @@ -190,6 +190,19 @@ files = [ [package.dependencies] typing-extensions = {version = ">=4.0.0", markers = "python_version < \"3.11\""} +[[package]] +name = "asyncio" +version = "3.4.3" +description = "reference implementation of PEP 3156" +optional = false +python-versions = "*" +files = [ + {file = "asyncio-3.4.3-cp33-none-win32.whl", hash = "sha256:b62c9157d36187eca799c378e572c969f0da87cd5fc42ca372d92cdb06e7e1de"}, + {file = "asyncio-3.4.3-cp33-none-win_amd64.whl", hash = "sha256:c46a87b48213d7464f22d9a497b9eef8c1928b68320a2fa94240f969f6fec08c"}, + {file = "asyncio-3.4.3-py3-none-any.whl", hash = "sha256:c4d18b22701821de07bd6aea8b53d21449ec0ec5680645e5317062ea21817d2d"}, + {file = "asyncio-3.4.3.tar.gz", hash = 
"sha256:83360ff8bc97980e4ff25c964c7bd3923d333d177aa4f7fb736b019f26c7cb41"}, +] + [[package]] name = "asyncio-atexit" version = "1.0.1" @@ -3350,7 +3363,6 @@ files = [ {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:69b023b2b4daa7548bcfbd4aa3da05b3a74b772db9e23b982788168117739938"}, {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:81e0b275a9ecc9c0c0c07b4b90ba548307583c125f54d5b6946cfee6360c733d"}, {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba336e390cd8e4d1739f42dfe9bb83a3cc2e80f567d8805e11b46f4a943f5515"}, - {file = "PyYAML-6.0.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:326c013efe8048858a6d312ddd31d56e468118ad4cdeda36c719bf5bb6192290"}, {file = "PyYAML-6.0.1-cp310-cp310-win32.whl", hash = "sha256:bd4af7373a854424dabd882decdc5579653d7868b8fb26dc7d0e99f823aa5924"}, {file = "PyYAML-6.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d"}, {file = "PyYAML-6.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6965a7bc3cf88e5a1c3bd2e0b5c22f8d677dc88a455344035f03399034eb3007"}, @@ -3358,16 +3370,8 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:42f8152b8dbc4fe7d96729ec2b99c7097d656dc1213a3229ca5383f973a5ed6d"}, {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:062582fca9fabdd2c8b54a3ef1c978d786e0f6b3a1510e0ac93ef59e0ddae2bc"}, {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d2b04aac4d386b172d5b9692e2d2da8de7bfb6c387fa4f801fbf6fb2e6ba4673"}, - {file = "PyYAML-6.0.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e7d73685e87afe9f3b36c799222440d6cf362062f78be1013661b00c5c6f678b"}, {file = "PyYAML-6.0.1-cp311-cp311-win32.whl", hash = "sha256:1635fd110e8d85d55237ab316b5b011de701ea0f29d07611174a1b42f1444741"}, {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, - {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, - {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, - {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"}, - {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, - {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, - {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, - {file = "PyYAML-6.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:0d3304d8c0adc42be59c5f8a4d9e3d7379e6955ad754aa9d6ab7a398b59dd1df"}, {file = "PyYAML-6.0.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:50550eb667afee136e9a77d6dc71ae76a44df8b3e51e41b77f6de2932bfe0f47"}, {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1fe35611261b29bd1de0070f0b2f47cb6ff71fa6595c077e42bd0c419fa27b98"}, {file = 
"PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:704219a11b772aea0d8ecd7058d0082713c3562b4e271b849ad7dc4a5c90c13c"}, @@ -3384,7 +3388,6 @@ files = [ {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a0cd17c15d3bb3fa06978b4e8958dcdc6e0174ccea823003a106c7d4d7899ac5"}, {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:28c119d996beec18c05208a8bd78cbe4007878c6dd15091efb73a30e90539696"}, {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7e07cbde391ba96ab58e532ff4803f79c4129397514e1413a7dc761ccd755735"}, - {file = "PyYAML-6.0.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:49a183be227561de579b4a36efbb21b3eab9651dd81b1858589f796549873dd6"}, {file = "PyYAML-6.0.1-cp38-cp38-win32.whl", hash = "sha256:184c5108a2aca3c5b3d3bf9395d50893a7ab82a38004c8f61c258d4428e80206"}, {file = "PyYAML-6.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:1e2722cc9fbb45d9b87631ac70924c11d3a401b2d7f410cc0e3bbf249f2dca62"}, {file = "PyYAML-6.0.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9eb6caa9a297fc2c2fb8862bc5370d0303ddba53ba97e71f08023b6cd73d16a8"}, @@ -3392,7 +3395,6 @@ files = [ {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5773183b6446b2c99bb77e77595dd486303b4faab2b086e7b17bc6bef28865f6"}, {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b786eecbdf8499b9ca1d697215862083bd6d2a99965554781d0d8d1ad31e13a0"}, {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc1bf2925a1ecd43da378f4db9e4f799775d6367bdb94671027b73b393a7c42c"}, - {file = "PyYAML-6.0.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5"}, {file = "PyYAML-6.0.1-cp39-cp39-win32.whl", hash = "sha256:faca3bdcf85b2fc05d06ff3fbc1f83e1391b3e724afa3feba7d13eeab355484c"}, {file = "PyYAML-6.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:510c9deebc5c0225e8c96813043e62b680ba2f9c50a08d3724c7f28a747d1486"}, {file = "PyYAML-6.0.1.tar.gz", hash = "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43"}, @@ -4862,4 +4864,4 @@ aws = ["uWSGI"] [metadata] lock-version = "2.0" python-versions = ">=3.9.0,<3.11" -content-hash = "5bf0c831977694ea541db24481181ec1980ec9589a2adbd9f30ed0fe7f2b2742" +content-hash = "fafb9420f9ac503b9e3a2f1c25321ea8eb59332eb82f01b418f7096def4488c5" diff --git a/pyproject.toml b/pyproject.toml index 03b88c81e..87f203b1d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -74,6 +74,7 @@ Flask = {version = "2.1.3", optional = true} Flask-Cors = {version = "^3.0.10", optional = true} uWSGI = {version = "^2.0.21", optional = true} Jinja2 = {version = ">2.11.3", optional = true} +asyncio = "^3.4.3" [tool.poetry.extras] api = ["connexion", "Flask", "Flask-Cors", "Jinja2", "pyopenssl"] diff --git a/schematic/store/synapse.py b/schematic/store/synapse.py index a15137ae8..b000833c3 100644 --- a/schematic/store/synapse.py +++ b/schematic/store/synapse.py @@ -68,6 +68,8 @@ from schematic.store.base import BaseStorage from schematic.exceptions import AccessCredentialsError from schematic.configuration.configuration import CONFIG +from synapseclient.annotations import Annotations +import asyncio logger = logging.getLogger("Synapse storage") @@ -1634,6 +1636,33 @@ def _add_annotations( self.syn.set_annotations(annos) return + async def _add_annotations_async( + self, 
+ dmge: DataModelGraphExplorer, + row: pd.Series, + entityId: str, + hideBlanks: bool, + annotation_keys: str, + ) -> None: + """add annotations to entity ids in an asynchronous way + + Args: + dmge: DataModelGraphExplorer object, + row: current row of manifest being processed + entityId (str): synapseId of entity to add annotations to + hideBlanks: Boolean flag that does not upload annotation keys with blank values when true. Uploads Annotation keys with empty string values when false. + annotation_keys: (str) display_label/class_label(default), Determines labeling syle for annotation keys. class_label will format the display + name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain + display label formatting while ensuring the label is formatted properly for Synapse annotations. + """ + # Format annotations for Synapse + annos = self.format_row_annotations( + dmge, row, entityId, hideBlanks, annotation_keys + ) + + if annos: + await annos.store_async() + def _create_entity_id(self, idx, row, manifest, datasetId): """Helper function to generate an entityId and add it to the appropriate row in the manifest. Args: @@ -1653,7 +1682,7 @@ def _create_entity_id(self, idx, row, manifest, datasetId): manifest.loc[idx, "entityId"] = entityId return manifest, entityId - def add_annotations_to_entities_files( + async def add_annotations_to_entities_files( self, dmge, manifest, @@ -1694,6 +1723,7 @@ def add_annotations_to_entities_files( ).drop("entityId_x", axis=1) # Fill `entityId` for each row if missing and annotate entity as appropriate + set_annotations_requests=[] for idx, row in manifest.iterrows(): if not row["entityId"] and ( manifest_record_type == "file_and_entities" @@ -1713,8 +1743,11 @@ def add_annotations_to_entities_files( # Adding annotations to connected files. if entityId: - self._add_annotations(dmge, row, entityId, hideBlanks, annotation_keys) + #self._add_annotations(dmge, row, entityId, hideBlanks, annotation_keys) + set_annotations_requests.append(asyncio.create_task(self._add_annotations_async(dmge, row, entityId, hideBlanks, annotation_keys))) logger.info(f"Added annotations to entity: {entityId}") + # execute all requests of setting annotations + responses = await asyncio.gather(*set_annotations_requests, return_exceptions=True) return manifest def upload_manifest_as_table( @@ -1767,7 +1800,7 @@ def upload_manifest_as_table( ) if file_annotations_upload: - manifest = self.add_annotations_to_entities_files( + manifest = asyncio.run(self.add_annotations_to_entities_files( dmge, manifest, manifest_record_type, @@ -1775,7 +1808,7 @@ def upload_manifest_as_table( hideBlanks, manifest_synapse_table_id, annotation_keys, - ) + )) # Load manifest to synapse as a CSV File manifest_synapse_file_id = self.upload_manifest_file( manifest, @@ -1840,14 +1873,14 @@ def upload_manifest_as_csv( manifest_synapse_file_id (str): SynID of manifest csv uploaded to synapse. 
""" if file_annotations_upload: - manifest = self.add_annotations_to_entities_files( + manifest = asyncio.run(self.add_annotations_to_entities_files( dmge, manifest, manifest_record_type, datasetId, hideBlanks, annotation_keys=annotation_keys, - ) + )) # Load manifest to synapse as a CSV File manifest_synapse_file_id = self.upload_manifest_file( @@ -1917,7 +1950,7 @@ def upload_manifest_combo( ) if file_annotations_upload: - manifest = self.add_annotations_to_entities_files( + manifest = asyncio.run(self.add_annotations_to_entities_files( dmge, manifest, manifest_record_type, @@ -1925,7 +1958,7 @@ def upload_manifest_combo( hideBlanks, manifest_synapse_table_id, annotation_keys=annotation_keys, - ) + )) # Load manifest to synapse as a CSV File manifest_synapse_file_id = self.upload_manifest_file( From ce840e0e348924801e3bc61405824e13c78b3133 Mon Sep 17 00:00:00 2001 From: linglp Date: Mon, 10 Jun 2024 11:24:54 -0400 Subject: [PATCH 067/110] temp --- schematic/store/synapse.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/schematic/store/synapse.py b/schematic/store/synapse.py index b000833c3..735479cab 100644 --- a/schematic/store/synapse.py +++ b/schematic/store/synapse.py @@ -68,7 +68,7 @@ from schematic.store.base import BaseStorage from schematic.exceptions import AccessCredentialsError from schematic.configuration.configuration import CONFIG -from synapseclient.annotations import Annotations +from synapseclient.models.annotations import Annotations import asyncio logger = logging.getLogger("Synapse storage") @@ -702,7 +702,6 @@ def fill_in_entity_id_filename( new_files = self._get_file_entityIds( dataset_files=dataset_files, only_new_files=True, manifest=manifest ) - # update manifest so that it contains new dataset files new_files = pd.DataFrame(new_files) manifest = ( @@ -1370,7 +1369,16 @@ def format_row_annotations( metadataSyn[keySyn] = v # set annotation(s) for the various objects/items in a dataset on Synapse + print('entity id to get annotations', entityId) annos = self.syn.get_annotations(entityId) + annos_new = Annotations.from_dict(synapse_annotations=annos) + print('annos', annos) + print('annos new', annos_new) + + print(type(annos_new)) + breakpoint() + + csv_list_regex = comma_separated_list_regex() for anno_k, anno_v in metadataSyn.items(): # Remove keys with nan or empty string values from dict of annotations to be uploaded @@ -1748,6 +1756,10 @@ async def add_annotations_to_entities_files( logger.info(f"Added annotations to entity: {entityId}") # execute all requests of setting annotations responses = await asyncio.gather(*set_annotations_requests, return_exceptions=True) + + # handle errors + for response in responses: + print('repsonse', response) return manifest def upload_manifest_as_table( From bf8d8d4707b94ddcbc12b4fd7f7ae6348def8a94 Mon Sep 17 00:00:00 2001 From: linglp Date: Mon, 10 Jun 2024 15:36:34 -0400 Subject: [PATCH 068/110] store annotations async --- schematic/store/synapse.py | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/schematic/store/synapse.py b/schematic/store/synapse.py index 735479cab..e62294861 100644 --- a/schematic/store/synapse.py +++ b/schematic/store/synapse.py @@ -1369,16 +1369,7 @@ def format_row_annotations( metadataSyn[keySyn] = v # set annotation(s) for the various objects/items in a dataset on Synapse - print('entity id to get annotations', entityId) annos = self.syn.get_annotations(entityId) - annos_new = Annotations.from_dict(synapse_annotations=annos) - 
print('annos', annos) - print('annos new', annos_new) - - print(type(annos_new)) - breakpoint() - - csv_list_regex = comma_separated_list_regex() for anno_k, anno_v in metadataSyn.items(): # Remove keys with nan or empty string values from dict of annotations to be uploaded @@ -1667,9 +1658,9 @@ async def _add_annotations_async( annos = self.format_row_annotations( dmge, row, entityId, hideBlanks, annotation_keys ) - if annos: - await annos.store_async() + annotation_class=Annotations(annotations=dict(annos), id=annos.id, etag=annos.etag) + await annotation_class.store_async(synapse_client=self.syn) def _create_entity_id(self, idx, row, manifest, datasetId): """Helper function to generate an entityId and add it to the appropriate row in the manifest. @@ -1759,7 +1750,8 @@ async def add_annotations_to_entities_files( # handle errors for response in responses: - print('repsonse', response) + if response: + logger.error(response) return manifest def upload_manifest_as_table( From 02ff5e927e985354f4559b6ef0fc9fdd9a244f0a Mon Sep 17 00:00:00 2001 From: linglp Date: Tue, 11 Jun 2024 12:00:53 -0400 Subject: [PATCH 069/110] add and store annos in an async way --- schematic/store/synapse.py | 89 +++++++++++++++++++++++--------------- 1 file changed, 53 insertions(+), 36 deletions(-) diff --git a/schematic/store/synapse.py b/schematic/store/synapse.py index e62294861..d3971d872 100644 --- a/schematic/store/synapse.py +++ b/schematic/store/synapse.py @@ -11,6 +11,7 @@ import secrets import shutil import synapseclient +from synapseclient.api import get_entity_id_bundle2 import uuid # used to generate unique names for entities from tenacity import ( @@ -1335,10 +1336,26 @@ def upload_manifest_file( ) return manifest_synapse_file_id + + async def get_async_annotation(self, synapse_id): + return await get_entity_id_bundle2( + entity_id=synapse_id, request={"includeAnnotations": True}, synapse_client=self.syn + ) + + async def store_async_annotation(self, annotation_dict) -> Annotations: + annotation_data = Annotations.from_dict( + synapse_annotations=annotation_dict["annotations"]["annotations"] + ) + annotation_class = Annotations( + annotations=annotation_data, + etag=annotation_dict["annotations"]["etag"], + id=annotation_dict["annotations"]["id"], + ) + return await annotation_class.store_async(self.syn) @missing_entity_handler def format_row_annotations( - self, dmge, row, entityId: str, hideBlanks: bool, annotation_keys: str + self, dmge, row, annos: dict, hideBlanks: bool, annotation_keys: str ): # prepare metadata for Synapse storage (resolve display name into a name that Synapse annotations support (e.g no spaces, parenthesis) # note: the removal of special characters, will apply only to annotation keys; we are not altering the manifest @@ -1369,7 +1386,6 @@ def format_row_annotations( metadataSyn[keySyn] = v # set annotation(s) for the various objects/items in a dataset on Synapse - annos = self.syn.get_annotations(entityId) csv_list_regex = comma_separated_list_regex() for anno_k, anno_v in metadataSyn.items(): # Remove keys with nan or empty string values from dict of annotations to be uploaded @@ -1635,32 +1651,42 @@ def _add_annotations( self.syn.set_annotations(annos) return - async def _add_annotations_async( - self, - dmge: DataModelGraphExplorer, - row: pd.Series, - entityId: str, - hideBlanks: bool, - annotation_keys: str, - ) -> None: - """add annotations to entity ids in an asynchronous way + async def _get_store_annotations_async(self, entityId:str, dmge: DataModelGraphExplorer, 
row:pd.Series, hideBlanks:bool, annotation_keys:str) -> None: + """store annotations in an async way Args: - dmge: DataModelGraphExplorer object, - row: current row of manifest being processed - entityId (str): synapseId of entity to add annotations to - hideBlanks: Boolean flag that does not upload annotation keys with blank values when true. Uploads Annotation keys with empty string values when false. - annotation_keys: (str) display_label/class_label(default), Determines labeling syle for annotation keys. class_label will format the display - name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain - display label formatting while ensuring the label is formatted properly for Synapse annotations. - """ - # Format annotations for Synapse - annos = self.format_row_annotations( - dmge, row, entityId, hideBlanks, annotation_keys - ) - if annos: - annotation_class=Annotations(annotations=dict(annos), id=annos.id, etag=annos.etag) - await annotation_class.store_async(synapse_client=self.syn) + entityId (synapse entity id): synapse entity id + dmge (DataModelGraphExplorer): data model graph explorer + row (pd.Series): pandas series + hideBlanks (bool): if true, does not upload annotation keys with blank values. If false, Uploads Annotation keys with empty string values. + annotation_keys (str): annotation keys, default to "class_label" + """ + # get annotations asynchronously + requests = set() + get_annos = asyncio.create_task(self.get_async_annotation(entityId)) + requests.add(get_annos) + + while requests: + done_tasks, pending_tasks = await asyncio.wait( + requests, return_when=asyncio.FIRST_COMPLETED + ) + requests = pending_tasks + # after the task of getting annotation gets completed, + # store annotations + for completed_task in done_tasks: + try: + annos = completed_task.result() + + if isinstance(annos, Annotations): + logger.info("Successfully stored annotations: {annos}") + else: + # remove special characters in annotations + annos = self.format_row_annotations(dmge, row, annos, hideBlanks, annotation_keys) + requests.add( + asyncio.create_task(self.store_async_annotation(annotation_dict=annos)) + ) + except Exception as e: + logger.error(f"failed with { repr(e) }.") def _create_entity_id(self, idx, row, manifest, datasetId): """Helper function to generate an entityId and add it to the appropriate row in the manifest. @@ -1722,7 +1748,6 @@ async def add_annotations_to_entities_files( ).drop("entityId_x", axis=1) # Fill `entityId` for each row if missing and annotate entity as appropriate - set_annotations_requests=[] for idx, row in manifest.iterrows(): if not row["entityId"] and ( manifest_record_type == "file_and_entities" @@ -1742,16 +1767,8 @@ async def add_annotations_to_entities_files( # Adding annotations to connected files. 
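The coroutine `_get_store_annotations_async` above chains two awaits per entity: a task that fetches the current annotations is created first, and once `asyncio.wait(..., return_when=asyncio.FIRST_COMPLETED)` reports it done, a second task that stores the reformatted annotations is added to the same task set; the synchronous `upload_manifest_*` methods reach this code by wrapping `add_annotations_to_entities_files` in `asyncio.run(...)`. A minimal standalone sketch of that fetch-then-store pattern, with hypothetical `fetch()` and `store()` coroutines standing in for the Synapse calls:

    import asyncio

    async def fetch(entity_id: str) -> dict:
        # Placeholder for the "get annotations" call.
        await asyncio.sleep(0)
        return {"id": entity_id, "annotations": {}}

    async def store(bundle: dict) -> str:
        # Placeholder for the "store annotations" call.
        await asyncio.sleep(0)
        return bundle["id"]

    async def process(entity_ids: list[str]) -> None:
        # Seed the task set with one fetch per entity, then drain it with
        # FIRST_COMPLETED; every finished fetch enqueues a follow-up store task.
        tasks = {asyncio.create_task(fetch(eid)) for eid in entity_ids}
        while tasks:
            done, tasks = await asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED)
            for task in done:
                result = task.result()
                if isinstance(result, dict):  # a fetch finished, so chain a store
                    tasks.add(asyncio.create_task(store(result)))
                # otherwise a store finished; nothing further to chain

    asyncio.run(process(["syn1", "syn2"]))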
if entityId: - #self._add_annotations(dmge, row, entityId, hideBlanks, annotation_keys) - set_annotations_requests.append(asyncio.create_task(self._add_annotations_async(dmge, row, entityId, hideBlanks, annotation_keys))) + await self._get_store_annotations_async(entityId=entityId, dmge=dmge, row=row, hideBlanks=hideBlanks, annotation_keys=annotation_keys) logger.info(f"Added annotations to entity: {entityId}") - # execute all requests of setting annotations - responses = await asyncio.gather(*set_annotations_requests, return_exceptions=True) - - # handle errors - for response in responses: - if response: - logger.error(response) return manifest def upload_manifest_as_table( From 1fb602431aed6218a80716b4a0baf3e57c266f4d Mon Sep 17 00:00:00 2001 From: linglp Date: Tue, 11 Jun 2024 12:13:44 -0400 Subject: [PATCH 070/110] add type hinting --- schematic/store/synapse.py | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/schematic/store/synapse.py b/schematic/store/synapse.py index d3971d872..e5a059139 100644 --- a/schematic/store/synapse.py +++ b/schematic/store/synapse.py @@ -24,7 +24,7 @@ from time import sleep # allows specifying explicit variable types -from typing import Dict, List, Tuple, Sequence, Union, Optional +from typing import Dict, List, Tuple, Sequence, Union, Optional, Any from synapseclient import ( Synapse, @@ -1337,12 +1337,29 @@ def upload_manifest_file( return manifest_synapse_file_id - async def get_async_annotation(self, synapse_id): + async def get_async_annotation(self, synapse_id: str) -> Dict[str, Any]: + """get annotations asynchronously + + Args: + synapse_id (str): synapse id of the entity that the annotation belongs + + Returns: + Dict[str, Any]: The requested entity bundle matching + + """ return await get_entity_id_bundle2( entity_id=synapse_id, request={"includeAnnotations": True}, synapse_client=self.syn ) - async def store_async_annotation(self, annotation_dict) -> Annotations: + async def store_async_annotation(self, annotation_dict: dict) -> Annotations: + """store annotation in an async way + + Args: + annotation_dict (dict): annotation in a dictionary format + + Returns: + Annotations: The stored annotations. 
+ """ annotation_data = Annotations.from_dict( synapse_annotations=annotation_dict["annotations"]["annotations"] ) From 91336c9e61d624b078068cd03b9b7e99728f40f1 Mon Sep 17 00:00:00 2001 From: linglp Date: Tue, 11 Jun 2024 12:16:54 -0400 Subject: [PATCH 071/110] run black --- schematic/store/synapse.py | 105 +++++++++++++++++++++++-------------- 1 file changed, 65 insertions(+), 40 deletions(-) diff --git a/schematic/store/synapse.py b/schematic/store/synapse.py index e5a059139..53cf2b7bb 100644 --- a/schematic/store/synapse.py +++ b/schematic/store/synapse.py @@ -1336,21 +1336,23 @@ def upload_manifest_file( ) return manifest_synapse_file_id - + async def get_async_annotation(self, synapse_id: str) -> Dict[str, Any]: """get annotations asynchronously Args: - synapse_id (str): synapse id of the entity that the annotation belongs + synapse_id (str): synapse id of the entity that the annotation belongs Returns: Dict[str, Any]: The requested entity bundle matching """ return await get_entity_id_bundle2( - entity_id=synapse_id, request={"includeAnnotations": True}, synapse_client=self.syn + entity_id=synapse_id, + request={"includeAnnotations": True}, + synapse_client=self.syn, ) - + async def store_async_annotation(self, annotation_dict: dict) -> Annotations: """store annotation in an async way @@ -1368,7 +1370,7 @@ async def store_async_annotation(self, annotation_dict: dict) -> Annotations: etag=annotation_dict["annotations"]["etag"], id=annotation_dict["annotations"]["id"], ) - return await annotation_class.store_async(self.syn) + return await annotation_class.store_async(self.syn) @missing_entity_handler def format_row_annotations( @@ -1668,17 +1670,24 @@ def _add_annotations( self.syn.set_annotations(annos) return - async def _get_store_annotations_async(self, entityId:str, dmge: DataModelGraphExplorer, row:pd.Series, hideBlanks:bool, annotation_keys:str) -> None: + async def _get_store_annotations_async( + self, + entityId: str, + dmge: DataModelGraphExplorer, + row: pd.Series, + hideBlanks: bool, + annotation_keys: str, + ) -> None: """store annotations in an async way Args: entityId (synapse entity id): synapse entity id dmge (DataModelGraphExplorer): data model graph explorer - row (pd.Series): pandas series - hideBlanks (bool): if true, does not upload annotation keys with blank values. If false, Uploads Annotation keys with empty string values. + row (pd.Series): pandas series + hideBlanks (bool): if true, does not upload annotation keys with blank values. If false, Uploads Annotation keys with empty string values. 
annotation_keys (str): annotation keys, default to "class_label" """ - # get annotations asynchronously + # get annotations asynchronously requests = set() get_annos = asyncio.create_task(self.get_async_annotation(entityId)) requests.add(get_annos) @@ -1688,7 +1697,7 @@ async def _get_store_annotations_async(self, entityId:str, dmge: DataModelGraphE requests, return_when=asyncio.FIRST_COMPLETED ) requests = pending_tasks - # after the task of getting annotation gets completed, + # after the task of getting annotation gets completed, # store annotations for completed_task in done_tasks: try: @@ -1698,12 +1707,16 @@ async def _get_store_annotations_async(self, entityId:str, dmge: DataModelGraphE logger.info("Successfully stored annotations: {annos}") else: # remove special characters in annotations - annos = self.format_row_annotations(dmge, row, annos, hideBlanks, annotation_keys) + annos = self.format_row_annotations( + dmge, row, annos, hideBlanks, annotation_keys + ) requests.add( - asyncio.create_task(self.store_async_annotation(annotation_dict=annos)) + asyncio.create_task( + self.store_async_annotation(annotation_dict=annos) + ) ) except Exception as e: - logger.error(f"failed with { repr(e) }.") + logger.error(f"failed with { repr(e) }.") def _create_entity_id(self, idx, row, manifest, datasetId): """Helper function to generate an entityId and add it to the appropriate row in the manifest. @@ -1784,7 +1797,13 @@ async def add_annotations_to_entities_files( # Adding annotations to connected files. if entityId: - await self._get_store_annotations_async(entityId=entityId, dmge=dmge, row=row, hideBlanks=hideBlanks, annotation_keys=annotation_keys) + await self._get_store_annotations_async( + entityId=entityId, + dmge=dmge, + row=row, + hideBlanks=hideBlanks, + annotation_keys=annotation_keys, + ) logger.info(f"Added annotations to entity: {entityId}") return manifest @@ -1838,15 +1857,17 @@ def upload_manifest_as_table( ) if file_annotations_upload: - manifest = asyncio.run(self.add_annotations_to_entities_files( - dmge, - manifest, - manifest_record_type, - datasetId, - hideBlanks, - manifest_synapse_table_id, - annotation_keys, - )) + manifest = asyncio.run( + self.add_annotations_to_entities_files( + dmge, + manifest, + manifest_record_type, + datasetId, + hideBlanks, + manifest_synapse_table_id, + annotation_keys, + ) + ) # Load manifest to synapse as a CSV File manifest_synapse_file_id = self.upload_manifest_file( manifest, @@ -1911,14 +1932,16 @@ def upload_manifest_as_csv( manifest_synapse_file_id (str): SynID of manifest csv uploaded to synapse. 
""" if file_annotations_upload: - manifest = asyncio.run(self.add_annotations_to_entities_files( - dmge, - manifest, - manifest_record_type, - datasetId, - hideBlanks, - annotation_keys=annotation_keys, - )) + manifest = asyncio.run( + self.add_annotations_to_entities_files( + dmge, + manifest, + manifest_record_type, + datasetId, + hideBlanks, + annotation_keys=annotation_keys, + ) + ) # Load manifest to synapse as a CSV File manifest_synapse_file_id = self.upload_manifest_file( @@ -1988,15 +2011,17 @@ def upload_manifest_combo( ) if file_annotations_upload: - manifest = asyncio.run(self.add_annotations_to_entities_files( - dmge, - manifest, - manifest_record_type, - datasetId, - hideBlanks, - manifest_synapse_table_id, - annotation_keys=annotation_keys, - )) + manifest = asyncio.run( + self.add_annotations_to_entities_files( + dmge, + manifest, + manifest_record_type, + datasetId, + hideBlanks, + manifest_synapse_table_id, + annotation_keys=annotation_keys, + ) + ) # Load manifest to synapse as a CSV File manifest_synapse_file_id = self.upload_manifest_file( From d6873cf74f0dfd5877333ef5a2b89c57df7fa94c Mon Sep 17 00:00:00 2001 From: linglp Date: Tue, 11 Jun 2024 12:44:04 -0400 Subject: [PATCH 072/110] remove unused package --- schematic_api/api/__init__.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/schematic_api/api/__init__.py b/schematic_api/api/__init__.py index b04ffada0..01d2bbb36 100644 --- a/schematic_api/api/__init__.py +++ b/schematic_api/api/__init__.py @@ -5,8 +5,6 @@ from schematic import CONFIG from jaeger_client import Config from flask_opentracing import FlaskTracer -import traceback -import jsonify config = Config( config={ From e62597b4fc42e152e8b644262b21ac112b1ba9bf Mon Sep 17 00:00:00 2001 From: linglp Date: Wed, 12 Jun 2024 23:18:21 -0400 Subject: [PATCH 073/110] delete unused funct --- schematic/store/synapse.py | 91 ++++++++++++++------------------------ 1 file changed, 33 insertions(+), 58 deletions(-) diff --git a/schematic/store/synapse.py b/schematic/store/synapse.py index 53cf2b7bb..4a80903f5 100644 --- a/schematic/store/synapse.py +++ b/schematic/store/synapse.py @@ -71,6 +71,7 @@ from schematic.configuration.configuration import CONFIG from synapseclient.models.annotations import Annotations import asyncio +from dataclasses import asdict logger = logging.getLogger("Synapse storage") @@ -1373,8 +1374,8 @@ async def store_async_annotation(self, annotation_dict: dict) -> Annotations: return await annotation_class.store_async(self.syn) @missing_entity_handler - def format_row_annotations( - self, dmge, row, annos: dict, hideBlanks: bool, annotation_keys: str + async def format_row_annotations( + self, dmge, row, entityId: str, hideBlanks: bool, annotation_keys: str ): # prepare metadata for Synapse storage (resolve display name into a name that Synapse annotations support (e.g no spaces, parenthesis) # note: the removal of special characters, will apply only to annotation keys; we are not altering the manifest @@ -1405,6 +1406,8 @@ def format_row_annotations( metadataSyn[keySyn] = v # set annotation(s) for the various objects/items in a dataset on Synapse + annos = await self.get_async_annotation(entityId) + csv_list_regex = comma_separated_list_regex() for anno_k, anno_v in metadataSyn.items(): # Remove keys with nan or empty string values from dict of annotations to be uploaded @@ -1670,54 +1673,6 @@ def _add_annotations( self.syn.set_annotations(annos) return - async def _get_store_annotations_async( - self, - entityId: str, - dmge: 
DataModelGraphExplorer, - row: pd.Series, - hideBlanks: bool, - annotation_keys: str, - ) -> None: - """store annotations in an async way - - Args: - entityId (synapse entity id): synapse entity id - dmge (DataModelGraphExplorer): data model graph explorer - row (pd.Series): pandas series - hideBlanks (bool): if true, does not upload annotation keys with blank values. If false, Uploads Annotation keys with empty string values. - annotation_keys (str): annotation keys, default to "class_label" - """ - # get annotations asynchronously - requests = set() - get_annos = asyncio.create_task(self.get_async_annotation(entityId)) - requests.add(get_annos) - - while requests: - done_tasks, pending_tasks = await asyncio.wait( - requests, return_when=asyncio.FIRST_COMPLETED - ) - requests = pending_tasks - # after the task of getting annotation gets completed, - # store annotations - for completed_task in done_tasks: - try: - annos = completed_task.result() - - if isinstance(annos, Annotations): - logger.info("Successfully stored annotations: {annos}") - else: - # remove special characters in annotations - annos = self.format_row_annotations( - dmge, row, annos, hideBlanks, annotation_keys - ) - requests.add( - asyncio.create_task( - self.store_async_annotation(annotation_dict=annos) - ) - ) - except Exception as e: - logger.error(f"failed with { repr(e) }.") - def _create_entity_id(self, idx, row, manifest, datasetId): """Helper function to generate an entityId and add it to the appropriate row in the manifest. Args: @@ -1778,6 +1733,7 @@ async def add_annotations_to_entities_files( ).drop("entityId_x", axis=1) # Fill `entityId` for each row if missing and annotate entity as appropriate + requests=set() for idx, row in manifest.iterrows(): if not row["entityId"] and ( manifest_record_type == "file_and_entities" @@ -1797,14 +1753,33 @@ async def add_annotations_to_entities_files( # Adding annotations to connected files. 
if entityId: - await self._get_store_annotations_async( - entityId=entityId, - dmge=dmge, - row=row, - hideBlanks=hideBlanks, - annotation_keys=annotation_keys, - ) - logger.info(f"Added annotations to entity: {entityId}") + + # Format annotations for Synapse + annos_task = asyncio.create_task(self.format_row_annotations( + dmge, row, entityId, hideBlanks, annotation_keys + )) + requests.add(annos_task) + + while requests: + done_tasks, pending_tasks = await asyncio.wait(requests, return_when=asyncio.FIRST_COMPLETED) + requests = pending_tasks + + for completed_task in done_tasks: + try: + annos = completed_task.result() + + if isinstance(annos, Annotations): + annos_dict = asdict(annos) + entity_id = annos_dict["id"] + logger.info(f"Successfully stored annotations for {entity_id}") + else: + # remove special characters in annotations + entity_id = annos["EntityId"] + logger.info(f"Got annotations for {entity_id} entity") + requests.add(asyncio.create_task(self.store_async_annotation(annotation_dict=annos))) + + except Exception as e: + raise RuntimeError(f"failed with { repr(e) }.") return manifest def upload_manifest_as_table( From 33c33eccdd04c982ac867ce648d701579782d977 Mon Sep 17 00:00:00 2001 From: linglp Date: Wed, 12 Jun 2024 23:35:47 -0400 Subject: [PATCH 074/110] preserve current behavior; add annos --- schematic/store/synapse.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/schematic/store/synapse.py b/schematic/store/synapse.py index 4a80903f5..ebdbf8de1 100644 --- a/schematic/store/synapse.py +++ b/schematic/store/synapse.py @@ -1775,8 +1775,9 @@ async def add_annotations_to_entities_files( else: # remove special characters in annotations entity_id = annos["EntityId"] - logger.info(f"Got annotations for {entity_id} entity") - requests.add(asyncio.create_task(self.store_async_annotation(annotation_dict=annos))) + logger.info(f"Obtained and processed annotations for {entity_id} entity") + if annos: + requests.add(asyncio.create_task(self.store_async_annotation(annotation_dict=annos))) except Exception as e: raise RuntimeError(f"failed with { repr(e) }.") From a09ea37a981b45fba89fb5db7071c878be5cfa1b Mon Sep 17 00:00:00 2001 From: linglp Date: Thu, 13 Jun 2024 07:30:15 -0400 Subject: [PATCH 075/110] run black --- schematic/store/synapse.py | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/schematic/store/synapse.py b/schematic/store/synapse.py index ebdbf8de1..ac18fc1ff 100644 --- a/schematic/store/synapse.py +++ b/schematic/store/synapse.py @@ -1733,7 +1733,7 @@ async def add_annotations_to_entities_files( ).drop("entityId_x", axis=1) # Fill `entityId` for each row if missing and annotate entity as appropriate - requests=set() + requests = set() for idx, row in manifest.iterrows(): if not row["entityId"] and ( manifest_record_type == "file_and_entities" @@ -1753,15 +1753,18 @@ async def add_annotations_to_entities_files( # Adding annotations to connected files. 
if entityId: - # Format annotations for Synapse - annos_task = asyncio.create_task(self.format_row_annotations( - dmge, row, entityId, hideBlanks, annotation_keys - )) + annos_task = asyncio.create_task( + self.format_row_annotations( + dmge, row, entityId, hideBlanks, annotation_keys + ) + ) requests.add(annos_task) while requests: - done_tasks, pending_tasks = await asyncio.wait(requests, return_when=asyncio.FIRST_COMPLETED) + done_tasks, pending_tasks = await asyncio.wait( + requests, return_when=asyncio.FIRST_COMPLETED + ) requests = pending_tasks for completed_task in done_tasks: @@ -1771,13 +1774,23 @@ async def add_annotations_to_entities_files( if isinstance(annos, Annotations): annos_dict = asdict(annos) entity_id = annos_dict["id"] - logger.info(f"Successfully stored annotations for {entity_id}") + logger.info( + f"Successfully stored annotations for {entity_id}" + ) else: # remove special characters in annotations entity_id = annos["EntityId"] - logger.info(f"Obtained and processed annotations for {entity_id} entity") + logger.info( + f"Obtained and processed annotations for {entity_id} entity" + ) if annos: - requests.add(asyncio.create_task(self.store_async_annotation(annotation_dict=annos))) + requests.add( + asyncio.create_task( + self.store_async_annotation( + annotation_dict=annos + ) + ) + ) except Exception as e: raise RuntimeError(f"failed with { repr(e) }.") From 56a64c31558b5127e19f982db5d5adf56600ac79 Mon Sep 17 00:00:00 2001 From: linglp Date: Thu, 13 Jun 2024 09:15:53 -0400 Subject: [PATCH 076/110] update to use synapseclient 4.3.0 --- poetry.lock | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/poetry.lock b/poetry.lock index 46ff137d6..9295cc34f 100644 --- a/poetry.lock +++ b/poetry.lock @@ -4331,13 +4331,13 @@ Jinja2 = ">=2.0" [[package]] name = "synapseclient" -version = "4.2.0" +version = "4.3.0" description = "A client for Synapse, a collaborative, open-source research platform that allows teams to share data, track analyses, and collaborate." 
optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "synapseclient-4.2.0-py3-none-any.whl", hash = "sha256:ab5bc9c2bf5b90f271f1a9478eff7e9fca3e573578401ac706383ddb984d7a13"},
-    {file = "synapseclient-4.2.0.tar.gz", hash = "sha256:89222661125de1795b1a096cf8c58b8115c19d6b0fa5846ed2a41cdb394ef773"},
+    {file = "synapseclient-4.3.0-py3-none-any.whl", hash = "sha256:5d8107cfff4031a0a46d60a3c9a8120300190fa27df4983d883dc951d8bd885f"},
+    {file = "synapseclient-4.3.0.tar.gz", hash = "sha256:a1149a64b3281669d42c69e210677a902478b8f6b302966d518473c7384f6387"},
 ]
 
 [package.dependencies]
@@ -4357,11 +4357,11 @@ urllib3 = ">=1.26.18,<2"
 
 [package.extras]
 boto3 = ["boto3 (>=1.7.0,<2.0)"]
-dev = ["black", "flake8 (>=3.7.0,<4.0)", "func-timeout (>=4.3,<5.0)", "pre-commit", "pytest (>=6.0.0,<7.0)", "pytest-asyncio (>=0.19,<1.0)", "pytest-cov (>=4.1.0,<4.2.0)", "pytest-mock (>=3.0,<4.0)", "pytest-rerunfailures (>=12.0,<13.0)", "pytest-socket (>=0.6.0,<0.7.0)", "pytest-xdist[psutil] (>=2.2,<3.0.0)"]
+dev = ["black", "flake8 (>=3.7.0,<4.0)", "func-timeout (>=4.3,<5.0)", "pandas (>=1.5,<3.0)", "pre-commit", "pytest (>=7.0.0,<8.0)", "pytest-asyncio (>=0.23.6,<1.0)", "pytest-cov (>=4.1.0,<4.2.0)", "pytest-mock (>=3.0,<4.0)", "pytest-rerunfailures (>=12.0,<13.0)", "pytest-socket (>=0.6.0,<0.7.0)", "pytest-xdist[psutil] (>=2.2,<3.0.0)"]
 docs = ["markdown-include (>=0.8.1,<0.9.0)", "mkdocs (>=1.5.3)", "mkdocs-material (>=9.4.14)", "mkdocs-open-in-new-tab (>=1.0.3,<1.1.0)", "mkdocstrings (>=0.24.0)", "mkdocstrings-python (>=1.7.5)", "termynal (>=0.11.1)"]
 pandas = ["pandas (>=1.5,<3.0)"]
 pysftp = ["pysftp (>=0.2.8,<0.3)"]
-tests = ["flake8 (>=3.7.0,<4.0)", "func-timeout (>=4.3,<5.0)", "pytest (>=6.0.0,<7.0)", "pytest-asyncio (>=0.19,<1.0)", "pytest-cov (>=4.1.0,<4.2.0)", "pytest-mock (>=3.0,<4.0)", "pytest-rerunfailures (>=12.0,<13.0)", "pytest-socket (>=0.6.0,<0.7.0)", "pytest-xdist[psutil] (>=2.2,<3.0.0)"]
+tests = ["flake8 (>=3.7.0,<4.0)", "func-timeout (>=4.3,<5.0)", "pandas (>=1.5,<3.0)", "pytest (>=7.0.0,<8.0)", "pytest-asyncio (>=0.23.6,<1.0)", "pytest-cov (>=4.1.0,<4.2.0)", "pytest-mock (>=3.0,<4.0)", "pytest-rerunfailures (>=12.0,<13.0)", "pytest-socket (>=0.6.0,<0.7.0)", "pytest-xdist[psutil] (>=2.2,<3.0.0)"]
 
 [[package]]
 name = "tabulate"
From d32bbbd03b4d58fed0cbbd678f2bda8c86acc1a0 Mon Sep 17 00:00:00 2001
From: linglp
Date: Thu, 13 Jun 2024 09:18:13 -0400
Subject: [PATCH 077/110] update synapseclient version

---
 poetry.lock    | 2 +-
 pyproject.toml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/poetry.lock b/poetry.lock
index 9295cc34f..5a2d51e75 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -4864,4 +4864,4 @@ aws = ["uWSGI"]
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.9.0,<3.11"
-content-hash = "fafb9420f9ac503b9e3a2f1c25321ea8eb59332eb82f01b418f7096def4488c5"
+content-hash = "c47e0d03588f80b3f5b9ed8249317e45caf6ddccc4a301c76bcee099151605f2"
diff --git a/pyproject.toml b/pyproject.toml
index 87f203b1d..150508e81 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -54,7 +54,7 @@ pygsheets = "^2.0.4"
 PyYAML = "^6.0.0"
 rdflib = "^6.0.0"
 setuptools = "^66.0.0"
-synapseclient = "^4.1.0"
+synapseclient = "^4.3.0"
 tenacity = "^8.0.1"
 toml = "^0.10.2"
 great-expectations = "^0.15.0"
From 16e62d41f8c95a11b0a3f37ecc1bf5e8b4aec0db Mon Sep 17 00:00:00 2001
From: linglp
Date: Thu, 13 Jun 2024 13:10:26 -0400
Subject: [PATCH 078/110] build separate function; raise error

---
 schematic/store/synapse.py | 68 ++++++++++++++++++++------------------
 1 file changed, 35 insertions(+), 33
deletions(-) diff --git a/schematic/store/synapse.py b/schematic/store/synapse.py index ac18fc1ff..eec31e384 100644 --- a/schematic/store/synapse.py +++ b/schematic/store/synapse.py @@ -1692,6 +1692,40 @@ def _create_entity_id(self, idx, row, manifest, datasetId): manifest.loc[idx, "entityId"] = entityId return manifest, entityId + async def _store_annos(self, requests): + while requests: + done_tasks, pending_tasks = await asyncio.wait( + requests, return_when=asyncio.FIRST_COMPLETED + ) + requests = pending_tasks + + for completed_task in done_tasks: + try: + annos = completed_task.result() + + if isinstance(annos, Annotations): + annos_dict = asdict(annos) + entity_id = annos_dict["id"] + logger.info( + f"Successfully stored annotations for {entity_id}" + ) + else: + # remove special characters in annotations + entity_id = annos["EntityId"] + logger.info( + f"Obtained and processed annotations for {entity_id} entity" + ) + if annos: + requests.add( + asyncio.create_task( + self.store_async_annotation( + annotation_dict=annos + ) + ) + ) + except Exception as e: + raise RuntimeError(f"failed with { repr(e) }.") from e + async def add_annotations_to_entities_files( self, dmge, @@ -1760,40 +1794,8 @@ async def add_annotations_to_entities_files( ) ) requests.add(annos_task) + self._store_annos(requests) - while requests: - done_tasks, pending_tasks = await asyncio.wait( - requests, return_when=asyncio.FIRST_COMPLETED - ) - requests = pending_tasks - - for completed_task in done_tasks: - try: - annos = completed_task.result() - - if isinstance(annos, Annotations): - annos_dict = asdict(annos) - entity_id = annos_dict["id"] - logger.info( - f"Successfully stored annotations for {entity_id}" - ) - else: - # remove special characters in annotations - entity_id = annos["EntityId"] - logger.info( - f"Obtained and processed annotations for {entity_id} entity" - ) - if annos: - requests.add( - asyncio.create_task( - self.store_async_annotation( - annotation_dict=annos - ) - ) - ) - - except Exception as e: - raise RuntimeError(f"failed with { repr(e) }.") return manifest def upload_manifest_as_table( From 38249640148c9019dc42dedb4c332f76dc956adb Mon Sep 17 00:00:00 2001 From: linglp Date: Thu, 13 Jun 2024 14:14:08 -0400 Subject: [PATCH 079/110] use pytest asyncio --- poetry.lock | 20 +++++++++++++++++++- pyproject.toml | 1 + 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/poetry.lock b/poetry.lock index 5a2d51e75..8761b052f 100644 --- a/poetry.lock +++ b/poetry.lock @@ -3193,6 +3193,24 @@ tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} [package.extras] testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] +[[package]] +name = "pytest-asyncio" +version = "0.23.7" +description = "Pytest support for asyncio" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pytest_asyncio-0.23.7-py3-none-any.whl", hash = "sha256:009b48127fbe44518a547bddd25611551b0e43ccdbf1e67d12479f569832c20b"}, + {file = "pytest_asyncio-0.23.7.tar.gz", hash = "sha256:5f5c72948f4c49e7db4f29f2521d4031f1c27f86e57b046126654083d4770268"}, +] + +[package.dependencies] +pytest = ">=7.0.0,<9" + +[package.extras] +docs = ["sphinx (>=5.3)", "sphinx-rtd-theme (>=1.0)"] +testing = ["coverage (>=6.2)", "hypothesis (>=5.7.1)"] + [[package]] name = "pytest-cov" version = "4.1.0" @@ -4864,4 +4882,4 @@ aws = ["uWSGI"] [metadata] lock-version = "2.0" python-versions = ">=3.9.0,<3.11" -content-hash = 
"c47e0d03588f80b3f5b9ed8249317e45caf6ddccc4a301c76bcee099151605f2" +content-hash = "a3048c0808e73fd19f5175897e9dda47a2a593422dd4744886615ac453a42139" diff --git a/pyproject.toml b/pyproject.toml index 150508e81..aa0939b5d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -75,6 +75,7 @@ Flask-Cors = {version = "^3.0.10", optional = true} uWSGI = {version = "^2.0.21", optional = true} Jinja2 = {version = ">2.11.3", optional = true} asyncio = "^3.4.3" +pytest-asyncio = "^0.23.7" [tool.poetry.extras] api = ["connexion", "Flask", "Flask-Cors", "Jinja2", "pyopenssl"] From 1efb1349b6a90098db284f3c07da3b385798a87d Mon Sep 17 00:00:00 2001 From: linglp Date: Thu, 13 Jun 2024 17:45:19 -0400 Subject: [PATCH 080/110] add test --- tests/test_store.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tests/test_store.py b/tests/test_store.py index 06fa4bf23..88e44e923 100644 --- a/tests/test_store.py +++ b/tests/test_store.py @@ -28,6 +28,7 @@ SynapseStorage ) from schematic.utils.general import check_synapse_cache_size +from unittest.mock import AsyncMock logging.basicConfig(level=logging.DEBUG) logger = logging.getLogger(__name__) @@ -483,6 +484,20 @@ def test_get_files_metadata_from_dataset(self, synapse_store): "entityId": ["syn123", "syn456"], } + async def test_get_async_annotation(self, synapse_store): + mock_syn_id = "syn1234" + + with patch("schematic.store.synapse.get_entity_id_bundle2", new_callable=AsyncMock, return_value="mock") as mock_get_entity_id_bundle2: + mock_get_entity_id_bundle2.return_value="mock" + result = await synapse_store.get_async_annotation(synapse_id=mock_syn_id) + + mock_get_entity_id_bundle2.assert_called_once_with( + entity_id=mock_syn_id, + request={"includeAnnotations": True}, + synapse_client=synapse_store.syn, + ) + assert result == "mock" + class TestDatasetFileView: def test_init(self, dataset_id, dataset_fileview, synapse_store): From 2e53a3335a72731734455fe70cfd6ceca8ad4f8b Mon Sep 17 00:00:00 2001 From: linglp Date: Fri, 14 Jun 2024 00:11:13 -0400 Subject: [PATCH 081/110] fix store_annos --- schematic/store/synapse.py | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/schematic/store/synapse.py b/schematic/store/synapse.py index eec31e384..91aa602bb 100644 --- a/schematic/store/synapse.py +++ b/schematic/store/synapse.py @@ -1695,8 +1695,8 @@ def _create_entity_id(self, idx, row, manifest, datasetId): async def _store_annos(self, requests): while requests: done_tasks, pending_tasks = await asyncio.wait( - requests, return_when=asyncio.FIRST_COMPLETED - ) + requests, return_when=asyncio.FIRST_COMPLETED + ) requests = pending_tasks for completed_task in done_tasks: @@ -1706,9 +1706,7 @@ async def _store_annos(self, requests): if isinstance(annos, Annotations): annos_dict = asdict(annos) entity_id = annos_dict["id"] - logger.info( - f"Successfully stored annotations for {entity_id}" - ) + logger.info(f"Successfully stored annotations for {entity_id}") else: # remove special characters in annotations entity_id = annos["EntityId"] @@ -1718,9 +1716,7 @@ async def _store_annos(self, requests): if annos: requests.add( asyncio.create_task( - self.store_async_annotation( - annotation_dict=annos - ) + self.store_async_annotation(annotation_dict=annos) ) ) except Exception as e: @@ -1794,8 +1790,7 @@ async def add_annotations_to_entities_files( ) ) requests.add(annos_task) - self._store_annos(requests) - + await self._store_annos(requests) return manifest def upload_manifest_as_table( From 
f1576696826bfd1f0e27290412227d8f9b6958c9 Mon Sep 17 00:00:00 2001 From: linglp Date: Fri, 14 Jun 2024 06:43:44 -0400 Subject: [PATCH 082/110] add test_get_async_annotation --- tests/test_store.py | 71 +++++++++++++++++++++++++++++++++++---------- 1 file changed, 56 insertions(+), 15 deletions(-) diff --git a/tests/test_store.py b/tests/test_store.py index 88e44e923..29e91331b 100644 --- a/tests/test_store.py +++ b/tests/test_store.py @@ -22,13 +22,10 @@ from tests.conftest import Helpers from schematic.store.base import BaseStorage -from schematic.store.synapse import ( - DatasetFileView, - ManifestDownload, - SynapseStorage -) +from schematic.store.synapse import DatasetFileView, ManifestDownload, SynapseStorage from schematic.utils.general import check_synapse_cache_size from unittest.mock import AsyncMock +from synapseclient.models import Annotations logging.basicConfig(level=logging.DEBUG) logger = logging.getLogger(__name__) @@ -132,7 +129,7 @@ def test_init(self): class TestSynapseStorage: "Tests the SynapseStorage class" - def test_init(self, synapse_store:SynapseStorage) -> None: + def test_init(self, synapse_store: SynapseStorage) -> None: """Tests SynapseStorage.__init__""" assert synapse_store.storageFileview == "syn23643253" assert isinstance(synapse_store.storageFileviewTable, pd.DataFrame) @@ -143,8 +140,7 @@ def test__purge_synapse_cache(self) -> None: synapse_store = SynapseStorage(synapse_cache_path="test_cache_dir") size_before_purge = check_synapse_cache_size(synapse_store.root_synapse_cache) synapse_store._purge_synapse_cache( - maximum_storage_allowed_cache_gb=0.000001, - minute_buffer=0 + maximum_storage_allowed_cache_gb=0.000001, minute_buffer=0 ) size_after_purge = check_synapse_cache_size(synapse_store.root_synapse_cache) assert size_before_purge > size_after_purge @@ -158,7 +154,7 @@ def test_login(self) -> None: assert synapse_client.cache.cache_root_dir == "test_cache_dir" shutil.rmtree("test_cache_dir") - def test_getFileAnnotations(self, synapse_store:SynapseStorage) -> None: + def test_getFileAnnotations(self, synapse_store: SynapseStorage) -> None: expected_dict = { "author": "bruno, milen, sujay", "impact": "42.9", @@ -221,17 +217,17 @@ def test_get_file_entityIds(self, helpers, synapse_store, only_new_files): {"CheckInt": "7", "CheckList": "valid, list, values"}, "syn34295552", "file_and_entities", - "annotations_test_manifest.csv" + "annotations_test_manifest.csv", ), ( {"FileFormat": "BAM", "GenomeBuild": "GRCh38"}, "syn39241199", "table_and_file", - "test_BulkRNAseq.csv" + "test_BulkRNAseq.csv", ), ], ids=["non file-based", "file-based"], - indirect=["temporary_file_copy"] + indirect=["temporary_file_copy"], ) def test_annotation_submission( self, @@ -484,11 +480,16 @@ def test_get_files_metadata_from_dataset(self, synapse_store): "entityId": ["syn123", "syn456"], } - async def test_get_async_annotation(self, synapse_store): + async def test_get_async_annotation(self, synapse_store: SynapseStorage) -> None: + """test get annotation async function""" mock_syn_id = "syn1234" - with patch("schematic.store.synapse.get_entity_id_bundle2", new_callable=AsyncMock, return_value="mock") as mock_get_entity_id_bundle2: - mock_get_entity_id_bundle2.return_value="mock" + with patch( + "schematic.store.synapse.get_entity_id_bundle2", + new_callable=AsyncMock, + return_value="mock", + ) as mock_get_entity_id_bundle2: + mock_get_entity_id_bundle2.return_value = "mock" result = await synapse_store.get_async_annotation(synapse_id=mock_syn_id) 
mock_get_entity_id_bundle2.assert_called_once_with( @@ -498,6 +499,46 @@ async def test_get_async_annotation(self, synapse_store): ) assert result == "mock" + async def test_store_async_annotation(self, synapse_store: SynapseStorage) -> None: + """test store annotations async function""" + annos_dict = { + "annotations": { + "id": "mock_syn_id", + "etag": "mock etag", + "annotations": { + "Id": {"type": "STRING", "value": ["mock value"]}, + "EntityId": {"type": "STRING", "value": ["mock_syn_id"]}, + "SampleID": {"type": "STRING", "value": [""]}, + "Component": {"type": "STRING", "value": ["mock value"]}, + }, + }, + "FileFormat": "mock format", + "Component": "mock component", + "Id": "mock_string", + "EntityId": "mock_id", + } + expected_dict = Annotations( + annotations={ + "Id": ["mock_string"], + "EntityId": ["mock_syn_id"], + "SampleID": [""], + "Component": ["mock value"], + "FileFormat": ["mock_format"], + }, + etag="mock etag", + id="mock syn_id", + ) + + with patch( + "schematic.store.synapse.Annotations.store_async", + new_callable=AsyncMock, + return_value=expected_dict, + ) as mock_store_async: + result = await synapse_store.store_async_annotation(annos_dict) + + mock_store_async.assert_called_once_with(synapse_store.syn) + assert isinstance(result, Annotations) + class TestDatasetFileView: def test_init(self, dataset_id, dataset_fileview, synapse_store): From ca629a2ad3e00b84c1afd3b5bb977d09604ed49a Mon Sep 17 00:00:00 2001 From: linglp Date: Fri, 14 Jun 2024 14:35:25 -0400 Subject: [PATCH 083/110] add isort in pre-commit --- .pre-commit-config.yaml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 623446ced..97e7c80cb 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -10,4 +10,10 @@ repos: # pre-commit's default_language_version, see # https://pre-commit.com/#top_level-default_language_version language_version: python3.10 - files: schematic/ \ No newline at end of file + files: schematic/ + + - repo: https://github.com/pycqa/isort + rev: 5.13.2 + hooks: + - id: isort + name: isort (python) \ No newline at end of file From ac18c76f8223e7cd461399c68990a7d0cf93e464 Mon Sep 17 00:00:00 2001 From: linglp Date: Fri, 14 Jun 2024 15:20:51 -0400 Subject: [PATCH 084/110] add assert statement --- tests/test_store.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_store.py b/tests/test_store.py index 29e91331b..da193282d 100644 --- a/tests/test_store.py +++ b/tests/test_store.py @@ -537,6 +537,7 @@ async def test_store_async_annotation(self, synapse_store: SynapseStorage) -> No result = await synapse_store.store_async_annotation(annos_dict) mock_store_async.assert_called_once_with(synapse_store.syn) + assert result == expected_dict assert isinstance(result, Annotations) From 1ae73359762d51004179ecbcd4340a02f52b0a84 Mon Sep 17 00:00:00 2001 From: linglp Date: Fri, 14 Jun 2024 15:48:40 -0400 Subject: [PATCH 085/110] rename and add typing --- schematic/store/synapse.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/schematic/store/synapse.py b/schematic/store/synapse.py index 91aa602bb..3c1137da0 100644 --- a/schematic/store/synapse.py +++ b/schematic/store/synapse.py @@ -24,7 +24,7 @@ from time import sleep # allows specifying explicit variable types -from typing import Dict, List, Tuple, Sequence, Union, Optional, Any +from typing import Dict, List, Tuple, Sequence, Union, Optional, Any, Set from synapseclient import ( Synapse, @@ -1692,7 
+1692,15 @@ def _create_entity_id(self, idx, row, manifest, datasetId): manifest.loc[idx, "entityId"] = entityId return manifest, entityId - async def _store_annos(self, requests): + async def _process_store_annos(self, requests: Set[asyncio.Task]) -> None: + """Process annotations and store them on synapse asynchronously + + Args: + requests (Set[asyncio.Task]): a set of tasks of formatting annotations created by format_row_annotations function in previous step + + Raises: + RuntimeError: raise a run time error if a task failed to complete + """ while requests: done_tasks, pending_tasks = await asyncio.wait( requests, return_when=asyncio.FIRST_COMPLETED @@ -1790,7 +1798,7 @@ async def add_annotations_to_entities_files( ) ) requests.add(annos_task) - await self._store_annos(requests) + await self._process_store_annos(requests) return manifest def upload_manifest_as_table( From c8a976bd0578150d78b0fbab5a4776847f113f8e Mon Sep 17 00:00:00 2001 From: linglp Date: Sat, 15 Jun 2024 18:04:15 -0400 Subject: [PATCH 086/110] add test of _add_annos and edit comment --- schematic/store/synapse.py | 1 - tests/test_store.py | 84 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 84 insertions(+), 1 deletion(-) diff --git a/schematic/store/synapse.py b/schematic/store/synapse.py index 3c1137da0..e6fcac7af 100644 --- a/schematic/store/synapse.py +++ b/schematic/store/synapse.py @@ -1716,7 +1716,6 @@ async def _process_store_annos(self, requests: Set[asyncio.Task]) -> None: entity_id = annos_dict["id"] logger.info(f"Successfully stored annotations for {entity_id}") else: - # remove special characters in annotations entity_id = annos["EntityId"] logger.info( f"Obtained and processed annotations for {entity_id} entity" diff --git a/tests/test_store.py b/tests/test_store.py index da193282d..def4b6d3b 100644 --- a/tests/test_store.py +++ b/tests/test_store.py @@ -9,6 +9,7 @@ from typing import Generator, Any from unittest.mock import patch import shutil +import asyncio import pandas as pd import pytest @@ -541,6 +542,89 @@ async def test_store_async_annotation(self, synapse_store: SynapseStorage) -> No assert isinstance(result, Annotations) + async def test_process_store_annos_failure(self, synapse_store: SynapseStorage) -> None: + """test _process_store_annos function when there's an error either getting or storing annotations + """ + async def mock_failure_coro(): + await asyncio.sleep(0.1) + raise ValueError("sample error") + + # create tasks that will fail + tasks = set() + tasks.add(asyncio.create_task(mock_failure_coro())) + + synapse_store._process_store_annos + # make sure error message can be raised + with pytest.raises(RuntimeError, match="failed with"): + await synapse_store._process_store_annos(tasks) + + async def test_process_store_annos_success_store(self, synapse_store: SynapseStorage) -> None: + """test _process_store_annos function and make sure that annotations can be stored after successfully getting annotations. 
+ """ + # mock annotation obtained after async_store + stored_annos = Annotations( + annotations={ + "Id": ["mock_string"], + "EntityId": ["mock_syn_id"], + "SampleID": [""], + "Component": ["mock value"], + "FileFormat": ["mock_format"], + }, + etag="mock etag", + id="mock_syn_id") + + async def mock_success_coro(): + await asyncio.sleep(0.1) + return stored_annos + + with patch("schematic.store.synapse.SynapseStorage.store_async_annotation",new_callable=AsyncMock) as mock_store_async1: + tasks = set() + tasks.add(asyncio.create_task(mock_success_coro())) + await synapse_store._process_store_annos(tasks) + # make sure that the if statement is working + mock_store_async1.assert_not_called() + + + async def test_process_store_annos_success_get(self, synapse_store: SynapseStorage) -> None: + """test _process_store_annos function and make sure that task of storing annotations can be triggered + """ + # mock annotation obtained after get_async + mock_annos_dict = { + "annotations": { + "id": "mock_syn_id", + "etag": "mock etag", + "annotations": { + "Id": {"type": "STRING", "value": ["mock value"]}, + "EntityId": {"type": "STRING", "value": ["mock_syn_id"]}, + "SampleID": {"type": "STRING", "value": [""]}, + "Component": {"type": "STRING", "value": ["mock value"]}, + }, + }, + "FileFormat": "mock format", + "Component": "mock component", + "Id": "mock_string", + "EntityId": "mock_id", + } + + mock_stored_annos = Annotations( + annotations={ + "Id": ["mock_string"], + "EntityId": ["mock_syn_id"], + }, + etag="mock etag", + id="mock_syn_id") + + async def mock_success_coro(): + await asyncio.sleep(0.1) + return mock_annos_dict + + # make sure that the else statement is working + new_tasks = set() + with patch("schematic.store.synapse.SynapseStorage.store_async_annotation",new_callable=AsyncMock, return_value=mock_stored_annos) as mock_store_async2: + new_tasks.add(asyncio.create_task(mock_success_coro())) + await synapse_store._process_store_annos(new_tasks) + mock_store_async2.assert_called_once() + class TestDatasetFileView: def test_init(self, dataset_id, dataset_fileview, synapse_store): assert dataset_fileview.datasetId == dataset_id From 4addbab81d7b0efac7d9f53abb9d0eae071785cc Mon Sep 17 00:00:00 2001 From: linglp Date: Sat, 15 Jun 2024 22:04:05 -0400 Subject: [PATCH 087/110] fix test --- tests/test_store.py | 47 +++++++++++++++++++++++++++++---------------- 1 file changed, 30 insertions(+), 17 deletions(-) diff --git a/tests/test_store.py b/tests/test_store.py index def4b6d3b..d4adb36d1 100644 --- a/tests/test_store.py +++ b/tests/test_store.py @@ -1077,7 +1077,7 @@ class TestManifestUpload: ), ], ) - def test_add_annotations_to_entities_files( + async def test_add_annotations_to_entities_files( self, synapse_store: SynapseStorage, dmge: DataModelGraphExplorer, @@ -1097,27 +1097,40 @@ def test_add_annotations_to_entities_files( expected_filenames (list(str)): expected list of file names expected_entity_ids (list(str)): expected list of entity ids """ + async def mock_format_row_annos(): + await asyncio.sleep(0.1) + + async def mock_process_store_annos(requests): + await asyncio.sleep(0.1) + with patch( "schematic.store.synapse.SynapseStorage.getFilesInStorageDataset", return_value=files_in_dataset, ): - manifest_df = pd.DataFrame(original_manifest) + with patch('schematic.store.synapse.SynapseStorage.format_row_annotations', return_value=mock_format_row_annos, new_callable=AsyncMock) as mock_format_row: + with patch('schematic.store.synapse.SynapseStorage._process_store_annos', 
return_value=mock_process_store_annos, new_callable=AsyncMock) as mock_process_store: + manifest_df = pd.DataFrame(original_manifest) + + new_df = await synapse_store.add_annotations_to_entities_files( + dmge, + manifest_df, + manifest_record_type="entity", + datasetId="mock id", + hideBlanks=True, + ) - new_df = synapse_store.add_annotations_to_entities_files( - dmge, - manifest_df, - manifest_record_type="entity", - datasetId="mock id", - hideBlanks=True, - ) - file_names_lst = new_df["Filename"].tolist() - entity_ids_lst = new_df["entityId"].tolist() - - # test entityId and Id columns get added - assert "entityId" in new_df.columns - assert "Id" in new_df.columns - assert file_names_lst == expected_filenames - assert entity_ids_lst == expected_entity_ids + file_names_lst = new_df["Filename"].tolist() + entity_ids_lst = new_df["entityId"].tolist() + + # test entityId and Id columns get added + assert "entityId" in new_df.columns + assert "Id" in new_df.columns + assert file_names_lst == expected_filenames + assert entity_ids_lst == expected_entity_ids + + # make sure async function gets called as expected + assert mock_format_row.call_count == len(expected_entity_ids) + assert mock_process_store.call_count == 1 @pytest.mark.parametrize( "mock_manifest_file_path", From ff33bbfe719fd94be239bcbdebf38d60fc1280b2 Mon Sep 17 00:00:00 2001 From: linglp Date: Sat, 15 Jun 2024 22:35:40 -0400 Subject: [PATCH 088/110] remove add_annotation function --- schematic/store/synapse.py | 30 ------------------------------ 1 file changed, 30 deletions(-) diff --git a/schematic/store/synapse.py b/schematic/store/synapse.py index e6fcac7af..84ebb0696 100644 --- a/schematic/store/synapse.py +++ b/schematic/store/synapse.py @@ -1643,36 +1643,6 @@ def _generate_table_name(self, manifest): table_name = "synapse_storage_manifest_table" return table_name, component_name - def _add_annotations( - self, - dmge, - row, - entityId: str, - hideBlanks: bool, - annotation_keys: str, - ): - """Helper function to format and add annotations to entities in Synapse. - Args: - dmge: DataModelGraphExplorer object, - row: current row of manifest being processed - entityId (str): synapseId of entity to add annotations to - hideBlanks: Boolean flag that does not upload annotation keys with blank values when true. Uploads Annotation keys with empty string values when false. - annotation_keys: (str) display_label/class_label(default), Determines labeling syle for annotation keys. class_label will format the display - name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain - display label formatting while ensuring the label is formatted properly for Synapse annotations. - Returns: - Annotations are added to entities in Synapse, no return. - """ - # Format annotations for Synapse - annos = self.format_row_annotations( - dmge, row, entityId, hideBlanks, annotation_keys - ) - - if annos: - # Store annotations for an entity folder - self.syn.set_annotations(annos) - return - def _create_entity_id(self, idx, row, manifest, datasetId): """Helper function to generate an entityId and add it to the appropriate row in the manifest. 
Args: From f38c043453385d2ea777808ff59e7716443f0f47 Mon Sep 17 00:00:00 2001 From: linglp Date: Sun, 16 Jun 2024 19:44:05 -0400 Subject: [PATCH 089/110] update tracer name --- schematic/manifest/generator.py | 7 +++---- schematic/models/metadata.py | 3 ++- schematic/schemas/data_model_graph.py | 2 +- schematic/schemas/data_model_parser.py | 2 +- schematic/store/synapse.py | 4 ++-- schematic_api/api/routes.py | 16 +++++++--------- 6 files changed, 16 insertions(+), 18 deletions(-) diff --git a/schematic/manifest/generator.py b/schematic/manifest/generator.py index 77ddc085a..d7eb16c30 100644 --- a/schematic/manifest/generator.py +++ b/schematic/manifest/generator.py @@ -35,7 +35,7 @@ from opentelemetry import trace logger = logging.getLogger(__name__) -tracer = trace.get_tracer("generator::ManifestGenerator") +tracer = trace.get_tracer("Schematic") class ManifestGenerator(object): @@ -1575,8 +1575,7 @@ def _handle_output_format_logic( # Default return a DataFrame else: return dataframe - - + @staticmethod @tracer.start_as_current_span("ManifestGenerator::create_single_manifest") def create_single_manifest( @@ -1760,7 +1759,7 @@ def create_manifests( return result return all_results - + @tracer.start_as_current_span("ManifestGenerator::get_manifest") def get_manifest( self, diff --git a/schematic/models/metadata.py b/schematic/models/metadata.py index 76e2ee991..2747f81d7 100644 --- a/schematic/models/metadata.py +++ b/schematic/models/metadata.py @@ -23,7 +23,8 @@ logger = logging.getLogger(__name__) -tracer = trace.get_tracer("metadata::MetadataModel") +tracer = trace.get_tracer("Schematic") + class MetadataModel(object): """Metadata model wrapper around schema.org specification graph. diff --git a/schematic/schemas/data_model_graph.py b/schematic/schemas/data_model_graph.py index cc3b7dd94..38c1ad674 100644 --- a/schematic/schemas/data_model_graph.py +++ b/schematic/schemas/data_model_graph.py @@ -25,7 +25,7 @@ logger = logging.getLogger(__name__) -tracer = trace.get_tracer("schemas::DataModelGraph") +tracer = trace.get_tracer("Schematic") class DataModelGraphMeta: # pylint: disable=too-few-public-methods diff --git a/schematic/schemas/data_model_parser.py b/schematic/schemas/data_model_parser.py index cea8c59a5..b4db013bb 100644 --- a/schematic/schemas/data_model_parser.py +++ b/schematic/schemas/data_model_parser.py @@ -17,7 +17,7 @@ logger = logging.getLogger("Schemas") -tracer = trace.get_tracer("Schemas::DataModelParser") +tracer = trace.get_tracer("Schematic") class DataModelParser: diff --git a/schematic/store/synapse.py b/schematic/store/synapse.py index cc8d45b89..a78a64f20 100644 --- a/schematic/store/synapse.py +++ b/schematic/store/synapse.py @@ -72,7 +72,7 @@ logger = logging.getLogger("Synapse storage") -tracer = trace.get_tracer("store:SynapseStorage") +tracer = trace.get_tracer("Schematic") @dataclass @@ -244,7 +244,7 @@ def _purge_synapse_cache( self.syn.cache, minutes=minute_buffer ) logger.info( - f"{num_of_deleted_files} files have been deleted from {self.root_synapse_cache}" + f"{num_of_deleted_files} files have been d eleted from {self.root_synapse_cache}" ) else: # on AWS, OS takes around 14-17% of our ephemeral storage (20GiB) diff --git a/schematic_api/api/routes.py b/schematic_api/api/routes.py index 74d009c19..6671edde3 100644 --- a/schematic_api/api/routes.py +++ b/schematic_api/api/routes.py @@ -56,7 +56,7 @@ ) ) - +# borrowed from: https://github.com/Sage-Bionetworks/synapsePythonClient/blob/develop/tests/integration/conftest.py class 
FileSpanExporter(ConsoleSpanExporter): """Create an exporter for OTEL data to a file.""" @@ -72,11 +72,13 @@ def export(self, spans) -> None: f.write(span_json_one_line) trace.get_tracer_provider().add_span_processor(BatchSpanProcessor(OTLPSpanExporter())) -processor = SimpleSpanProcessor(FileSpanExporter("otel_spans_schemati_api.json")) -trace.get_tracer_provider().add_span_processor(processor) -tracer = trace.get_tracer("schematic-api") +# processor = SimpleSpanProcessor(FileSpanExporter("otel_spans_schemati_api.json")) +# trace.get_tracer_provider().add_span_processor(processor) +tracer = trace.get_tracer("Schematic") def trace_function_params(): + """capture all the parameters of API requests + """ def decorator(func): @wraps(func) def wrapper(**kwargs): @@ -93,7 +95,6 @@ def wrapper(**kwargs): return decorator - def config_handler(asset_view: str = None): # check if path to config is provided path_to_config = app.config["SCHEMATIC_CONFIG"] @@ -372,8 +373,7 @@ def get_manifest_route( return all_results -#####profile validate manifest route function -# @profile(sort_by='cumulative', strip_dirs=True) +@trace_function_params() def validate_manifest_route( schema_url, data_type, @@ -430,7 +430,6 @@ def validate_manifest_route( #####profile validate manifest route function -# @profile(sort_by='cumulative', strip_dirs=True) @trace_function_params() def submit_manifest_route( schema_url, @@ -726,7 +725,6 @@ def download_manifest(manifest_id, new_manifest_name="", as_json=True): return manifest_local_file_path -# @profile(sort_by='cumulative', strip_dirs=True) def download_dataset_manifest(dataset_id, asset_view, as_json, new_manifest_name=""): # Access token now stored in request header access_token = get_access_token() From 0e05a247d42521477769bc25d88c7ac87c875dd5 Mon Sep 17 00:00:00 2001 From: linglp Date: Sun, 16 Jun 2024 19:48:33 -0400 Subject: [PATCH 090/110] add typing --- schematic_api/api/routes.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/schematic_api/api/routes.py b/schematic_api/api/routes.py index 6671edde3..db84a999d 100644 --- a/schematic_api/api/routes.py +++ b/schematic_api/api/routes.py @@ -44,7 +44,7 @@ from opentelemetry import trace from opentelemetry.sdk.trace import TracerProvider from opentelemetry.sdk.resources import SERVICE_NAME, Resource -from opentelemetry.sdk.trace.export import BatchSpanProcessor, ConsoleSpanExporter, SimpleSpanProcessor +from opentelemetry.sdk.trace.export import BatchSpanProcessor, ConsoleSpanExporter, SimpleSpanProcessor, Span from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter logger = logging.getLogger(__name__) @@ -60,11 +60,11 @@ class FileSpanExporter(ConsoleSpanExporter): """Create an exporter for OTEL data to a file.""" - def __init__(self, file_path) -> None: + def __init__(self, file_path: str) -> None: """Init with a path.""" self.file_path = file_path - def export(self, spans) -> None: + def export(self, spans: List[Span]) -> None: """Export the spans to the file.""" with open(self.file_path, "a", encoding="utf-8") as f: for span in spans: @@ -81,7 +81,7 @@ def trace_function_params(): """ def decorator(func): @wraps(func) - def wrapper(**kwargs): + def wrapper(**kwargs: Any): tracer = trace.get_tracer(__name__) # Start a new span with the function's name with tracer.start_as_current_span(func.__name__) as span: From f7edf601cd6d337fbe042fa65aefd9157a2f4b4d Mon Sep 17 00:00:00 2001 From: linglp Date: Sun, 16 Jun 2024 19:50:41 -0400 Subject: [PATCH 091/110] modify import 
--- schematic_api/api/routes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/schematic_api/api/routes.py b/schematic_api/api/routes.py index db84a999d..5b35a9982 100644 --- a/schematic_api/api/routes.py +++ b/schematic_api/api/routes.py @@ -19,7 +19,7 @@ import pandas as pd import json -from typing import Optional +from typing import Optional, List, Any from functools import wraps from schematic.configuration.configuration import CONFIG From 514ead6cdd006464a907bc06088b3950d73c3dd8 Mon Sep 17 00:00:00 2001 From: linglp Date: Sun, 16 Jun 2024 19:52:59 -0400 Subject: [PATCH 092/110] add comment --- schematic_api/api/routes.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/schematic_api/api/routes.py b/schematic_api/api/routes.py index 5b35a9982..c05604e86 100644 --- a/schematic_api/api/routes.py +++ b/schematic_api/api/routes.py @@ -80,8 +80,12 @@ def trace_function_params(): """capture all the parameters of API requests """ def decorator(func): + """create a decorator + """ @wraps(func) def wrapper(**kwargs: Any): + """create a wrapper function + """ tracer = trace.get_tracer(__name__) # Start a new span with the function's name with tracer.start_as_current_span(func.__name__) as span: From b113aa5544f0d3055f70fc05d4518ab41dbb8211 Mon Sep 17 00:00:00 2001 From: linglp Date: Mon, 17 Jun 2024 11:06:17 -0400 Subject: [PATCH 093/110] change import location --- schematic/schemas/data_model_graph.py | 2 +- schematic/schemas/data_model_parser.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/schematic/schemas/data_model_graph.py b/schematic/schemas/data_model_graph.py index 38c1ad674..4aae01a22 100644 --- a/schematic/schemas/data_model_graph.py +++ b/schematic/schemas/data_model_graph.py @@ -5,6 +5,7 @@ import networkx as nx # type: ignore import graphviz # type: ignore +from opentelemetry import trace from schematic.schemas.data_model_edges import DataModelEdges from schematic.schemas.data_model_nodes import DataModelNodes @@ -19,7 +20,6 @@ from schematic.utils.general import unlist from schematic.utils.viz_utils import visualize from schematic.utils.validate_utils import rule_in_rule_list -from opentelemetry import trace logger = logging.getLogger(__name__) diff --git a/schematic/schemas/data_model_parser.py b/schematic/schemas/data_model_parser.py index b4db013bb..0da26e933 100644 --- a/schematic/schemas/data_model_parser.py +++ b/schematic/schemas/data_model_parser.py @@ -5,6 +5,7 @@ import logging import pandas as pd +from opentelemetry import trace from schematic.utils.df_utils import load_df from schematic.utils.io_utils import load_json @@ -13,7 +14,6 @@ from schematic.schemas.data_model_relationships import DataModelRelationships from schematic import LOADER -from opentelemetry import trace logger = logging.getLogger("Schemas") From 7f4f2c170b21ba424bc4937d61b0c65ac5d2aea0 Mon Sep 17 00:00:00 2001 From: linglp Date: Mon, 17 Jun 2024 15:10:28 -0400 Subject: [PATCH 094/110] black schematic-api --- .pre-commit-config.yaml | 2 +- schematic_api/api/__main__.py | 20 ++++++++++++++++++-- schematic_api/api/routes.py | 13 +++++++++---- schematic_api/api/security_controller_.py | 2 +- 4 files changed, 29 insertions(+), 8 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 623446ced..e4b90e42e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -10,4 +10,4 @@ repos: # pre-commit's default_language_version, see # https://pre-commit.com/#top_level-default_language_version language_version: python3.10 - 
files: schematic/ \ No newline at end of file + files: ^(tests|schematic|schematic_api)/ \ No newline at end of file diff --git a/schematic_api/api/__main__.py b/schematic_api/api/__main__.py index 923cebaf5..316ce4e59 100644 --- a/schematic_api/api/__main__.py +++ b/schematic_api/api/__main__.py @@ -1,8 +1,23 @@ import os from schematic_api.api import app +import traceback +import jsonify -def main(): +@app.errorhandler(Exception) +def handle_exception(e): + # Get the last line of the traceback + last_line = traceback.format_exc().strip().split("\n")[-1] + + # Log the full traceback (optional) + app.logger.error(traceback.format_exc()) + + # Return a JSON response with the last line of the error + response = {"status": "error", "message": last_line} + return jsonify(response), 500 + + +def main(): # Get app configuration host = os.environ.get("APP_HOST", "0.0.0.0") port = os.environ.get("APP_PORT", "3001") @@ -11,5 +26,6 @@ def main(): # Launch app app.run(host=host, port=port, debug=False) + if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/schematic_api/api/routes.py b/schematic_api/api/routes.py index fbf36fbf5..9bc53ff5a 100644 --- a/schematic_api/api/routes.py +++ b/schematic_api/api/routes.py @@ -39,7 +39,10 @@ SynapseTimeoutError, ) from schematic.utils.general import entity_type_mapping -from schematic.utils.schema_utils import get_property_label_from_display_name, DisplayLabelType +from schematic.utils.schema_utils import ( + get_property_label_from_display_name, + DisplayLabelType, +) logger = logging.getLogger(__name__) logging.basicConfig(level=logging.DEBUG) @@ -210,6 +213,7 @@ def save_file(file_key="csv_file"): return temp_path + def initalize_metadata_model(schema_url, data_model_labels): # get path to temp data model file (csv or jsonld) as appropriate data_model = get_temp_model_path(schema_url) @@ -393,7 +397,7 @@ def submit_manifest_route( project_scope=None, table_column_names=None, annotation_keys=None, - file_annotations_upload:bool=True, + file_annotations_upload: bool = True, ): # call config_handler() config_handler(asset_view=asset_view) @@ -450,7 +454,7 @@ def submit_manifest_route( project_scope=project_scope, table_column_names=table_column_names, annotation_keys=annotation_keys, - file_annotations_upload=file_annotations_upload + file_annotations_upload=file_annotations_upload, ) return manifest_id @@ -729,6 +733,7 @@ def get_asset_view_table(asset_view, return_type): file_view_table_df.to_csv(export_path, index=False) return export_path + def get_project_manifests(project_id, asset_view): # Access token now stored in request header access_token = get_access_token() @@ -1022,4 +1027,4 @@ def get_schematic_version() -> str: raise NotImplementedError( "Using this endpoint to check the version of schematic is only supported when the API is running in a docker container." 
) - return version \ No newline at end of file + return version diff --git a/schematic_api/api/security_controller_.py b/schematic_api/api/security_controller_.py index ee336dcb0..fbde596bb 100644 --- a/schematic_api/api/security_controller_.py +++ b/schematic_api/api/security_controller_.py @@ -11,4 +11,4 @@ def info_from_bearerAuth(token): :return: Decoded token information or None if token is invalid :rtype: dict | None """ - return {"uid": "user_id"} \ No newline at end of file + return {"uid": "user_id"} From 8218fbabc702cfa209c6d966241d5138047ed595 Mon Sep 17 00:00:00 2001 From: linglp Date: Mon, 17 Jun 2024 15:11:24 -0400 Subject: [PATCH 095/110] run black for all the tests --- tests/conftest.py | 11 +- tests/test_api.py | 21 +- tests/test_manifest.py | 129 +++++++--- tests/test_metadata.py | 6 +- tests/test_schemas.py | 13 +- tests/test_store.py | 19 +- tests/test_utils.py | 80 +++--- tests/test_validation.py | 536 ++++++++++++++++++++++++++------------- 8 files changed, 551 insertions(+), 264 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 8d73650ef..62c2cb3e3 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -128,19 +128,22 @@ def synapse_store(request): # These fixtures make copies of existing test manifests. -# These copies can the be altered by a given test, and the copy will eb destroyed at the +# These copies can the be altered by a given test, and the copy will eb destroyed at the # end of the test + @pytest.fixture(scope="function") def temporary_file_copy(request, helpers: Helpers) -> Generator[str, None, None]: file_name = request.param # original file copy original_test_path = helpers.get_data_path(f"mock_manifests/{file_name}") # get filename without extension - file_name_no_extension=file_name.split(".")[0] + file_name_no_extension = file_name.split(".")[0] # Copy the original CSV file to a temporary directory - temp_csv_path = helpers.get_data_path(f"mock_manifests/{file_name_no_extension}_copy.csv") - + temp_csv_path = helpers.get_data_path( + f"mock_manifests/{file_name_no_extension}_copy.csv" + ) + shutil.copyfile(original_test_path, temp_csv_path) yield temp_csv_path # Teardown diff --git a/tests/test_api.py b/tests/test_api.py index 15c6786e7..97183186f 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -78,7 +78,7 @@ def test_manifest_json(helpers): @pytest.fixture(scope="class") def data_model_jsonld(): - data_model_jsonld ="https://raw.githubusercontent.com/Sage-Bionetworks/schematic/develop/tests/data/example.model.jsonld" + data_model_jsonld = "https://raw.githubusercontent.com/Sage-Bionetworks/schematic/develop/tests/data/example.model.jsonld" yield data_model_jsonld @@ -143,7 +143,7 @@ class TestSynapseStorage: def test_invalid_authentication(self, client, request_invalid_headers): response = client.get( "http://localhost:3001/v1/storage/assets/tables", - query_string = {"asset_view": "syn23643253", "return_type": "csv"}, + query_string={"asset_view": "syn23643253", "return_type": "csv"}, headers=request_invalid_headers, ) assert response.status_code == 401 @@ -151,7 +151,7 @@ def test_invalid_authentication(self, client, request_invalid_headers): def test_insufficent_auth(self, client, request_headers): response = client.get( "http://localhost:3001/v1/storage/assets/tables", - query_string = {"asset_view": "syn23643252", "return_type": "csv"}, + query_string={"asset_view": "syn23643252", "return_type": "csv"}, headers=request_headers, ) assert response.status_code == 403 @@ -370,8 +370,7 @@ def 
test_get_property_label_from_display_name(self, client, strict_camel_case): @pytest.mark.schematic_api class TestDataModelGraphExplorerOperation: def test_get_schema(self, client, data_model_jsonld): - params = {"schema_url": data_model_jsonld, - "data_model_labels": 'class_label'} + params = {"schema_url": data_model_jsonld, "data_model_labels": "class_label"} response = client.get( "http://localhost:3001/v1/schemas/get/schema", query_string=params ) @@ -385,7 +384,11 @@ def test_get_schema(self, client, data_model_jsonld): os.remove(response_dt) def test_if_node_required(test, client, data_model_jsonld): - params = {"schema_url": data_model_jsonld, "node_display_name": "FamilyHistory", "data_model_labels": "class_label"} + params = { + "schema_url": data_model_jsonld, + "node_display_name": "FamilyHistory", + "data_model_labels": "class_label", + } response = client.get( "http://localhost:3001/v1/schemas/is_node_required", query_string=params @@ -1121,7 +1124,11 @@ def test_submit_manifest_file_only_replace( elif python_version == "3.9": dataset_id = "syn52656104" - specific_params = {"asset_view": "syn23643253", "dataset_id": dataset_id, "project_scope":["syn54126707"]} + specific_params = { + "asset_view": "syn23643253", + "dataset_id": dataset_id, + "project_scope": ["syn54126707"], + } params.update(specific_params) diff --git a/tests/test_manifest.py b/tests/test_manifest.py index 0525b6c6a..da88dda95 100644 --- a/tests/test_manifest.py +++ b/tests/test_manifest.py @@ -61,7 +61,9 @@ def manifest_generator(helpers, request): # Get graph data model graph_data_model = generate_graph_data_model( - helpers, path_to_data_model=path_to_data_model, data_model_labels='class_label', + helpers, + path_to_data_model=path_to_data_model, + data_model_labels="class_label", ) manifest_generator = ManifestGenerator( @@ -111,18 +113,22 @@ def manifest(dataset_id, manifest_generator, request): yield manifest, use_annotations, data_type, sheet_url + @pytest.fixture(scope="class") def app(): app = create_app() yield app + class TestManifestGenerator: def test_init(self, helpers): path_to_data_model = helpers.get_data_path("example.model.jsonld") # Get graph data model graph_data_model = generate_graph_data_model( - helpers, path_to_data_model=path_to_data_model, data_model_labels='class_label', + helpers, + path_to_data_model=path_to_data_model, + data_model_labels="class_label", ) generator = ManifestGenerator( @@ -157,7 +163,9 @@ def test_missing_root_error(self, helpers, data_type, exc, exc_message): # Get graph data model graph_data_model = generate_graph_data_model( - helpers, path_to_data_model=path_to_data_model, data_model_labels='class_label', + helpers, + path_to_data_model=path_to_data_model, + data_model_labels="class_label", ) # A LookupError should be raised and include message when the component cannot be found @@ -242,7 +250,9 @@ def test_get_manifest_excel(self, helpers, sheet_url, output_format, dataset_id) # Get graph data model graph_data_model = generate_graph_data_model( - helpers, path_to_data_model=path_to_data_model, data_model_labels='class_label', + helpers, + path_to_data_model=path_to_data_model, + data_model_labels="class_label", ) generator = ManifestGenerator( @@ -300,7 +310,9 @@ def test_get_manifest_no_annos(self, helpers, dataset_id): # Get graph data model graph_data_model = generate_graph_data_model( - helpers, path_to_data_model=path_to_data_model, data_model_labels='class_label', + helpers, + path_to_data_model=path_to_data_model, + data_model_labels="class_label", 
) # Instantiate object with use_annotations set to True @@ -416,7 +428,9 @@ def test_add_root_to_component_without_additional_metadata( # Get graph data model graph_data_model = generate_graph_data_model( - helpers, path_to_data_model=path_to_data_model, data_model_labels='class_label', + helpers, + path_to_data_model=path_to_data_model, + data_model_labels="class_label", ) manifest_generator = ManifestGenerator( @@ -453,7 +467,9 @@ def test_add_root_to_component_with_additional_metadata( # Get graph data model graph_data_model = generate_graph_data_model( - helpers, path_to_data_model=path_to_data_model, data_model_labels='class_label', + helpers, + path_to_data_model=path_to_data_model, + data_model_labels="class_label", ) manifest_generator = ManifestGenerator( @@ -537,7 +553,9 @@ def test_update_dataframe_with_existing_df(self, helpers, existing_manifest): # Get graph data model graph_data_model = generate_graph_data_model( - helpers, path_to_data_model=path_to_data_model, data_model_labels='class_label', + helpers, + path_to_data_model=path_to_data_model, + data_model_labels="class_label", ) # Instantiate the Manifest Generator. @@ -661,34 +679,85 @@ def test_populate_existing_excel_spreadsheet( # remove file os.remove(dummy_output_path) - - @pytest.mark.parametrize("return_output", ["Mock excel file path", "Mock google sheet link"]) - def test_create_single_manifest(self, simple_manifest_generator, helpers, return_output): - with patch("schematic.manifest.generator.ManifestGenerator.get_manifest", return_value=return_output): + + @pytest.mark.parametrize( + "return_output", ["Mock excel file path", "Mock google sheet link"] + ) + def test_create_single_manifest( + self, simple_manifest_generator, helpers, return_output + ): + with patch( + "schematic.manifest.generator.ManifestGenerator.get_manifest", + return_value=return_output, + ): json_ld_path = helpers.get_data_path("example.model.jsonld") data_type = "Patient" - graph_data_model = generate_graph_data_model(helpers, path_to_data_model=json_ld_path, data_model_labels='class_label') + graph_data_model = generate_graph_data_model( + helpers, + path_to_data_model=json_ld_path, + data_model_labels="class_label", + ) - result = simple_manifest_generator.create_single_manifest(path_to_data_model=json_ld_path, graph_data_model=graph_data_model, data_type=data_type, output_format="google_sheet", use_annotations=False) + result = simple_manifest_generator.create_single_manifest( + path_to_data_model=json_ld_path, + graph_data_model=graph_data_model, + data_type=data_type, + output_format="google_sheet", + use_annotations=False, + ) assert result == return_output - - @pytest.mark.parametrize("test_data_types", [["Patient", "Biospecimen"], ["all manifests"]]) - def test_create_manifests_raise_errors(self, simple_manifest_generator, helpers, test_data_types): - with pytest.raises(ValueError) as exception_info: + + @pytest.mark.parametrize( + "test_data_types", [["Patient", "Biospecimen"], ["all manifests"]] + ) + def test_create_manifests_raise_errors( + self, simple_manifest_generator, helpers, test_data_types + ): + with pytest.raises(ValueError) as exception_info: json_ld_path = helpers.get_data_path("example.model.jsonld") data_types = test_data_types - dataset_ids=["syn123456"] - - simple_manifest_generator.create_manifests(path_to_data_model=json_ld_path, data_types=data_types, dataset_ids=dataset_ids, output_format="google_sheet", use_annotations=False, data_model_labels='class_label') - - @pytest.mark.parametrize("test_data_types, 
dataset_ids, expected_result", [ - (["Patient", "Biospecimen"], ["mock dataset id1", "mock dataset id2"], ["mock google sheet link", "mock google sheet link"]), - (["Patient"], ["mock dataset id1"], ["mock google sheet link"]), - ]) - def test_create_manifests(self, simple_manifest_generator, helpers, test_data_types, dataset_ids, expected_result): - with patch("schematic.manifest.generator.ManifestGenerator.create_single_manifest", return_value="mock google sheet link"): + dataset_ids = ["syn123456"] + + simple_manifest_generator.create_manifests( + path_to_data_model=json_ld_path, + data_types=data_types, + dataset_ids=dataset_ids, + output_format="google_sheet", + use_annotations=False, + data_model_labels="class_label", + ) + + @pytest.mark.parametrize( + "test_data_types, dataset_ids, expected_result", + [ + ( + ["Patient", "Biospecimen"], + ["mock dataset id1", "mock dataset id2"], + ["mock google sheet link", "mock google sheet link"], + ), + (["Patient"], ["mock dataset id1"], ["mock google sheet link"]), + ], + ) + def test_create_manifests( + self, + simple_manifest_generator, + helpers, + test_data_types, + dataset_ids, + expected_result, + ): + with patch( + "schematic.manifest.generator.ManifestGenerator.create_single_manifest", + return_value="mock google sheet link", + ): json_ld_path = helpers.get_data_path("example.model.jsonld") - all_results = simple_manifest_generator.create_manifests(path_to_data_model=json_ld_path, data_types=test_data_types, dataset_ids=dataset_ids, output_format="google_sheet", use_annotations=False, data_model_labels='class_label') + all_results = simple_manifest_generator.create_manifests( + path_to_data_model=json_ld_path, + data_types=test_data_types, + dataset_ids=dataset_ids, + output_format="google_sheet", + use_annotations=False, + data_model_labels="class_label", + ) assert all_results == expected_result - diff --git a/tests/test_metadata.py b/tests/test_metadata.py index 8a2c2e965..bf0c4d97b 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -109,10 +109,12 @@ def test_populate_manifest(self, helpers, return_excel, data_model_labels): ids=["data_model_labels-display_label", "data_model_labels-class_label"], ) @pytest.mark.parametrize("validate_component", [None, "BulkRNA-seqAssay"]) - @pytest.mark.parametrize("temporary_file_copy", ["test_BulkRNAseq.csv"], indirect=True) + @pytest.mark.parametrize( + "temporary_file_copy", ["test_BulkRNAseq.csv"], indirect=True + ) def test_submit_metadata_manifest( self, - temporary_file_copy: Generator[str, None, None], + temporary_file_copy: Generator[str, None, None], helpers: Helpers, file_annotations_upload: bool, restrict_rules: bool, diff --git a/tests/test_schemas.py b/tests/test_schemas.py index 61479a3e8..f80449b18 100644 --- a/tests/test_schemas.py +++ b/tests/test_schemas.py @@ -20,7 +20,7 @@ convert_bool_to_str, parse_validation_rules, DisplayLabelType, - get_json_schema_log_file_path + get_json_schema_log_file_path, ) from schematic.utils.io_utils import load_json @@ -448,9 +448,12 @@ def test_generate_data_model_graph(self, helpers, data_model, data_model_labels) # Check that all relationships recorded between 'CheckList' and 'Ab' are present assert ( - "rangeValue" and "parentOf" in graph["CheckListEnum"][expected_valid_values[0]] + "rangeValue" + and "parentOf" in graph["CheckListEnum"][expected_valid_values[0]] + ) + assert ( + "requiresDependency" not in graph["CheckListEnum"][expected_valid_values[0]] ) - assert "requiresDependency" not in 
graph["CheckListEnum"][expected_valid_values[0]] # Check nodes: assert "Patient" in graph.nodes @@ -1325,8 +1328,8 @@ def test_get_json_validation_schema( data_model_path = helpers.get_data_path(path=data_model) json_schema_log_file_path = get_json_schema_log_file_path( - data_model_path=data_model_path, - source_node=source_node) + data_model_path=data_model_path, source_node=source_node + ) # Remove json schema log file if it already exists. if os.path.exists(json_schema_log_file_path): diff --git a/tests/test_store.py b/tests/test_store.py index 06fa4bf23..7a162c6e1 100644 --- a/tests/test_store.py +++ b/tests/test_store.py @@ -22,11 +22,7 @@ from tests.conftest import Helpers from schematic.store.base import BaseStorage -from schematic.store.synapse import ( - DatasetFileView, - ManifestDownload, - SynapseStorage -) +from schematic.store.synapse import DatasetFileView, ManifestDownload, SynapseStorage from schematic.utils.general import check_synapse_cache_size logging.basicConfig(level=logging.DEBUG) @@ -131,7 +127,7 @@ def test_init(self): class TestSynapseStorage: "Tests the SynapseStorage class" - def test_init(self, synapse_store:SynapseStorage) -> None: + def test_init(self, synapse_store: SynapseStorage) -> None: """Tests SynapseStorage.__init__""" assert synapse_store.storageFileview == "syn23643253" assert isinstance(synapse_store.storageFileviewTable, pd.DataFrame) @@ -142,8 +138,7 @@ def test__purge_synapse_cache(self) -> None: synapse_store = SynapseStorage(synapse_cache_path="test_cache_dir") size_before_purge = check_synapse_cache_size(synapse_store.root_synapse_cache) synapse_store._purge_synapse_cache( - maximum_storage_allowed_cache_gb=0.000001, - minute_buffer=0 + maximum_storage_allowed_cache_gb=0.000001, minute_buffer=0 ) size_after_purge = check_synapse_cache_size(synapse_store.root_synapse_cache) assert size_before_purge > size_after_purge @@ -157,7 +152,7 @@ def test_login(self) -> None: assert synapse_client.cache.cache_root_dir == "test_cache_dir" shutil.rmtree("test_cache_dir") - def test_getFileAnnotations(self, synapse_store:SynapseStorage) -> None: + def test_getFileAnnotations(self, synapse_store: SynapseStorage) -> None: expected_dict = { "author": "bruno, milen, sujay", "impact": "42.9", @@ -220,17 +215,17 @@ def test_get_file_entityIds(self, helpers, synapse_store, only_new_files): {"CheckInt": "7", "CheckList": "valid, list, values"}, "syn34295552", "file_and_entities", - "annotations_test_manifest.csv" + "annotations_test_manifest.csv", ), ( {"FileFormat": "BAM", "GenomeBuild": "GRCh38"}, "syn39241199", "table_and_file", - "test_BulkRNAseq.csv" + "test_BulkRNAseq.csv", ), ], ids=["non file-based", "file-based"], - indirect=["temporary_file_copy"] + indirect=["temporary_file_copy"], ) def test_annotation_submission( self, diff --git a/tests/test_utils.py b/tests/test_utils.py index 1ff72d673..5b37abe6e 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -11,6 +11,7 @@ from typing import Union, Generator from _pytest.fixtures import FixtureRequest + import numpy as np import pandas as pd import pytest @@ -196,7 +197,8 @@ (1073741825, 1073741824, 1181116006.4), ] -def get_metadataModel(helpers, model_name:str): + +def get_metadataModel(helpers, model_name: str): metadataModel = MetadataModel( inputMModelLocation=helpers.get_data_path(model_name), inputMModelLocationType="local", @@ -1025,26 +1027,27 @@ def test_get_label_from_display_name(self, test_dn: str, data_model_labels: str) return @pytest.mark.parametrize( - "data_model", - 
list(DATA_MODEL_DICT.keys()), - ids=list(DATA_MODEL_DICT.values()) + "data_model", list(DATA_MODEL_DICT.keys()), ids=list(DATA_MODEL_DICT.values()) ) @pytest.mark.parametrize( "source_node", ["Biospecimen", "Patient"], ids=["biospecimen_source", "patient_source"], ) - def test_get_json_schema_log_file_path(self, helpers, data_model:str, source_node: str): + def test_get_json_schema_log_file_path( + self, helpers, data_model: str, source_node: str + ): data_model_path = helpers.get_data_path(path=data_model) json_schema_log_file_path = get_json_schema_log_file_path( - data_model_path=data_model_path, - source_node=source_node) + data_model_path=data_model_path, source_node=source_node + ) # Check that model is not included in the json_schema_log_file_path - assert '.model' not in "data_model" + assert ".model" not in "data_model" # Check the file suffixs are what is expected. - assert ['schema', 'json'] == json_schema_log_file_path.split('.')[-2:] + assert ["schema", "json"] == json_schema_log_file_path.split(".")[-2:] + class TestValidateUtils: def test_validate_schema(self, helpers): @@ -1098,13 +1101,22 @@ def test_validate_property_schema(self, helpers): @pytest.mark.parametrize( ("manifest", "model", "root_node"), - [("mock_manifests/Patient_test_no_entry_for_cond_required_column.manifest.csv", - "example.model.csv", "Patient"), - ("mock_manifests/Valid_Test_Manifest_with_nones.csv", - "example_test_nones.model.csv", "MockComponent")] - ) + [ + ( + "mock_manifests/Patient_test_no_entry_for_cond_required_column.manifest.csv", + "example.model.csv", + "Patient", + ), + ( + "mock_manifests/Valid_Test_Manifest_with_nones.csv", + "example_test_nones.model.csv", + "MockComponent", + ), + ], + ) def test_convert_nan_entries_to_empty_strings( - self, helpers, manifest, model, root_node): + self, helpers, manifest, model, root_node + ): # Get manifest and data model path manifest_path = helpers.get_data_path(manifest) model_path = helpers.get_data_path(model) @@ -1128,37 +1140,37 @@ def test_convert_nan_entries_to_empty_strings( manifest_path, preserve_raw_input=False, allow_na_values=True, - **load_args,) + **load_args, + ) metadataModel = get_metadataModel(helpers, model) # Instantiate Validate manifest, and run manifest validation - # In this step the manifest is modified while running rule + # In this step the manifest is modified while running rule # validation so need to do this step to get the updated manfest. 
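         # Roughly (an illustrative sketch, not the actual implementation), the conversion
         # checked further down is expected to turn list-like cells whose only entries are
         # NaN into [""] while leaving scalar pd.NA values untouched, e.g.:
         #   if isinstance(cell, list) and all(pd.isna(v) for v in cell):
         #       cell = [""]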
- vm = ValidateManifest( - errors, manifest, manifest_path, dmge, json_schema) + vm = ValidateManifest(errors, manifest, manifest_path, dmge, json_schema) manifest, vmr_errors, vmr_warnings = vm.validate_manifest_rules( - manifest, dmge, restrict_rules=False, project_scope=["syn54126707"], + manifest, + dmge, + restrict_rules=False, + project_scope=["syn54126707"], ) # Run convert nan function - output = validate_utils.convert_nan_entries_to_empty_strings( - manifest=manifest - ) + output = validate_utils.convert_nan_entries_to_empty_strings(manifest=manifest) # Compare post rule validation manifest with output manifest looking # for expected nan to empty string conversion - if root_node == 'Patient': - assert manifest['Family History'][0] == [''] - assert output['Family History'][0] == [''] - elif root_node == 'MockComponent': - assert manifest['Check List'][2] == [''] - assert manifest['Check List Like Enum'][2] == [] - assert type(manifest['Check NA'][2]) == type(pd.NA) - - assert output['Check List'][2] == [''] - assert output['Check List Like Enum'][2] == [] - + if root_node == "Patient": + assert manifest["Family History"][0] == [""] + assert output["Family History"][0] == [""] + elif root_node == "MockComponent": + assert manifest["Check List"][2] == [""] + assert manifest["Check List Like Enum"][2] == [] + assert type(manifest["Check NA"][2]) == type(pd.NA) + + assert output["Check List"][2] == [""] + assert output["Check List Like Enum"][2] == [] def test_get_list_robustness(self, helpers): return diff --git a/tests/test_validation.py b/tests/test_validation.py index b2b85851d..9ea47b973 100644 --- a/tests/test_validation.py +++ b/tests/test_validation.py @@ -27,7 +27,8 @@ def DMGE(helpers): dmge = helpers.get_data_model_graph_explorer(path="example.model.jsonld") yield dmge -def get_metadataModel(helpers, model_name:str): + +def get_metadataModel(helpers, model_name: str): metadataModel = MetadataModel( inputMModelLocation=helpers.get_data_path(model_name), inputMModelLocationType="local", @@ -55,20 +56,47 @@ class TestManifestValidation: @pytest.mark.parametrize( ("model_name", "manifest_name", "root_node"), [ - ("example.model.csv","mock_manifests/Valid_Test_Manifest.csv", "MockComponent"), - ("example.model.csv", "mock_manifests/Patient_test_no_entry_for_cond_required_column.manifest.csv", "Patient"), - ("example_test_nones.model.csv","mock_manifests/Valid_Test_Manifest_with_nones.csv", "MockComponent"), + ( + "example.model.csv", + "mock_manifests/Valid_Test_Manifest.csv", + "MockComponent", + ), + ( + "example.model.csv", + "mock_manifests/Patient_test_no_entry_for_cond_required_column.manifest.csv", + "Patient", + ), + ( + "example_test_nones.model.csv", + "mock_manifests/Valid_Test_Manifest_with_nones.csv", + "MockComponent", + ), + ], + ids=[ + "example_model", + "example_with_no_entry_for_cond_required_columns", + "example_with_nones", ], - ids=["example_model", "example_with_no_entry_for_cond_required_columns", "example_with_nones"], ) @pytest.mark.parametrize( "project_scope", ["syn54126707", "syn55250368", "syn55271234"], - ids=["project_scope_with_manifests", "project_scope_without_manifests", "project_scope_with_empty_manifest"], + ids=[ + "project_scope_with_manifests", + "project_scope_without_manifests", + "project_scope_with_empty_manifest", + ], ) - def test_valid_manifest(self, helpers, model_name:str, manifest_name:str, - root_node:str, project_scope:str, dmge:DataModelGraph): - """ Run the valid manifest in various situations, some of which will generate 
errors or warnings, + def test_valid_manifest( + self, + helpers, + model_name: str, + manifest_name: str, + root_node: str, + project_scope: str, + dmge: DataModelGraph, + ): + """Run the valid manifest in various situations, some of which will generate errors or warnings, if there are "issues" with target manifests on manifests. Since there are so many parameters, limit the combinations that are being run to the ones that are relevant. Args: @@ -90,16 +118,28 @@ def test_valid_manifest(self, helpers, model_name:str, manifest_name:str, manifest_path = helpers.get_data_path(manifest_name) warning_rule_sets_1 = [ - ('Check Match at Least', 'matchAtLeastOne Patient.PatientID set'), - ('Check Match at Least values', 'matchAtLeastOne MockComponent.checkMatchatLeastvalues value'), - ('Check Match Exactly', 'matchExactlyOne MockComponent.checkMatchExactly set'), - ('Check Match Exactly values', 'matchExactlyOne MockComponent.checkMatchExactlyvalues value'), - ] + ("Check Match at Least", "matchAtLeastOne Patient.PatientID set"), + ( + "Check Match at Least values", + "matchAtLeastOne MockComponent.checkMatchatLeastvalues value", + ), + ( + "Check Match Exactly", + "matchExactlyOne MockComponent.checkMatchExactly set", + ), + ( + "Check Match Exactly values", + "matchExactlyOne MockComponent.checkMatchExactlyvalues value", + ), + ] warning_rule_sets_2 = warning_rule_sets_1[1:] error_rule_sets = [ - ('Check Match None', 'matchNone MockComponent.checkMatchNone set error'), - ('Check Match None values', 'matchNone MockComponent.checkMatchNonevalues value error'), - ] + ("Check Match None", "matchNone MockComponent.checkMatchNone set error"), + ( + "Check Match None values", + "matchNone MockComponent.checkMatchNonevalues value error", + ), + ] # For the standard project scope, models and manifest should pass without warnings or errors if project_scope == "syn54126707": @@ -113,25 +153,34 @@ def test_valid_manifest(self, helpers, model_name:str, manifest_name:str, # When submitting the first manifest for cross manifest validation (MockComponent), check that proper warning # (to alert users that no validation will be run), is raised. The manifest is still valid to submit. - if (project_scope == "syn55250368" and root_node=="MockComponent" and - model_name in ["example.model.csv", "example_test_nones.model.csv"]): + if ( + project_scope == "syn55250368" + and root_node == "MockComponent" + and model_name in ["example.model.csv", "example_test_nones.model.csv"] + ): metadataModel = get_metadataModel(helpers, model_name) errors, warnings = metadataModel.validateModelManifest( manifestPath=manifest_path, rootNode=root_node, project_scope=[project_scope], ) - + for attribute_name, val_rule in warning_rule_sets_1: - assert GenerateError.generate_no_cross_warning( - dmge=dmge, - attribute_name=attribute_name, - val_rule=val_rule)[0] in warnings + assert ( + GenerateError.generate_no_cross_warning( + dmge=dmge, attribute_name=attribute_name, val_rule=val_rule + )[0] + in warnings + ) assert errors == [] - + # When submitting a manifest to a project that contains a manifest without data, ensure that the proper # warnings/errors are raised. 
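         # generate_no_value_in_manifest_error returns an (errors, warnings) pair of lists,
         # so the assertions below index [0][0] for the first error and [1][0] for the
         # first warning when comparing against the validateModelManifest output.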
- elif project_scope == "syn55271234" and root_node=="MockComponent" and model_name == "example.model.csv": + elif ( + project_scope == "syn55271234" + and root_node == "MockComponent" + and model_name == "example.model.csv" + ): metadataModel = get_metadataModel(helpers, model_name) errors, warnings = metadataModel.validateModelManifest( manifestPath=manifest_path, @@ -139,21 +188,24 @@ def test_valid_manifest(self, helpers, model_name:str, manifest_name:str, project_scope=[project_scope], ) for attribute_name, val_rule in warning_rule_sets_2: - assert GenerateError.generate_no_value_in_manifest_error( - dmge=dmge, - attribute_name=attribute_name, - val_rule=val_rule)[1][0] in warnings - - for attribute_name, val_rule in error_rule_sets: - assert GenerateError.generate_no_value_in_manifest_error( - dmge=dmge, - attribute_name=attribute_name, - val_rule=val_rule)[0][0] in errors + assert ( + GenerateError.generate_no_value_in_manifest_error( + dmge=dmge, attribute_name=attribute_name, val_rule=val_rule + )[1][0] + in warnings + ) + for attribute_name, val_rule in error_rule_sets: + assert ( + GenerateError.generate_no_value_in_manifest_error( + dmge=dmge, attribute_name=attribute_name, val_rule=val_rule + )[0][0] + in errors + ) def test_invalid_manifest(self, helpers, dmge): metadataModel = get_metadataModel(helpers, model_name="example.model.jsonld") - + manifestPath = helpers.get_data_path("mock_manifests/Invalid_Test_Manifest.csv") rootNode = "MockComponent" @@ -164,31 +216,41 @@ def test_invalid_manifest(self, helpers, dmge): ) # Check errors - assert GenerateError.generate_type_error( + assert ( + GenerateError.generate_type_error( val_rule="num", row_num="3", attribute_name="Check Num", invalid_entry="c", dmge=dmge, - )[0] in errors + )[0] + in errors + ) - assert GenerateError.generate_type_error( + assert ( + GenerateError.generate_type_error( val_rule="int", row_num="3", attribute_name="Check Int", invalid_entry="5.63", dmge=dmge, - )[0] in errors - - assert GenerateError.generate_type_error( + )[0] + in errors + ) + + assert ( + GenerateError.generate_type_error( val_rule="str", row_num="3", attribute_name="Check String", invalid_entry="94", dmge=dmge, - )[0] in errors + )[0] + in errors + ) - assert GenerateError.generate_list_error( + assert ( + GenerateError.generate_list_error( val_rule="list", list_string="9", row_num="3", @@ -196,9 +258,12 @@ def test_invalid_manifest(self, helpers, dmge): list_error="not_comma_delimited", invalid_entry="9", dmge=dmge, - )[0] in errors + )[0] + in errors + ) - assert GenerateError.generate_list_error( + assert ( + GenerateError.generate_list_error( val_rule="list", list_string="ab", row_num="4", @@ -206,9 +271,12 @@ def test_invalid_manifest(self, helpers, dmge): list_error="not_comma_delimited", invalid_entry="ab", dmge=dmge, - )[0] in errors + )[0] + in errors + ) - assert GenerateError.generate_list_error( + assert ( + GenerateError.generate_list_error( val_rule="list", list_string="a c f", row_num="3", @@ -216,9 +284,12 @@ def test_invalid_manifest(self, helpers, dmge): list_error="not_comma_delimited", invalid_entry="a c f", dmge=dmge, - )[0] in errors + )[0] + in errors + ) - assert GenerateError.generate_list_error( + assert ( + GenerateError.generate_list_error( val_rule="list", list_string="a", row_num="4", @@ -226,9 +297,12 @@ def test_invalid_manifest(self, helpers, dmge): list_error="not_comma_delimited", invalid_entry="a", dmge=dmge, - )[0] in errors + )[0] + in errors + ) - assert GenerateError.generate_list_error( + assert ( 
+ GenerateError.generate_list_error( val_rule="list", list_string="a", row_num="4", @@ -236,9 +310,12 @@ def test_invalid_manifest(self, helpers, dmge): list_error="not_comma_delimited", invalid_entry="a", dmge=dmge, - )[0] in errors + )[0] + in errors + ) - assert GenerateError.generate_regex_error( + assert ( + GenerateError.generate_regex_error( val_rule="regex", reg_expression="[a-f]", row_num="3", @@ -246,9 +323,12 @@ def test_invalid_manifest(self, helpers, dmge): module_to_call="match", invalid_entry="m", dmge=dmge, - )[0] in errors + )[0] + in errors + ) - assert GenerateError.generate_regex_error( + assert ( + GenerateError.generate_regex_error( val_rule="regex", reg_expression="[a-f]", row_num="3", @@ -256,9 +336,12 @@ def test_invalid_manifest(self, helpers, dmge): module_to_call="search", invalid_entry="q", dmge=dmge, - )[0] in errors + )[0] + in errors + ) - assert GenerateError.generate_regex_error( + assert ( + GenerateError.generate_regex_error( val_rule="regex", reg_expression="^\d+$", row_num="2", @@ -266,9 +349,12 @@ def test_invalid_manifest(self, helpers, dmge): module_to_call="search", invalid_entry="5.4", dmge=dmge, - )[0] in errors + )[0] + in errors + ) - assert GenerateError.generate_url_error( + assert ( + GenerateError.generate_url_error( val_rule="url", url="http://googlef.com/", url_error="invalid_url", @@ -277,7 +363,9 @@ def test_invalid_manifest(self, helpers, dmge): argument=None, invalid_entry="http://googlef.com/", dmge=dmge, - )[0] in errors + )[0] + in errors + ) date_err = GenerateError.generate_content_error( val_rule="date", @@ -289,21 +377,27 @@ def test_invalid_manifest(self, helpers, dmge): error_in_list = [date_err[2] in error for error in errors] assert any(error_in_list) - assert GenerateError.generate_content_error( + assert ( + GenerateError.generate_content_error( val_rule="unique error", attribute_name="Check Unique", dmge=dmge, row_num=["2", "3", "4"], invalid_entry=["str1"], - )[0] in errors + )[0] + in errors + ) - assert GenerateError.generate_content_error( + assert ( + GenerateError.generate_content_error( val_rule="inRange 50 100 error", attribute_name="Check Range", dmge=dmge, row_num=["3"], invalid_entry=["30"], - )[0] in errors + )[0] + in errors + ) assert ( GenerateError.generate_cross_warning( @@ -314,7 +408,7 @@ def test_invalid_manifest(self, helpers, dmge): invalid_entry=["123"], dmge=dmge, )[0] - in errors + in errors ) assert ( @@ -325,54 +419,69 @@ def test_invalid_manifest(self, helpers, dmge): invalid_entry=["123"], dmge=dmge, )[0] - in errors + in errors ) # check warnings - assert GenerateError.generate_content_error( + assert ( + GenerateError.generate_content_error( val_rule="recommended", attribute_name="Check Recommended", dmge=dmge, - )[1] in warnings + )[1] + in warnings + ) - assert GenerateError.generate_content_error( + assert ( + GenerateError.generate_content_error( val_rule="protectAges", attribute_name="Check Ages", dmge=dmge, row_num=["2", "3"], invalid_entry=["6549", "32851"], - )[1] in warnings + )[1] + in warnings + ) - assert GenerateError.generate_cross_warning( + assert ( + GenerateError.generate_cross_warning( val_rule="matchAtLeastOne", row_num=["3"], attribute_name="Check Match at Least", invalid_entry=["7163"], manifest_id=["syn54126997", "syn54127001"], dmge=dmge, - )[1] in warnings + )[1] + in warnings + ) - assert GenerateError.generate_cross_warning( + assert ( + GenerateError.generate_cross_warning( val_rule="matchAtLeastOne MockComponent.checkMatchatLeastvalues value", row_num=["3"], 
attribute_name="Check Match at Least values", invalid_entry=["51100"], dmge=dmge, - )[1] in warnings + )[1] + in warnings + ) - assert \ + assert ( GenerateError.generate_cross_warning( val_rule="matchExactlyOne", attribute_name="Check Match Exactly", matching_manifests=["syn54126950", "syn54127008"], dmge=dmge, - )[1] in warnings \ + )[1] + in warnings or GenerateError.generate_cross_warning( val_rule="matchExactlyOne", attribute_name="Check Match Exactly", matching_manifests=["syn54127702", "syn54127008"], dmge=dmge, - )[1] in warnings + )[1] + in warnings + ) cross_warning = GenerateError.generate_cross_warning( val_rule="matchExactlyOne MockComponent.checkMatchExactlyvalues MockComponent.checkMatchExactlyvalues value", @@ -385,7 +494,6 @@ def test_invalid_manifest(self, helpers, dmge): warning_in_list = [cross_warning[1] in warning for warning in warnings] assert any(warning_in_list) - def test_in_house_validation(self, helpers, dmge): metadataModel = get_metadataModel(helpers, model_name="example.model.jsonld") manifestPath = helpers.get_data_path("mock_manifests/Invalid_Test_Manifest.csv") @@ -399,39 +507,52 @@ def test_in_house_validation(self, helpers, dmge): ) # Check errors - assert GenerateError.generate_type_error( + assert ( + GenerateError.generate_type_error( val_rule="num", row_num="3", attribute_name="Check Num", invalid_entry="c", dmge=dmge, - )[0] in errors + )[0] + in errors + ) - assert GenerateError.generate_type_error( + assert ( + GenerateError.generate_type_error( val_rule="int", row_num="3", attribute_name="Check Int", invalid_entry="5.63", dmge=dmge, - )[0] in errors + )[0] + in errors + ) - assert GenerateError.generate_type_error( + assert ( + GenerateError.generate_type_error( val_rule="str", row_num="3", attribute_name="Check String", invalid_entry="94", dmge=dmge, - )[0] in errors + )[0] + in errors + ) - assert GenerateError.generate_type_error( + assert ( + GenerateError.generate_type_error( val_rule="int", row_num="3", attribute_name="Check NA", invalid_entry="9.5", dmge=dmge, - )[0] in errors + )[0] + in errors + ) - assert GenerateError.generate_list_error( + assert ( + GenerateError.generate_list_error( val_rule="list", list_string="9", row_num="3", @@ -439,9 +560,12 @@ def test_in_house_validation(self, helpers, dmge): list_error="not_comma_delimited", invalid_entry="9", dmge=dmge, - )[0] in errors + )[0] + in errors + ) - assert GenerateError.generate_list_error( + assert ( + GenerateError.generate_list_error( val_rule="list", list_string="ab", row_num="4", @@ -449,9 +573,12 @@ def test_in_house_validation(self, helpers, dmge): list_error="not_comma_delimited", invalid_entry="ab", dmge=dmge, - )[0] in errors + )[0] + in errors + ) - assert GenerateError.generate_regex_error( + assert ( + GenerateError.generate_regex_error( val_rule="regex", reg_expression="[a-f]", row_num="3", @@ -459,9 +586,12 @@ def test_in_house_validation(self, helpers, dmge): module_to_call="search", invalid_entry="q", dmge=dmge, - )[0] in errors + )[0] + in errors + ) - assert GenerateError.generate_regex_error( + assert ( + GenerateError.generate_regex_error( val_rule="regex", reg_expression="[a-f]", row_num="3", @@ -469,9 +599,12 @@ def test_in_house_validation(self, helpers, dmge): module_to_call="match", invalid_entry="m", dmge=dmge, - )[0] in errors + )[0] + in errors + ) - assert GenerateError.generate_url_error( + assert ( + GenerateError.generate_url_error( val_rule="url", url="http://googlef.com/", url_error="invalid_url", @@ -480,7 +613,9 @@ def 
test_in_house_validation(self, helpers, dmge): argument=None, invalid_entry="http://googlef.com/", dmge=dmge, - )[0] in errors + )[0] + in errors + ) assert ( GenerateError.generate_cross_warning( @@ -491,7 +626,7 @@ def test_in_house_validation(self, helpers, dmge): invalid_entry=["123"], dmge=dmge, )[0] - in errors + in errors ) assert ( @@ -502,56 +637,66 @@ def test_in_house_validation(self, helpers, dmge): invalid_entry=["123"], dmge=dmge, )[0] - in errors + in errors ) # Check Warnings - assert GenerateError.generate_cross_warning( + assert ( + GenerateError.generate_cross_warning( val_rule="matchAtLeastOne", row_num=["3"], attribute_name="Check Match at Least", invalid_entry=["7163"], manifest_id=["syn54126997", "syn54127001"], dmge=dmge, - )[1] in warnings + )[1] + in warnings + ) - assert GenerateError.generate_cross_warning( + assert ( + GenerateError.generate_cross_warning( val_rule="matchAtLeastOne MockComponent.checkMatchatLeastvalues value", row_num=["3"], attribute_name="Check Match at Least values", invalid_entry=["51100"], dmge=dmge, - )[1] in warnings + )[1] + in warnings + ) - assert \ + assert ( GenerateError.generate_cross_warning( val_rule="matchExactlyOne", attribute_name="Check Match Exactly", matching_manifests=["syn54126950", "syn54127008"], dmge=dmge, - )[1] in warnings \ + )[1] + in warnings or GenerateError.generate_cross_warning( val_rule="matchExactlyOne", attribute_name="Check Match Exactly", matching_manifests=["syn54127702", "syn54127008"], dmge=dmge, - )[1] in warnings + )[1] + in warnings + ) - assert GenerateError.generate_cross_warning( + assert ( + GenerateError.generate_cross_warning( val_rule="matchExactlyOne MockComponent.checkMatchExactlyvalues MockComponent.checkMatchExactlyvalues value", row_num=["2", "3", "4"], attribute_name="Check Match Exactly values", invalid_entry=["71738", "98085", "210065"], dmge=dmge, - )[1] in warnings - + )[1] + in warnings + ) - def test_missing_column(self, helpers, dmge:DataModelGraph): - """ Test that a manifest missing a column returns the proper error. 
- """ - model_name="example.model.csv" - manifest_name="mock_manifests/Invalid_Biospecimen_Missing_Column_Manifest.csv" - root_node="Biospecimen" + def test_missing_column(self, helpers, dmge: DataModelGraph): + """Test that a manifest missing a column returns the proper error.""" + model_name = "example.model.csv" + manifest_name = "mock_manifests/Invalid_Biospecimen_Missing_Column_Manifest.csv" + root_node = "Biospecimen" manifest_path = helpers.get_data_path(manifest_name) metadataModel = get_metadataModel(helpers, model_name) @@ -560,14 +705,16 @@ def test_missing_column(self, helpers, dmge:DataModelGraph): rootNode=root_node, ) - assert GenerateError.generate_schema_error( - row_num='2', + assert ( + GenerateError.generate_schema_error( + row_num="2", attribute_name="Wrong schema", error_message="'Tissue Status' is a required property", invalid_entry="Wrong schema", dmge=dmge, - )[0] in errors - + )[0] + in errors + ) @pytest.mark.parametrize( "model_name", @@ -577,19 +724,46 @@ def test_missing_column(self, helpers, dmge:DataModelGraph): ], ids=["example_model", "example_with_requirements_from_vr"], ) - @pytest.mark.parametrize( - ["manifest_name", "root_node",], [ - ("mock_manifests/Biospecimen_required_vr_test_fail.manifest.csv", "Biospecimen"), - ("mock_manifests/Biospecimen_required_vr_test_pass.manifest.csv", "Biospecimen"), + "manifest_name", + "root_node", + ], + [ + ( + "mock_manifests/Biospecimen_required_vr_test_fail.manifest.csv", + "Biospecimen", + ), + ( + "mock_manifests/Biospecimen_required_vr_test_pass.manifest.csv", + "Biospecimen", + ), ("mock_manifests/Patient_required_vr_test_pass.manifest.csv", "Patient"), - ("mock_manifests/Patient_test_no_entry_for_cond_required_column.manifest.csv", "Patient"), - ("mock_manifests/BulkRNAseq_component_based_required_rule_test.manifest.csv", "BulkRNA-seqAssay"), + ( + "mock_manifests/Patient_test_no_entry_for_cond_required_column.manifest.csv", + "Patient", + ), + ( + "mock_manifests/BulkRNAseq_component_based_required_rule_test.manifest.csv", + "BulkRNA-seqAssay", + ), + ], + ids=[ + "biospeciment_required_vr_empty", + "biospecimen_required_filled", + "patient_not_required_empty", + "patient_conditionally_required_not_filled", + "bulk_rna_seq_component_based_rule_test", ], - ids=["biospeciment_required_vr_empty", "biospecimen_required_filled", "patient_not_required_empty", "patient_conditionally_required_not_filled", "bulk_rna_seq_component_based_rule_test"], ) - def test_required_validation_rule(self, helpers, model_name:str, manifest_name:str, root_node:str, dmge:DataModelGraphExplorer) -> None: + def test_required_validation_rule( + self, + helpers, + model_name: str, + manifest_name: str, + root_node: str, + dmge: DataModelGraphExplorer, + ) -> None: """ Args: model_name, str: model to run test validation against @@ -630,7 +804,11 @@ def test_required_validation_rule(self, helpers, model_name:str, manifest_name:s rootNode=root_node, ) - error_and_warning_free_manifests = ["Biospecimen_required_vr_test_pass", "Patient_test_no_entry_for_cond_required_column", ""] + error_and_warning_free_manifests = [ + "Biospecimen_required_vr_test_pass", + "Patient_test_no_entry_for_cond_required_column", + "", + ] # For each model, these manifest should pass, bc either the value is being passed as requierd, or its not currently required for manifest in error_and_warning_free_manifests: @@ -638,70 +816,85 @@ def test_required_validation_rule(self, helpers, model_name:str, manifest_name:s assert errors == [] assert warnings == [] - messages 
= {"patient_id_empty_warning": { - "row_num":"2", - "attribute_name":"Patient ID", - "error_message":"'' should be non-empty", - "invalid_entry":""}, - "bulk_rnaseq_cbr_error_1":{ - "row_num":"3", - "attribute_name":"Genome FASTA", - "error_message":"'' should be non-empty", - "invalid_entry":""}, - "bulk_rnaseq_cbr_error_2":{ - "row_num":"4", - "attribute_name":"File Format", - "error_message":"'' is not one of ['CSV/TSV', 'CRAM', 'FASTQ', 'BAM']", - "invalid_entry":""}, - } + messages = { + "patient_id_empty_warning": { + "row_num": "2", + "attribute_name": "Patient ID", + "error_message": "'' should be non-empty", + "invalid_entry": "", + }, + "bulk_rnaseq_cbr_error_1": { + "row_num": "3", + "attribute_name": "Genome FASTA", + "error_message": "'' should be non-empty", + "invalid_entry": "", + }, + "bulk_rnaseq_cbr_error_2": { + "row_num": "4", + "attribute_name": "File Format", + "error_message": "'' is not one of ['CSV/TSV', 'CRAM', 'FASTQ', 'BAM']", + "invalid_entry": "", + }, + } # This manifest should fail in the example_model bc the manifest Required=False, and in the example_with_requirements_from_vr # bc the requirments are set to false in the validation rule - if (("Biospecimen_required_vr_test_fail" in manifest_name) or - ("Patient_required_vr_test_pass" in manifest_name and model_name == "example.model.csv") - ): + if ("Biospecimen_required_vr_test_fail" in manifest_name) or ( + "Patient_required_vr_test_pass" in manifest_name + and model_name == "example.model.csv" + ): message_key = "patient_id_empty_warning" - assert GenerateError.generate_schema_error( - row_num=messages[message_key]["row_num"], - attribute_name=messages[message_key]["attribute_name"], - error_message=messages[message_key]["error_message"], - invalid_entry=messages[message_key]["invalid_entry"], - dmge=dmge, - )[0] in errors + assert ( + GenerateError.generate_schema_error( + row_num=messages[message_key]["row_num"], + attribute_name=messages[message_key]["attribute_name"], + error_message=messages[message_key]["error_message"], + invalid_entry=messages[message_key]["invalid_entry"], + dmge=dmge, + )[0] + in errors + ) assert warnings == [] - if "Patient_required_vr_test_pass" in manifest_name and model_name == "example_required_vr_test.model.csv": + if ( + "Patient_required_vr_test_pass" in manifest_name + and model_name == "example_required_vr_test.model.csv" + ): assert errors == [] assert warnings == [] if "BulkRNAseq_component_based_required_rule_test" in manifest_name: message_key = "bulk_rnaseq_cbr_error_1" - assert GenerateError.generate_schema_error( + assert ( + GenerateError.generate_schema_error( row_num=messages[message_key]["row_num"], attribute_name=messages[message_key]["attribute_name"], error_message=messages[message_key]["error_message"], invalid_entry=messages[message_key]["invalid_entry"], dmge=dmge, - )[0] in errors + )[0] + in errors + ) message_key = "bulk_rnaseq_cbr_error_2" expected_error = GenerateError.generate_schema_error( - row_num=messages[message_key]["row_num"], - attribute_name=messages[message_key]["attribute_name"], - error_message=messages[message_key]["error_message"], - invalid_entry=messages[message_key]["invalid_entry"], - dmge=dmge, - )[0] + row_num=messages[message_key]["row_num"], + attribute_name=messages[message_key]["attribute_name"], + error_message=messages[message_key]["error_message"], + invalid_entry=messages[message_key]["invalid_entry"], + dmge=dmge, + )[0] # since the valid value order isnt set in error reporting, check a portion of the expected 
output # Check the error row is expected assert expected_error[1] in errors[1] # Check that one of the values for the expected valid values is present # Extract a valid value - valid_value = expected_error[2].split(',')[-1].split(']')[0].strip(' ').strip("\'") - assert valid_value in errors[1][2] - assert warnings==[] - + valid_value = ( + expected_error[2].split(",")[-1].split("]")[0].strip(" ").strip("'") + ) + assert valid_value in errors[1][2] + assert warnings == [] @pytest.mark.parametrize( "manifest_path", @@ -756,13 +949,16 @@ def test_component_validations(self, helpers, manifest_path, dmge): and vmr_warnings[0][-1] == ["123"] ) - @pytest.mark.rule_combos( reason="This introduces a great number of tests covering every possible rule combination that are only necessary on occasion." ) @pytest.mark.parametrize("base_rule, second_rule", get_rule_combinations()) def test_rule_combinations( - self, helpers, dmge, base_rule, second_rule, + self, + helpers, + dmge, + base_rule, + second_rule, ): """ TODO: Describe what this test is doing. From f9543fcca5d4a1d0595b1fd9ff141de2599b67d5 Mon Sep 17 00:00:00 2001 From: linglp Date: Mon, 17 Jun 2024 15:22:45 -0400 Subject: [PATCH 096/110] remove unintended code --- schematic_api/api/__main__.py | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/schematic_api/api/__main__.py b/schematic_api/api/__main__.py index 316ce4e59..afc24b44a 100644 --- a/schematic_api/api/__main__.py +++ b/schematic_api/api/__main__.py @@ -1,20 +1,5 @@ import os from schematic_api.api import app -import traceback -import jsonify - - -@app.errorhandler(Exception) -def handle_exception(e): - # Get the last line of the traceback - last_line = traceback.format_exc().strip().split("\n")[-1] - - # Log the full traceback (optional) - app.logger.error(traceback.format_exc()) - - # Return a JSON response with the last line of the error - response = {"status": "error", "message": last_line} - return jsonify(response), 500 def main(): From 47f40ac0e86d07fa5d577d02a027cd23410e4b90 Mon Sep 17 00:00:00 2001 From: Lingling <55448354+linglp@users.noreply.github.com> Date: Mon, 17 Jun 2024 16:53:01 -0400 Subject: [PATCH 097/110] updated typo Co-authored-by: BryanFauble <17128019+BryanFauble@users.noreply.github.com> --- schematic/store/synapse.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/schematic/store/synapse.py b/schematic/store/synapse.py index a78a64f20..7090030bf 100644 --- a/schematic/store/synapse.py +++ b/schematic/store/synapse.py @@ -244,7 +244,7 @@ def _purge_synapse_cache( self.syn.cache, minutes=minute_buffer ) logger.info( - f"{num_of_deleted_files} files have been d eleted from {self.root_synapse_cache}" + f"{num_of_deleted_files} files have been deleted from {self.root_synapse_cache}" ) else: # on AWS, OS takes around 14-17% of our ephemeral storage (20GiB) From ecde41708f737fcb1f3091053bc296ce68eb49cf Mon Sep 17 00:00:00 2001 From: linglp Date: Mon, 17 Jun 2024 17:36:06 -0400 Subject: [PATCH 098/110] allow positional arguments to be used as well --- schematic_api/api/routes.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/schematic_api/api/routes.py b/schematic_api/api/routes.py index c05604e86..9c44b897f 100644 --- a/schematic_api/api/routes.py +++ b/schematic_api/api/routes.py @@ -83,17 +83,20 @@ def decorator(func): """create a decorator """ @wraps(func) - def wrapper(**kwargs: Any): - """create a wrapper function + def wrapper(*args, **kwargs): + """create a wrapper function. 
Any number of positional arguments and keyword arguments can be passed here. """ tracer = trace.get_tracer(__name__) # Start a new span with the function's name with tracer.start_as_current_span(func.__name__) as span: # Set values of parameters as tags + for i, arg in enumerate(args): + span.set_attribute(f'arg{i}', arg) + for name, value in kwargs.items(): span.set_attribute(name, value) # Call the actual function - result = func(**kwargs) + result = func(*args, **kwargs) return result return wrapper return decorator From 29a00a2b1fc244404741f13f06c4182a06833474 Mon Sep 17 00:00:00 2001 From: linglp Date: Tue, 18 Jun 2024 11:42:44 -0400 Subject: [PATCH 099/110] remove sleep in test --- tests/test_store.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/tests/test_store.py b/tests/test_store.py index d4adb36d1..98d11fd48 100644 --- a/tests/test_store.py +++ b/tests/test_store.py @@ -546,8 +546,7 @@ async def test_process_store_annos_failure(self, synapse_store: SynapseStorage) """test _process_store_annos function when there's an error either getting or storing annotations """ async def mock_failure_coro(): - await asyncio.sleep(0.1) - raise ValueError("sample error") + raise ValueError("sample error") # create tasks that will fail tasks = set() @@ -572,9 +571,8 @@ async def test_process_store_annos_success_store(self, synapse_store: SynapseSto }, etag="mock etag", id="mock_syn_id") - + async def mock_success_coro(): - await asyncio.sleep(0.1) return stored_annos with patch("schematic.store.synapse.SynapseStorage.store_async_annotation",new_callable=AsyncMock) as mock_store_async1: @@ -615,7 +613,6 @@ async def test_process_store_annos_success_get(self, synapse_store: SynapseStora id="mock_syn_id") async def mock_success_coro(): - await asyncio.sleep(0.1) return mock_annos_dict # make sure that the else statement is working @@ -1098,10 +1095,10 @@ async def test_add_annotations_to_entities_files( expected_entity_ids (list(str)): expected list of entity ids """ async def mock_format_row_annos(): - await asyncio.sleep(0.1) + return async def mock_process_store_annos(requests): - await asyncio.sleep(0.1) + return with patch( "schematic.store.synapse.SynapseStorage.getFilesInStorageDataset", From 0ea1aec431a35dc04ca77f8c797d1ce68de32392 Mon Sep 17 00:00:00 2001 From: linglp Date: Tue, 18 Jun 2024 13:57:16 -0400 Subject: [PATCH 100/110] update gh workflow --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index b9e719dda..0b1a152ef 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -90,7 +90,7 @@ jobs: run: | # ran only on certain files for now # add here when checked - poetry run black schematic --check + poetry run black schematic tests schematic_api --check #---------------------------------------------- # type checking/enforcement From aafaf739280c1409c36e32d05f0be334b5cb7d5b Mon Sep 17 00:00:00 2001 From: linglp Date: Wed, 19 Jun 2024 11:11:35 -0400 Subject: [PATCH 101/110] add test for changeFileMetaData function --- tests/test_store.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/tests/test_store.py b/tests/test_store.py index 7a162c6e1..5ac61f3d0 100644 --- a/tests/test_store.py +++ b/tests/test_store.py @@ -1014,17 +1014,32 @@ def test_upload_manifest_file( } ) with patch("synapseclient.Synapse.store") as syn_store_mock, patch( - "synapseutils.copy_functions.changeFileMetaData" - ): + 
"schematic.store.synapse.synapseutils.copy_functions.changeFileMetaData" + ) as mock_change_file_metadata: syn_store_mock.return_value.id = "mock manifest id" + mock_component_name = "BulkRNA-seqAssay" mock_file_path = helpers.get_data_path(mock_manifest_file_path) mock_manifest_synapse_file_id = synapse_store.upload_manifest_file( manifest=test_df, metadataManifestPath=mock_file_path, datasetId="mock dataset id", restrict_manifest=True, + component_name=mock_component_name, ) + if "censored" in mock_manifest_file_path: + file_name = ( + f"synapse_storage_manifest_{mock_component_name}_censored.csv" + ) + else: + file_name = f"synapse_storage_manifest_{mock_component_name}.csv" + assert mock_manifest_synapse_file_id == "mock manifest id" + mock_change_file_metadata.assert_called_once_with( + forceVersion=False, + syn=synapse_store.syn, + entity=syn_store_mock.return_value.id, + downloadAs=file_name, + ) @pytest.mark.parametrize("file_annotations_upload", [True, False]) @pytest.mark.parametrize("hide_blanks", [True, False]) From ff141f1f36d1f1cde04b35a7da770913d7d148dd Mon Sep 17 00:00:00 2001 From: linglp Date: Wed, 19 Jun 2024 13:01:07 -0400 Subject: [PATCH 102/110] run black, remove unused commit --- schematic_api/api/__init__.py | 15 +++++++------- schematic_api/api/routes.py | 37 +++++++++++++++++++++++------------ 2 files changed, 31 insertions(+), 21 deletions(-) diff --git a/schematic_api/api/__init__.py b/schematic_api/api/__init__.py index 03eec9e5d..a65398ee5 100644 --- a/schematic_api/api/__init__.py +++ b/schematic_api/api/__init__.py @@ -15,12 +15,9 @@ config = Config( config={ - 'enabled': True, - 'sampler': { - 'type': 'const', - 'param': 1 - }, - 'logging': True, + "enabled": True, + "sampler": {"type": "const", "param": 1}, + "logging": True, }, service_name="schema-api", ) @@ -74,11 +71,13 @@ def handle_synapse_access_error(e: Exception) -> Tuple[str, int]: app = create_app() -flask_tracer = FlaskTracer(jaeger_tracer, True, app, ['url', 'url_rule', 'environ.HTTP_X_REAL_IP', 'path']) +flask_tracer = FlaskTracer( + jaeger_tracer, True, app, ["url", "url_rule", "environ.HTTP_X_REAL_IP", "path"] +) # def route_code(): # import flask_schematic as sc # sc.method1() -#] +# ] # diff --git a/schematic_api/api/routes.py b/schematic_api/api/routes.py index 0387cdb87..685330109 100644 --- a/schematic_api/api/routes.py +++ b/schematic_api/api/routes.py @@ -25,7 +25,11 @@ from opentelemetry import trace from opentelemetry.sdk.trace import TracerProvider from opentelemetry.sdk.resources import SERVICE_NAME, Resource -from opentelemetry.sdk.trace.export import BatchSpanProcessor, ConsoleSpanExporter, SimpleSpanProcessor, Span +from opentelemetry.sdk.trace.export import ( + BatchSpanProcessor, + ConsoleSpanExporter, + Span, +) from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter from schematic.configuration.configuration import CONFIG @@ -46,20 +50,23 @@ SynapseTimeoutError, ) from schematic.utils.general import entity_type_mapping -from schematic.utils.schema_utils import get_property_label_from_display_name, DisplayLabelType from schematic.utils.schema_utils import ( get_property_label_from_display_name, DisplayLabelType, +) +from schematic.utils.schema_utils import ( + get_property_label_from_display_name, + DisplayLabelType, +) logger = logging.getLogger(__name__) logging.basicConfig(level=logging.DEBUG) trace.set_tracer_provider( - TracerProvider( - resource=Resource(attributes={SERVICE_NAME: "schematic-api"}) - ) + 
TracerProvider(resource=Resource(attributes={SERVICE_NAME: "schematic-api"})) ) + # borrowed from: https://github.com/Sage-Bionetworks/synapsePythonClient/blob/develop/tests/integration/conftest.py class FileSpanExporter(ConsoleSpanExporter): """Create an exporter for OTEL data to a file.""" @@ -75,34 +82,37 @@ def export(self, spans: List[Span]) -> None: span_json_one_line = span.to_json().replace("\n", "") + "\n" f.write(span_json_one_line) + trace.get_tracer_provider().add_span_processor(BatchSpanProcessor(OTLPSpanExporter())) # processor = SimpleSpanProcessor(FileSpanExporter("otel_spans_schemati_api.json")) # trace.get_tracer_provider().add_span_processor(processor) tracer = trace.get_tracer("Schematic") + def trace_function_params(): - """capture all the parameters of API requests - """ + """capture all the parameters of API requests""" + def decorator(func): - """create a decorator - """ + """create a decorator""" + @wraps(func) def wrapper(*args, **kwargs): - """create a wrapper function. Any number of positional arguments and keyword arguments can be passed here. - """ + """create a wrapper function. Any number of positional arguments and keyword arguments can be passed here.""" tracer = trace.get_tracer(__name__) # Start a new span with the function's name with tracer.start_as_current_span(func.__name__) as span: - # Set values of parameters as tags + # Set values of parameters as tags for i, arg in enumerate(args): - span.set_attribute(f'arg{i}', arg) + span.set_attribute(f"arg{i}", arg) for name, value in kwargs.items(): span.set_attribute(name, value) # Call the actual function result = func(*args, **kwargs) return result + return wrapper + return decorator @@ -271,6 +281,7 @@ def save_file(file_key="csv_file"): return temp_path + @tracer.start_as_current_span("routes:initalize_metadata_model") def initalize_metadata_model(schema_url, data_model_labels): # get path to temp data model file (csv or jsonld) as appropriate From 9229884f68004a3b25121b11809d9baa64fd6293 Mon Sep 17 00:00:00 2001 From: linglp Date: Thu, 20 Jun 2024 15:27:56 -0400 Subject: [PATCH 103/110] remove tracing on initialize data model to avoid confusion --- schematic_api/api/routes.py | 1 - 1 file changed, 1 deletion(-) diff --git a/schematic_api/api/routes.py b/schematic_api/api/routes.py index 685330109..b2439133e 100644 --- a/schematic_api/api/routes.py +++ b/schematic_api/api/routes.py @@ -282,7 +282,6 @@ def save_file(file_key="csv_file"): return temp_path -@tracer.start_as_current_span("routes:initalize_metadata_model") def initalize_metadata_model(schema_url, data_model_labels): # get path to temp data model file (csv or jsonld) as appropriate data_model = get_temp_model_path(schema_url) From 3577f136b438c0282e98570c8c2b45381820f1e3 Mon Sep 17 00:00:00 2001 From: GiaJordan Date: Thu, 20 Jun 2024 14:19:26 -0700 Subject: [PATCH 104/110] add test for bugfix --- tests/test_manifest.py | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/tests/test_manifest.py b/tests/test_manifest.py index da88dda95..ba544c4d1 100644 --- a/tests/test_manifest.py +++ b/tests/test_manifest.py @@ -761,3 +761,32 @@ def test_create_manifests( data_model_labels="class_label", ) assert all_results == expected_result + + def test_get_record_based_manifest_with_files(self, helpers): + """ + Test to ensure that when generating a record based manifset that has files in the dataset that the files are not added to the manifest as well + """ + path_to_data_model = helpers.get_data_path("example.model.jsonld") 
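+        # Build the graph for the example model; the record-based Biospecimen manifest
+        # is generated against a dataset that also contains files, and the assertions
+        # below confirm that no Filename column is pulled into the manifest.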
+ + graph_data_model = generate_graph_data_model( + helpers, + path_to_data_model=path_to_data_model, + data_model_labels="class_label", + ) + + generator = ManifestGenerator( + path_to_data_model=path_to_data_model, + graph=graph_data_model, + root="Biospecimen", + use_annotations=True, + ) + + manifest = generator.get_manifest( + dataset_id="syn61260107", output_format="dataframe" + ) + + filename_not_in_manifest_columns = "Filename" not in manifest.columns + n_rows = manifest.shape[0] + + assert filename_not_in_manifest_columns + assert n_rows == 4 From 4e004b6feb964559510aad7c7ac3cde3c6d51bcb Mon Sep 17 00:00:00 2001 From: GiaJordan Date: Thu, 20 Jun 2024 14:20:46 -0700 Subject: [PATCH 105/110] change behavior for records based manifests --- schematic/store/synapse.py | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/schematic/store/synapse.py b/schematic/store/synapse.py index 05618df72..688bbce4f 100644 --- a/schematic/store/synapse.py +++ b/schematic/store/synapse.py @@ -746,22 +746,26 @@ def updateDatasetManifestFiles( manifest_filepath = self.syn.get(manifest_id).path manifest = load_df(manifest_filepath) + manifest_is_file_based = "Filename" in manifest.columns - # update manifest with additional filenames, if any - # note that if there is an existing manifest and there are files in the dataset - # the columns Filename and entityId are assumed to be present in manifest schema - # TODO: use idiomatic panda syntax + if manifest_is_file_based: + # update manifest with additional filenames, if any + # note that if there is an existing manifest and there are files in the dataset + # the columns Filename and entityId are assumed to be present in manifest schema + # TODO: use idiomatic panda syntax - dataset_files, manifest = self.fill_in_entity_id_filename(datasetId, manifest) - if dataset_files: - # update the manifest file, so that it contains the relevant entity IDs - if store: - manifest.to_csv(manifest_filepath, index=False) - - # store manifest and update associated metadata with manifest on Synapse - manifest_id = self.associateMetadataWithFiles( - dmge, manifest_filepath, datasetId - ) + dataset_files, manifest = self.fill_in_entity_id_filename( + datasetId, manifest + ) + if dataset_files: + # update the manifest file, so that it contains the relevant entity IDs + if store: + manifest.to_csv(manifest_filepath, index=False) + + # store manifest and update associated metadata with manifest on Synapse + manifest_id = self.associateMetadataWithFiles( + dmge, manifest_filepath, datasetId + ) return manifest_id, manifest From adc4c012a7aa1bf311a0280ef9eaaf8d83d01ff4 Mon Sep 17 00:00:00 2001 From: linglp Date: Fri, 21 Jun 2024 09:31:57 -0400 Subject: [PATCH 106/110] fix test --- tests/test_store.py | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/tests/test_store.py b/tests/test_store.py index 79b1e8bd7..cd5f4385b 100644 --- a/tests/test_store.py +++ b/tests/test_store.py @@ -1232,9 +1232,14 @@ def test_upload_manifest_as_csv( hide_blanks: bool, restrict: bool, ) -> None: + async def mock_add_annotations_to_entities_files(): + return + with ( patch( - "schematic.store.synapse.SynapseStorage.add_annotations_to_entities_files" + "schematic.store.synapse.SynapseStorage.add_annotations_to_entities_files", + return_value=mock_add_annotations_to_entities_files, + new_callable=AsyncMock, ) as add_anno_mock, patch( "schematic.store.synapse.SynapseStorage.upload_manifest_file", @@ -1282,13 +1287,19 @@ 
def test_upload_manifest_as_table( manifest_record_type: str, ) -> None: mock_df = pd.DataFrame() + + async def mock_add_annotations_to_entities_files(): + return + with ( patch( "schematic.store.synapse.SynapseStorage.uploadDB", return_value=["mock_table_id", mock_df, "mock_table_manifest"], ) as update_db_mock, patch( - "schematic.store.synapse.SynapseStorage.add_annotations_to_entities_files" + "schematic.store.synapse.SynapseStorage.add_annotations_to_entities_files", + return_value=mock_add_annotations_to_entities_files, + new_callable=AsyncMock, ) as add_anno_mock, patch( "schematic.store.synapse.SynapseStorage.upload_manifest_file", @@ -1342,13 +1353,19 @@ def test_upload_manifest_combo( mock_df = pd.DataFrame() manifest_path = helpers.get_data_path("mock_manifests/test_BulkRNAseq.csv") manifest_df = helpers.get_data_frame(manifest_path) + + async def mock_add_annotations_to_entities_files(): + return + with ( patch( "schematic.store.synapse.SynapseStorage.uploadDB", return_value=["mock_table_id", mock_df, "mock_table_manifest"], ) as update_db_mock, patch( - "schematic.store.synapse.SynapseStorage.add_annotations_to_entities_files" + "schematic.store.synapse.SynapseStorage.add_annotations_to_entities_files", + return_value=mock_add_annotations_to_entities_files, + new_callable=AsyncMock, ) as add_anno_mock, patch( "schematic.store.synapse.SynapseStorage.upload_manifest_file", From 0a75bd13adb36d6de56604e38be2947838dfaba1 Mon Sep 17 00:00:00 2001 From: GiaJordan Date: Fri, 21 Jun 2024 09:24:09 -0700 Subject: [PATCH 107/110] add test for file based manifest as well --- tests/test_manifest.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/tests/test_manifest.py b/tests/test_manifest.py index ba544c4d1..62ae0b82b 100644 --- a/tests/test_manifest.py +++ b/tests/test_manifest.py @@ -762,7 +762,11 @@ def test_create_manifests( ) assert all_results == expected_result - def test_get_record_based_manifest_with_files(self, helpers): + @pytest.mark.parametrize( + "component,datasetId", + [("Biospecimen", "syn61260107"), ("BulkRNA-seqAssay", "syn61374924")], + ) + def test_get_record_based_manifest_with_files(self, helpers, component, datasetId): """ Test to ensure that when generating a record based manifset that has files in the dataset that the files are not added to the manifest as well """ @@ -777,16 +781,20 @@ def test_get_record_based_manifest_with_files(self, helpers): generator = ManifestGenerator( path_to_data_model=path_to_data_model, graph=graph_data_model, - root="Biospecimen", + root=component, use_annotations=True, ) manifest = generator.get_manifest( - dataset_id="syn61260107", output_format="dataframe" + dataset_id=datasetId, output_format="dataframe" ) - filename_not_in_manifest_columns = "Filename" not in manifest.columns + filename_in_manifest_columns = "Filename" in manifest.columns n_rows = manifest.shape[0] - assert filename_not_in_manifest_columns - assert n_rows == 4 + if component == "Biospecimen": + assert not filename_in_manifest_columns + assert n_rows == 4 + elif component == "BulkRNA-seqAssay": + assert filename_in_manifest_columns + assert n_rows == 3 From 4141053b460e9eca242a408fd3eb0f160fa1bbd9 Mon Sep 17 00:00:00 2001 From: GiaJordan Date: Fri, 21 Jun 2024 09:28:28 -0700 Subject: [PATCH 108/110] update test ids and name --- tests/test_manifest.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_manifest.py b/tests/test_manifest.py index 62ae0b82b..c34148dab 100644 --- 
a/tests/test_manifest.py +++ b/tests/test_manifest.py @@ -765,8 +765,9 @@ def test_create_manifests( @pytest.mark.parametrize( "component,datasetId", [("Biospecimen", "syn61260107"), ("BulkRNA-seqAssay", "syn61374924")], + ids=["Record based", "File based"], ) - def test_get_record_based_manifest_with_files(self, helpers, component, datasetId): + def test_get_manifest_with_files(self, helpers, component, datasetId): """ Test to ensure that when generating a record based manifset that has files in the dataset that the files are not added to the manifest as well """ From 76fb30538203273b1871edc4bdca3c4c77fd5736 Mon Sep 17 00:00:00 2001 From: Lingling <55448354+linglp@users.noreply.github.com> Date: Thu, 27 Jun 2024 15:39:31 -0400 Subject: [PATCH 109/110] Revert "feat: set annotations in an async way when submitting a manifest" --- poetry.lock | 54 +++------ pyproject.toml | 4 +- schematic/store/synapse.py | 180 ++++++++++------------------ tests/test_store.py | 232 +++---------------------------------- 4 files changed, 98 insertions(+), 372 deletions(-) diff --git a/poetry.lock b/poetry.lock index efab907c7..203239b7a 100644 --- a/poetry.lock +++ b/poetry.lock @@ -190,19 +190,6 @@ files = [ [package.dependencies] typing-extensions = {version = ">=4.0.0", markers = "python_version < \"3.11\""} -[[package]] -name = "asyncio" -version = "3.4.3" -description = "reference implementation of PEP 3156" -optional = false -python-versions = "*" -files = [ - {file = "asyncio-3.4.3-cp33-none-win32.whl", hash = "sha256:b62c9157d36187eca799c378e572c969f0da87cd5fc42ca372d92cdb06e7e1de"}, - {file = "asyncio-3.4.3-cp33-none-win_amd64.whl", hash = "sha256:c46a87b48213d7464f22d9a497b9eef8c1928b68320a2fa94240f969f6fec08c"}, - {file = "asyncio-3.4.3-py3-none-any.whl", hash = "sha256:c4d18b22701821de07bd6aea8b53d21449ec0ec5680645e5317062ea21817d2d"}, - {file = "asyncio-3.4.3.tar.gz", hash = "sha256:83360ff8bc97980e4ff25c964c7bd3923d333d177aa4f7fb736b019f26c7cb41"}, -] - [[package]] name = "asyncio-atexit" version = "1.0.1" @@ -3243,24 +3230,6 @@ tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} [package.extras] testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] -[[package]] -name = "pytest-asyncio" -version = "0.23.7" -description = "Pytest support for asyncio" -optional = false -python-versions = ">=3.8" -files = [ - {file = "pytest_asyncio-0.23.7-py3-none-any.whl", hash = "sha256:009b48127fbe44518a547bddd25611551b0e43ccdbf1e67d12479f569832c20b"}, - {file = "pytest_asyncio-0.23.7.tar.gz", hash = "sha256:5f5c72948f4c49e7db4f29f2521d4031f1c27f86e57b046126654083d4770268"}, -] - -[package.dependencies] -pytest = ">=7.0.0,<9" - -[package.extras] -docs = ["sphinx (>=5.3)", "sphinx-rtd-theme (>=1.0)"] -testing = ["coverage (>=6.2)", "hypothesis (>=5.7.1)"] - [[package]] name = "pytest-cov" version = "4.1.0" @@ -3431,6 +3400,7 @@ files = [ {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:69b023b2b4daa7548bcfbd4aa3da05b3a74b772db9e23b982788168117739938"}, {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:81e0b275a9ecc9c0c0c07b4b90ba548307583c125f54d5b6946cfee6360c733d"}, {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba336e390cd8e4d1739f42dfe9bb83a3cc2e80f567d8805e11b46f4a943f5515"}, + {file = "PyYAML-6.0.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = 
"sha256:326c013efe8048858a6d312ddd31d56e468118ad4cdeda36c719bf5bb6192290"}, {file = "PyYAML-6.0.1-cp310-cp310-win32.whl", hash = "sha256:bd4af7373a854424dabd882decdc5579653d7868b8fb26dc7d0e99f823aa5924"}, {file = "PyYAML-6.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d"}, {file = "PyYAML-6.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6965a7bc3cf88e5a1c3bd2e0b5c22f8d677dc88a455344035f03399034eb3007"}, @@ -3438,8 +3408,16 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:42f8152b8dbc4fe7d96729ec2b99c7097d656dc1213a3229ca5383f973a5ed6d"}, {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:062582fca9fabdd2c8b54a3ef1c978d786e0f6b3a1510e0ac93ef59e0ddae2bc"}, {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d2b04aac4d386b172d5b9692e2d2da8de7bfb6c387fa4f801fbf6fb2e6ba4673"}, + {file = "PyYAML-6.0.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e7d73685e87afe9f3b36c799222440d6cf362062f78be1013661b00c5c6f678b"}, {file = "PyYAML-6.0.1-cp311-cp311-win32.whl", hash = "sha256:1635fd110e8d85d55237ab316b5b011de701ea0f29d07611174a1b42f1444741"}, {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, + {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, + {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, + {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"}, + {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, + {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, + {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, + {file = "PyYAML-6.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:0d3304d8c0adc42be59c5f8a4d9e3d7379e6955ad754aa9d6ab7a398b59dd1df"}, {file = "PyYAML-6.0.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:50550eb667afee136e9a77d6dc71ae76a44df8b3e51e41b77f6de2932bfe0f47"}, {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1fe35611261b29bd1de0070f0b2f47cb6ff71fa6595c077e42bd0c419fa27b98"}, {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:704219a11b772aea0d8ecd7058d0082713c3562b4e271b849ad7dc4a5c90c13c"}, @@ -3456,6 +3434,7 @@ files = [ {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a0cd17c15d3bb3fa06978b4e8958dcdc6e0174ccea823003a106c7d4d7899ac5"}, {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:28c119d996beec18c05208a8bd78cbe4007878c6dd15091efb73a30e90539696"}, {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7e07cbde391ba96ab58e532ff4803f79c4129397514e1413a7dc761ccd755735"}, + {file = "PyYAML-6.0.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = 
"sha256:49a183be227561de579b4a36efbb21b3eab9651dd81b1858589f796549873dd6"}, {file = "PyYAML-6.0.1-cp38-cp38-win32.whl", hash = "sha256:184c5108a2aca3c5b3d3bf9395d50893a7ab82a38004c8f61c258d4428e80206"}, {file = "PyYAML-6.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:1e2722cc9fbb45d9b87631ac70924c11d3a401b2d7f410cc0e3bbf249f2dca62"}, {file = "PyYAML-6.0.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9eb6caa9a297fc2c2fb8862bc5370d0303ddba53ba97e71f08023b6cd73d16a8"}, @@ -3463,6 +3442,7 @@ files = [ {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5773183b6446b2c99bb77e77595dd486303b4faab2b086e7b17bc6bef28865f6"}, {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b786eecbdf8499b9ca1d697215862083bd6d2a99965554781d0d8d1ad31e13a0"}, {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc1bf2925a1ecd43da378f4db9e4f799775d6367bdb94671027b73b393a7c42c"}, + {file = "PyYAML-6.0.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5"}, {file = "PyYAML-6.0.1-cp39-cp39-win32.whl", hash = "sha256:faca3bdcf85b2fc05d06ff3fbc1f83e1391b3e724afa3feba7d13eeab355484c"}, {file = "PyYAML-6.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:510c9deebc5c0225e8c96813043e62b680ba2f9c50a08d3724c7f28a747d1486"}, {file = "PyYAML-6.0.1.tar.gz", hash = "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43"}, @@ -4399,13 +4379,13 @@ Jinja2 = ">=2.0" [[package]] name = "synapseclient" -version = "4.3.0" +version = "4.2.0" description = "A client for Synapse, a collaborative, open-source research platform that allows teams to share data, track analyses, and collaborate." optional = false python-versions = ">=3.8" files = [ - {file = "synapseclient-4.3.0-py3-none-any.whl", hash = "sha256:5d8107cfff4031a0a46d60a3c9a8120300190fa27df4983d883dc951d8bd885f"}, - {file = "synapseclient-4.3.0.tar.gz", hash = "sha256:a1149a64b3281669d42c69e210677a902478b8f6b302966d518473c7384f6387"}, + {file = "synapseclient-4.2.0-py3-none-any.whl", hash = "sha256:ab5bc9c2bf5b90f271f1a9478eff7e9fca3e573578401ac706383ddb984d7a13"}, + {file = "synapseclient-4.2.0.tar.gz", hash = "sha256:89222661125de1795b1a096cf8c58b8115c19d6b0fa5846ed2a41cdb394ef773"}, ] [package.dependencies] @@ -4425,11 +4405,11 @@ urllib3 = ">=1.26.18,<2" [package.extras] boto3 = ["boto3 (>=1.7.0,<2.0)"] -dev = ["black", "flake8 (>=3.7.0,<4.0)", "func-timeout (>=4.3,<5.0)", "pandas (>=1.5,<3.0)", "pre-commit", "pytest (>=7.0.0,<8.0)", "pytest-asyncio (>=0.23.6,<1.0)", "pytest-cov (>=4.1.0,<4.2.0)", "pytest-mock (>=3.0,<4.0)", "pytest-rerunfailures (>=12.0,<13.0)", "pytest-socket (>=0.6.0,<0.7.0)", "pytest-xdist[psutil] (>=2.2,<3.0.0)"] +dev = ["black", "flake8 (>=3.7.0,<4.0)", "func-timeout (>=4.3,<5.0)", "pre-commit", "pytest (>=6.0.0,<7.0)", "pytest-asyncio (>=0.19,<1.0)", "pytest-cov (>=4.1.0,<4.2.0)", "pytest-mock (>=3.0,<4.0)", "pytest-rerunfailures (>=12.0,<13.0)", "pytest-socket (>=0.6.0,<0.7.0)", "pytest-xdist[psutil] (>=2.2,<3.0.0)"] docs = ["markdown-include (>=0.8.1,<0.9.0)", "mkdocs (>=1.5.3)", "mkdocs-material (>=9.4.14)", "mkdocs-open-in-new-tab (>=1.0.3,<1.1.0)", "mkdocstrings (>=0.24.0)", "mkdocstrings-python (>=1.7.5)", "termynal (>=0.11.1)"] pandas = ["pandas (>=1.5,<3.0)"] pysftp = ["pysftp (>=0.2.8,<0.3)"] -tests = ["flake8 (>=3.7.0,<4.0)", "func-timeout (>=4.3,<5.0)", "pandas (>=1.5,<3.0)", "pytest (>=7.0.0,<8.0)", "pytest-asyncio (>=0.23.6,<1.0)", 
"pytest-cov (>=4.1.0,<4.2.0)", "pytest-mock (>=3.0,<4.0)", "pytest-rerunfailures (>=12.0,<13.0)", "pytest-socket (>=0.6.0,<0.7.0)", "pytest-xdist[psutil] (>=2.2,<3.0.0)"] +tests = ["flake8 (>=3.7.0,<4.0)", "func-timeout (>=4.3,<5.0)", "pytest (>=6.0.0,<7.0)", "pytest-asyncio (>=0.19,<1.0)", "pytest-cov (>=4.1.0,<4.2.0)", "pytest-mock (>=3.0,<4.0)", "pytest-rerunfailures (>=12.0,<13.0)", "pytest-socket (>=0.6.0,<0.7.0)", "pytest-xdist[psutil] (>=2.2,<3.0.0)"] [[package]] name = "tabulate" @@ -4964,4 +4944,4 @@ aws = ["uWSGI"] [metadata] lock-version = "2.0" python-versions = ">=3.9.0,<3.11" -content-hash = "a3048c0808e73fd19f5175897e9dda47a2a593422dd4744886615ac453a42139" +content-hash = "5bf0c831977694ea541db24481181ec1980ec9589a2adbd9f30ed0fe7f2b2742" diff --git a/pyproject.toml b/pyproject.toml index 8d941b8ae..3c2795140 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -54,7 +54,7 @@ pygsheets = "^2.0.4" PyYAML = "^6.0.0" rdflib = "^6.0.0" setuptools = "^66.0.0" -synapseclient = "^4.3.0" +synapseclient = "^4.1.0" tenacity = "^8.0.1" toml = "^0.10.2" great-expectations = "^0.15.0" @@ -74,8 +74,6 @@ Flask = {version = "2.1.3", optional = true} Flask-Cors = {version = "^3.0.10", optional = true} uWSGI = {version = "^2.0.21", optional = true} Jinja2 = {version = ">2.11.3", optional = true} -asyncio = "^3.4.3" -pytest-asyncio = "^0.23.7" jaeger-client = {version = "^4.8.0", optional = true} flask-opentracing = {version="^2.0.0", optional = true} diff --git a/schematic/store/synapse.py b/schematic/store/synapse.py index 71711bbae..688bbce4f 100644 --- a/schematic/store/synapse.py +++ b/schematic/store/synapse.py @@ -11,7 +11,6 @@ import secrets import shutil import synapseclient -from synapseclient.api import get_entity_id_bundle2 import uuid # used to generate unique names for entities from tenacity import ( @@ -24,7 +23,7 @@ from time import sleep # allows specifying explicit variable types -from typing import Dict, List, Tuple, Sequence, Union, Optional, Any, Set +from typing import Dict, List, Tuple, Sequence, Union, Optional from synapseclient import ( Synapse, @@ -69,9 +68,6 @@ from schematic.store.base import BaseStorage from schematic.exceptions import AccessCredentialsError from schematic.configuration.configuration import CONFIG -from synapseclient.models.annotations import Annotations -import asyncio -from dataclasses import asdict from opentelemetry import trace logger = logging.getLogger("Synapse storage") @@ -711,6 +707,7 @@ def fill_in_entity_id_filename( new_files = self._get_file_entityIds( dataset_files=dataset_files, only_new_files=True, manifest=manifest ) + # update manifest so that it contains new dataset files new_files = pd.DataFrame(new_files) manifest = ( @@ -1358,43 +1355,8 @@ def upload_manifest_file( return manifest_synapse_file_id - async def get_async_annotation(self, synapse_id: str) -> Dict[str, Any]: - """get annotations asynchronously - - Args: - synapse_id (str): synapse id of the entity that the annotation belongs - - Returns: - Dict[str, Any]: The requested entity bundle matching - - """ - return await get_entity_id_bundle2( - entity_id=synapse_id, - request={"includeAnnotations": True}, - synapse_client=self.syn, - ) - - async def store_async_annotation(self, annotation_dict: dict) -> Annotations: - """store annotation in an async way - - Args: - annotation_dict (dict): annotation in a dictionary format - - Returns: - Annotations: The stored annotations. 
- """ - annotation_data = Annotations.from_dict( - synapse_annotations=annotation_dict["annotations"]["annotations"] - ) - annotation_class = Annotations( - annotations=annotation_data, - etag=annotation_dict["annotations"]["etag"], - id=annotation_dict["annotations"]["id"], - ) - return await annotation_class.store_async(self.syn) - @missing_entity_handler - async def format_row_annotations( + def format_row_annotations( self, dmge, row, entityId: str, hideBlanks: bool, annotation_keys: str ): # prepare metadata for Synapse storage (resolve display name into a name that Synapse annotations support (e.g no spaces, parenthesis) @@ -1426,8 +1388,7 @@ async def format_row_annotations( metadataSyn[keySyn] = v # set annotation(s) for the various objects/items in a dataset on Synapse - annos = await self.get_async_annotation(entityId) - + annos = self.syn.get_annotations(entityId) csv_list_regex = comma_separated_list_regex() for anno_k, anno_v in metadataSyn.items(): # Remove keys with nan or empty string values from dict of annotations to be uploaded @@ -1664,6 +1625,37 @@ def _generate_table_name(self, manifest): table_name = "synapse_storage_manifest_table" return table_name, component_name + @tracer.start_as_current_span("SynapseStorage::_add_annotations") + def _add_annotations( + self, + dmge, + row, + entityId: str, + hideBlanks: bool, + annotation_keys: str, + ): + """Helper function to format and add annotations to entities in Synapse. + Args: + dmge: DataModelGraphExplorer object, + row: current row of manifest being processed + entityId (str): synapseId of entity to add annotations to + hideBlanks: Boolean flag that does not upload annotation keys with blank values when true. Uploads Annotation keys with empty string values when false. + annotation_keys: (str) display_label/class_label(default), Determines labeling syle for annotation keys. class_label will format the display + name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain + display label formatting while ensuring the label is formatted properly for Synapse annotations. + Returns: + Annotations are added to entities in Synapse, no return. + """ + # Format annotations for Synapse + annos = self.format_row_annotations( + dmge, row, entityId, hideBlanks, annotation_keys + ) + + if annos: + # Store annotations for an entity folder + self.syn.set_annotations(annos) + return + def _create_entity_id(self, idx, row, manifest, datasetId): """Helper function to generate an entityId and add it to the appropriate row in the manifest. 
Args: @@ -1683,45 +1675,8 @@ def _create_entity_id(self, idx, row, manifest, datasetId): manifest.loc[idx, "entityId"] = entityId return manifest, entityId - async def _process_store_annos(self, requests: Set[asyncio.Task]) -> None: - """Process annotations and store them on synapse asynchronously - - Args: - requests (Set[asyncio.Task]): a set of tasks of formatting annotations created by format_row_annotations function in previous step - - Raises: - RuntimeError: raise a run time error if a task failed to complete - """ - while requests: - done_tasks, pending_tasks = await asyncio.wait( - requests, return_when=asyncio.FIRST_COMPLETED - ) - requests = pending_tasks - - for completed_task in done_tasks: - try: - annos = completed_task.result() - - if isinstance(annos, Annotations): - annos_dict = asdict(annos) - entity_id = annos_dict["id"] - logger.info(f"Successfully stored annotations for {entity_id}") - else: - entity_id = annos["EntityId"] - logger.info( - f"Obtained and processed annotations for {entity_id} entity" - ) - if annos: - requests.add( - asyncio.create_task( - self.store_async_annotation(annotation_dict=annos) - ) - ) - except Exception as e: - raise RuntimeError(f"failed with { repr(e) }.") from e - @tracer.start_as_current_span("SynapseStorage::add_annotations_to_entities_files") - async def add_annotations_to_entities_files( + def add_annotations_to_entities_files( self, dmge, manifest, @@ -1762,7 +1717,6 @@ async def add_annotations_to_entities_files( ).drop("entityId_x", axis=1) # Fill `entityId` for each row if missing and annotate entity as appropriate - requests = set() for idx, row in manifest.iterrows(): if not row["entityId"] and ( manifest_record_type == "file_and_entities" @@ -1782,14 +1736,8 @@ async def add_annotations_to_entities_files( # Adding annotations to connected files. if entityId: - # Format annotations for Synapse - annos_task = asyncio.create_task( - self.format_row_annotations( - dmge, row, entityId, hideBlanks, annotation_keys - ) - ) - requests.add(annos_task) - await self._process_store_annos(requests) + self._add_annotations(dmge, row, entityId, hideBlanks, annotation_keys) + logger.info(f"Added annotations to entity: {entityId}") return manifest @tracer.start_as_current_span("SynapseStorage::upload_manifest_as_table") @@ -1843,16 +1791,14 @@ def upload_manifest_as_table( ) if file_annotations_upload: - manifest = asyncio.run( - self.add_annotations_to_entities_files( - dmge, - manifest, - manifest_record_type, - datasetId, - hideBlanks, - manifest_synapse_table_id, - annotation_keys, - ) + manifest = self.add_annotations_to_entities_files( + dmge, + manifest, + manifest_record_type, + datasetId, + hideBlanks, + manifest_synapse_table_id, + annotation_keys, ) # Load manifest to synapse as a CSV File manifest_synapse_file_id = self.upload_manifest_file( @@ -1919,15 +1865,13 @@ def upload_manifest_as_csv( manifest_synapse_file_id (str): SynID of manifest csv uploaded to synapse. 
""" if file_annotations_upload: - manifest = asyncio.run( - self.add_annotations_to_entities_files( - dmge, - manifest, - manifest_record_type, - datasetId, - hideBlanks, - annotation_keys=annotation_keys, - ) + manifest = self.add_annotations_to_entities_files( + dmge, + manifest, + manifest_record_type, + datasetId, + hideBlanks, + annotation_keys=annotation_keys, ) # Load manifest to synapse as a CSV File @@ -1999,16 +1943,14 @@ def upload_manifest_combo( ) if file_annotations_upload: - manifest = asyncio.run( - self.add_annotations_to_entities_files( - dmge, - manifest, - manifest_record_type, - datasetId, - hideBlanks, - manifest_synapse_table_id, - annotation_keys=annotation_keys, - ) + manifest = self.add_annotations_to_entities_files( + dmge, + manifest, + manifest_record_type, + datasetId, + hideBlanks, + manifest_synapse_table_id, + annotation_keys=annotation_keys, ) # Load manifest to synapse as a CSV File diff --git a/tests/test_store.py b/tests/test_store.py index cd5f4385b..5ac61f3d0 100644 --- a/tests/test_store.py +++ b/tests/test_store.py @@ -9,7 +9,6 @@ from typing import Generator, Any from unittest.mock import patch import shutil -import asyncio import pandas as pd import pytest @@ -25,8 +24,6 @@ from schematic.store.base import BaseStorage from schematic.store.synapse import DatasetFileView, ManifestDownload, SynapseStorage from schematic.utils.general import check_synapse_cache_size -from unittest.mock import AsyncMock -from synapseclient.models import Annotations logging.basicConfig(level=logging.DEBUG) logger = logging.getLogger(__name__) @@ -481,158 +478,6 @@ def test_get_files_metadata_from_dataset(self, synapse_store): "entityId": ["syn123", "syn456"], } - async def test_get_async_annotation(self, synapse_store: SynapseStorage) -> None: - """test get annotation async function""" - mock_syn_id = "syn1234" - - with patch( - "schematic.store.synapse.get_entity_id_bundle2", - new_callable=AsyncMock, - return_value="mock", - ) as mock_get_entity_id_bundle2: - mock_get_entity_id_bundle2.return_value = "mock" - result = await synapse_store.get_async_annotation(synapse_id=mock_syn_id) - - mock_get_entity_id_bundle2.assert_called_once_with( - entity_id=mock_syn_id, - request={"includeAnnotations": True}, - synapse_client=synapse_store.syn, - ) - assert result == "mock" - - async def test_store_async_annotation(self, synapse_store: SynapseStorage) -> None: - """test store annotations async function""" - annos_dict = { - "annotations": { - "id": "mock_syn_id", - "etag": "mock etag", - "annotations": { - "Id": {"type": "STRING", "value": ["mock value"]}, - "EntityId": {"type": "STRING", "value": ["mock_syn_id"]}, - "SampleID": {"type": "STRING", "value": [""]}, - "Component": {"type": "STRING", "value": ["mock value"]}, - }, - }, - "FileFormat": "mock format", - "Component": "mock component", - "Id": "mock_string", - "EntityId": "mock_id", - } - expected_dict = Annotations( - annotations={ - "Id": ["mock_string"], - "EntityId": ["mock_syn_id"], - "SampleID": [""], - "Component": ["mock value"], - "FileFormat": ["mock_format"], - }, - etag="mock etag", - id="mock syn_id", - ) - - with patch( - "schematic.store.synapse.Annotations.store_async", - new_callable=AsyncMock, - return_value=expected_dict, - ) as mock_store_async: - result = await synapse_store.store_async_annotation(annos_dict) - - mock_store_async.assert_called_once_with(synapse_store.syn) - assert result == expected_dict - assert isinstance(result, Annotations) - - async def test_process_store_annos_failure( - 
self, synapse_store: SynapseStorage - ) -> None: - """test _process_store_annos function when there's an error either getting or storing annotations""" - - async def mock_failure_coro(): - raise ValueError("sample error") - - # create tasks that will fail - tasks = set() - tasks.add(asyncio.create_task(mock_failure_coro())) - - synapse_store._process_store_annos - # make sure error message can be raised - with pytest.raises(RuntimeError, match="failed with"): - await synapse_store._process_store_annos(tasks) - - async def test_process_store_annos_success_store( - self, synapse_store: SynapseStorage - ) -> None: - """test _process_store_annos function and make sure that annotations can be stored after successfully getting annotations.""" - # mock annotation obtained after async_store - stored_annos = Annotations( - annotations={ - "Id": ["mock_string"], - "EntityId": ["mock_syn_id"], - "SampleID": [""], - "Component": ["mock value"], - "FileFormat": ["mock_format"], - }, - etag="mock etag", - id="mock_syn_id", - ) - - async def mock_success_coro(): - return stored_annos - - with patch( - "schematic.store.synapse.SynapseStorage.store_async_annotation", - new_callable=AsyncMock, - ) as mock_store_async1: - tasks = set() - tasks.add(asyncio.create_task(mock_success_coro())) - await synapse_store._process_store_annos(tasks) - # make sure that the if statement is working - mock_store_async1.assert_not_called() - - async def test_process_store_annos_success_get( - self, synapse_store: SynapseStorage - ) -> None: - """test _process_store_annos function and make sure that task of storing annotations can be triggered""" - # mock annotation obtained after get_async - mock_annos_dict = { - "annotations": { - "id": "mock_syn_id", - "etag": "mock etag", - "annotations": { - "Id": {"type": "STRING", "value": ["mock value"]}, - "EntityId": {"type": "STRING", "value": ["mock_syn_id"]}, - "SampleID": {"type": "STRING", "value": [""]}, - "Component": {"type": "STRING", "value": ["mock value"]}, - }, - }, - "FileFormat": "mock format", - "Component": "mock component", - "Id": "mock_string", - "EntityId": "mock_id", - } - - mock_stored_annos = Annotations( - annotations={ - "Id": ["mock_string"], - "EntityId": ["mock_syn_id"], - }, - etag="mock etag", - id="mock_syn_id", - ) - - async def mock_success_coro(): - return mock_annos_dict - - # make sure that the else statement is working - new_tasks = set() - with patch( - "schematic.store.synapse.SynapseStorage.store_async_annotation", - new_callable=AsyncMock, - return_value=mock_stored_annos, - ) as mock_store_async2: - new_tasks.add(asyncio.create_task(mock_success_coro())) - await synapse_store._process_store_annos(new_tasks) - mock_store_async2.assert_called_once() - class TestDatasetFileView: def test_init(self, dataset_id, dataset_fileview, synapse_store): @@ -1086,7 +931,7 @@ class TestManifestUpload: ), ], ) - async def test_add_annotations_to_entities_files( + def test_add_annotations_to_entities_files( self, synapse_store: SynapseStorage, dmge: DataModelGraphExplorer, @@ -1106,49 +951,27 @@ async def test_add_annotations_to_entities_files( expected_filenames (list(str)): expected list of file names expected_entity_ids (list(str)): expected list of entity ids """ - - async def mock_format_row_annos(): - return - - async def mock_process_store_annos(requests): - return - with patch( "schematic.store.synapse.SynapseStorage.getFilesInStorageDataset", return_value=files_in_dataset, ): - with patch( - 
"schematic.store.synapse.SynapseStorage.format_row_annotations", - return_value=mock_format_row_annos, - new_callable=AsyncMock, - ) as mock_format_row: - with patch( - "schematic.store.synapse.SynapseStorage._process_store_annos", - return_value=mock_process_store_annos, - new_callable=AsyncMock, - ) as mock_process_store: - manifest_df = pd.DataFrame(original_manifest) - - new_df = await synapse_store.add_annotations_to_entities_files( - dmge, - manifest_df, - manifest_record_type="entity", - datasetId="mock id", - hideBlanks=True, - ) + manifest_df = pd.DataFrame(original_manifest) - file_names_lst = new_df["Filename"].tolist() - entity_ids_lst = new_df["entityId"].tolist() - - # test entityId and Id columns get added - assert "entityId" in new_df.columns - assert "Id" in new_df.columns - assert file_names_lst == expected_filenames - assert entity_ids_lst == expected_entity_ids + new_df = synapse_store.add_annotations_to_entities_files( + dmge, + manifest_df, + manifest_record_type="entity", + datasetId="mock id", + hideBlanks=True, + ) + file_names_lst = new_df["Filename"].tolist() + entity_ids_lst = new_df["entityId"].tolist() - # make sure async function gets called as expected - assert mock_format_row.call_count == len(expected_entity_ids) - assert mock_process_store.call_count == 1 + # test entityId and Id columns get added + assert "entityId" in new_df.columns + assert "Id" in new_df.columns + assert file_names_lst == expected_filenames + assert entity_ids_lst == expected_entity_ids @pytest.mark.parametrize( "mock_manifest_file_path", @@ -1232,14 +1055,9 @@ def test_upload_manifest_as_csv( hide_blanks: bool, restrict: bool, ) -> None: - async def mock_add_annotations_to_entities_files(): - return - with ( patch( - "schematic.store.synapse.SynapseStorage.add_annotations_to_entities_files", - return_value=mock_add_annotations_to_entities_files, - new_callable=AsyncMock, + "schematic.store.synapse.SynapseStorage.add_annotations_to_entities_files" ) as add_anno_mock, patch( "schematic.store.synapse.SynapseStorage.upload_manifest_file", @@ -1287,19 +1105,13 @@ def test_upload_manifest_as_table( manifest_record_type: str, ) -> None: mock_df = pd.DataFrame() - - async def mock_add_annotations_to_entities_files(): - return - with ( patch( "schematic.store.synapse.SynapseStorage.uploadDB", return_value=["mock_table_id", mock_df, "mock_table_manifest"], ) as update_db_mock, patch( - "schematic.store.synapse.SynapseStorage.add_annotations_to_entities_files", - return_value=mock_add_annotations_to_entities_files, - new_callable=AsyncMock, + "schematic.store.synapse.SynapseStorage.add_annotations_to_entities_files" ) as add_anno_mock, patch( "schematic.store.synapse.SynapseStorage.upload_manifest_file", @@ -1353,19 +1165,13 @@ def test_upload_manifest_combo( mock_df = pd.DataFrame() manifest_path = helpers.get_data_path("mock_manifests/test_BulkRNAseq.csv") manifest_df = helpers.get_data_frame(manifest_path) - - async def mock_add_annotations_to_entities_files(): - return - with ( patch( "schematic.store.synapse.SynapseStorage.uploadDB", return_value=["mock_table_id", mock_df, "mock_table_manifest"], ) as update_db_mock, patch( - "schematic.store.synapse.SynapseStorage.add_annotations_to_entities_files", - return_value=mock_add_annotations_to_entities_files, - new_callable=AsyncMock, + "schematic.store.synapse.SynapseStorage.add_annotations_to_entities_files" ) as add_anno_mock, patch( "schematic.store.synapse.SynapseStorage.upload_manifest_file", From 391b6d2de42afc10b80a5c4b1baedcd884636ae0 
Mon Sep 17 00:00:00 2001
From: linglp
Date: Fri, 28 Jun 2024 23:15:27 -0400
Subject: [PATCH 110/110] add argument for isort to work with black

---
 .pre-commit-config.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 8417f7295..16c72afc9 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -18,3 +18,4 @@ repos:
       - id: isort
         name: isort (python)
         files: ^(tests|schematic|schematic_api)/
+        args: ["--profile", "black", "--filter-files"]
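
The args added in PATCH 110/110 are what let the isort and black pre-commit hooks coexist: isort's "black" profile switches it to Black-compatible settings (parenthesised, vertical-hanging-indent import wrapping with trailing commas and an 88-character line length), and "--filter-files" makes isort apply its skip/exclude settings even to files that pre-commit passes explicitly. As a rough, illustrative sketch only (not output generated from this repository), an import that exceeds the 88-character limit would be wrapped in this style; the names below are reused from tests/test_store.py purely for illustration:

    # isort --profile black wraps long imports with parentheses,
    # one name per line, and a trailing comma, matching Black's style.
    from schematic.store.synapse import (
        DatasetFileView,
        ManifestDownload,
        SynapseStorage,
    )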