From daf7b2aabc4dd36d12d5df497476a44f6317d3e4 Mon Sep 17 00:00:00 2001 From: Bilal Date: Mon, 24 Jun 2024 15:14:53 +0100 Subject: [PATCH 01/14] first attempt --- common/db.py | 28 +++--- common/schemas/genome.graphql | 2 + common/utils.py | 26 ++++++ graphql_service/resolver/data_loaders.py | 8 ++ graphql_service/resolver/exceptions.py | 24 +++++ graphql_service/resolver/gene_model.py | 106 +++++++++++++++++++++-- 6 files changed, 177 insertions(+), 17 deletions(-) diff --git a/common/db.py b/common/db.py index 95b5a093..83235f26 100644 --- a/common/db.py +++ b/common/db.py @@ -17,6 +17,8 @@ import grpc from ensembl.production.metadata.grpc import ensembl_metadata_pb2_grpc +from common.utils import process_release_version + logger = logging.getLogger(__name__) @@ -33,26 +35,30 @@ def __init__(self, config): self.config = config self.mongo_client = MongoDbClient.connect_mongo(self.config) - def get_database_conn(self, grpc_model, uuid): + def get_database_conn(self, grpc_model, uuid, force_grpc=False): grpc_response = None chosen_db = self.config.get("mongo_default_db") # Try to connect to gRPC try: grpc_response = grpc_model.get_release_by_genome_uuid(uuid) except Exception as grpc_exp: - # chosen_db value will fall back to the default value, which is 'mongo_default_db' that is in the config # TODO: check why "except graphql.error.graphql_error.GraphQLError as grpc_exp:" didn't catch the error logger.debug( "[get_database_conn] Couldn't connect to gRPC Host: %s", grpc_exp ) - if grpc_response: - logger.debug("[get_database_conn] grpc_response: %s", grpc_response) - # replacing '.' with '_' to avoid - # "pymongo.errors.InvalidName: database names cannot contain the character '.'" error ¯\_(ツ)_/¯ - release_version = str(grpc_response.release_version).replace(".", "_") - logger.debug("[get_database_conn] release_version: %s", release_version) - chosen_db = "release_" + release_version + if force_grpc: + chosen_db = process_release_version(grpc_response) + else: + if grpc_response and grpc_response.release_version: + chosen_db = process_release_version(grpc_response) + else: + # chosen_db value will fall back to the default value, which is 'mongo_default_db' that is in the config + # if force_grpc is not True + logger.warning( + "[get_database_conn] Falling back to the default Mongo DB: '%s'", + chosen_db, + ) logger.debug("[get_database_conn] Connected to '%s' MongoDB", chosen_db) data_database_connection = self.mongo_client[chosen_db] @@ -68,8 +74,8 @@ def connect_mongo(config): password = config.get("mongo_password") client = pymongo.MongoClient( - host, - port, + host=host, + port=port, username=user, password=password, read_preference=pymongo.ReadPreference.SECONDARY_PREFERRED, diff --git a/common/schemas/genome.graphql b/common/schemas/genome.graphql index db86757c..214b9189 100644 --- a/common/schemas/genome.graphql +++ b/common/schemas/genome.graphql @@ -36,4 +36,6 @@ type Genome { parlance_name: String genome_tag: String is_reference: Boolean! +# region: Region! + assembly: Assembly } \ No newline at end of file diff --git a/common/utils.py b/common/utils.py index 0c8a81d2..b711e431 100644 --- a/common/utils.py +++ b/common/utils.py @@ -12,6 +12,10 @@ limitations under the License. """ +import logging + +logger = logging.getLogger(__name__) + def check_config_validity(config): mandatory_fields = [ @@ -30,6 +34,28 @@ def check_config_validity(config): ) +def process_release_version(grpc_response): + """ + Processes the release version from the gRPC response and formats it for use as a database name. + + This function extracts the release version from the provided gRPC response, replaces any dots ('.') + with underscores ('_') to avoid pymongo.errors.InvalidName errors, and returns a formatted database + name string. + + Args: + grpc_response: The gRPC response object containing the release version. + + Returns: + str: A formatted string suitable for use as a database name, prefixed with 'release_'. + """ + logger.debug("[get_database_conn] grpc_response: %s", grpc_response) + # replacing '.' with '_' to avoid + # "pymongo.errors.InvalidName: database names cannot contain the character '.'" error ¯\_(ツ)_/¯ + release_version = str(grpc_response.release_version).replace(".", "_") + logger.debug("[get_database_conn] release_version: %s", release_version) + return "release_" + release_version + + def get_ensembl_metadata_api_version(): """ Get the Metadata API tag from requirement.txt file diff --git a/graphql_service/resolver/data_loaders.py b/graphql_service/resolver/data_loaders.py index 2714ed03..f193b74c 100644 --- a/graphql_service/resolver/data_loaders.py +++ b/graphql_service/resolver/data_loaders.py @@ -35,6 +35,9 @@ def __init__(self, database_conn): batch_load_fn=self.batch_region_by_assembly_load ) self.organism_loader = DataLoader(batch_load_fn=self.batch_organism_load) + self.assembly_by_genome_loader = DataLoader( + batch_load_fn=self.batch_assembly_by_genome_load + ) self.assembly_by_organism_loader = DataLoader( batch_load_fn=self.batch_assembly_by_organism_load ) @@ -83,6 +86,11 @@ async def batch_organism_load(self, keys: List[str]) -> List[List]: data = await self.query_mongo(query=query, doc_type="organism") return self.collate_dataloader_output("organism_primary_key", keys, data) + async def batch_assembly_by_genome_load(self, keys: List[str]) -> List[List]: + query = {"type": "Assembly", "assembly_id": {"$in": keys}} + data = await self.query_mongo(query=query, doc_type="assembly") + return self.collate_dataloader_output("assembly", keys, data) + async def batch_assembly_by_organism_load(self, keys: List[str]) -> List[List]: query = {"type": "Assembly", "organism_foreign_key": {"$in": keys}} data = await self.query_mongo(query=query, doc_type="assembly") diff --git a/graphql_service/resolver/exceptions.py b/graphql_service/resolver/exceptions.py index 4f400b05..ddbbee2b 100644 --- a/graphql_service/resolver/exceptions.py +++ b/graphql_service/resolver/exceptions.py @@ -13,6 +13,7 @@ """ from typing import Optional, Dict +import grpc from graphql import GraphQLError @@ -141,6 +142,15 @@ def __init__(self, organism_id): super().__init__("organism", {"organism_id": organism_id}) +class AssembliesFromGenomeNotFound(FieldNotFoundError): + """ + Custom error to be raised if we can't find the assemblies for a genome + """ + + def __init__(self, assembly_id): + super().__init__("assemblies", {"assembly_id": assembly_id}) + + class AssembliesFromOrganismNotFound(FieldNotFoundError): """ Custom error to be raised if we can't find the assemblies for an organism @@ -211,3 +221,17 @@ def __init__(self, message: str): message: The error message describing the missing argument. """ super().__init__(message) + + +class FailedToConnectToGrpc(grpc.RpcError): + """ + Exception raised when there is gRPC connection issue. + """ + + def __init__(self, message: str): + """Initializes a FailedToConnectToGrpc instance. + + Args: + message: The error message describing the issue. + """ + super().__init__(message) diff --git a/graphql_service/resolver/gene_model.py b/graphql_service/resolver/gene_model.py index ae2ca472..346101a6 100644 --- a/graphql_service/resolver/gene_model.py +++ b/graphql_service/resolver/gene_model.py @@ -15,6 +15,7 @@ import logging from typing import Dict, Optional, List, Any +import grpc from ariadne import QueryType, ObjectType from graphql import GraphQLResolveInfo from pymongo.database import Database @@ -39,6 +40,8 @@ GenomeNotFoundError, MissingArgumentException, DatabaseNotFoundError, + AssembliesFromGenomeNotFound, + FailedToConnectToGrpc, ) @@ -59,6 +62,7 @@ ORGANISM_TYPE = ObjectType("Organism") SPECIES_TYPE = ObjectType("Species") TRANSCRIPT_PAGE_TYPE = ObjectType("TranscriptsPage") +GENOME_TYPE = ObjectType("Genome") @QUERY_TYPE.field("gene") @@ -570,6 +574,56 @@ async def resolve_region_from_slice( return regions[0] +# @GENOME_TYPE.field("assembly") +# async def resolve_assembly_from_genome_uuid( +# genome: Dict, info: GraphQLResolveInfo +# ) -> Optional[Dict]: +# "Fetch an assembly referenced by a genome" +# print("&&&&&&&&& BBBOOOOOOOOOOOM !!!!!! &&&&&&&&&&") +# print(f"*********^^^^^^^^ genome ---> {genome}") +# # data_loader = get_data_loader(info) +# # loader = data_loader.organism_loader +# # assemblies = await loader.load(key=genome["assembly"]) +# # # assemblies = await loader.load(key=genome["assembly"]) +# # if not assemblies: +# # raise AssembliesFromGenomeNotFound(genome["assembly"]) +# # return assemblies[0] +# +# return {"assembly": "yoyoyoyo assembly"} + + +@GENOME_TYPE.field("assembly") +async def fetch_assembly_data( + genome_uuid: str, assembly_id: str, info: GraphQLResolveInfo +): + # query = {"type": "Assembly", "assembly_id": assembly_id} + query = {"type": "Assembly"} + + # connection_db = get_db_conn(info) + # assembly_collection = connection_db["assembly"] + # connection_db = info.context["mongo_db_client"].mongo_client + set_db_conn_for_uuid(info, genome_uuid) + connection_db = get_db_conn(info) + assembly_collection = connection_db.mongo_db["assembly"] + logger.info( + "[fetch_assembly_data] Getting Assembly from DB: '%s'", + connection_db.name, + ) + # assembly = assembly_collection.find_one(query) + try: + assembly = assembly_collection.find_one(query) + except Exception as e: + logging.error("Exception: %s", e) + raise DatabaseNotFoundError(db_name=connection_db.name) + + # print(f"assembly ---> {list(assembly)}") + # print(f"len assembly ---> {len(list(assembly))}") + if not assembly: + raise AssemblyNotFoundError(assembly_id) + # return assembly + return {"assembly": "yoyoyoyo assembly"} + + @REGION_TYPE.field("assembly") async def resolve_assembly_from_region( region: Dict, info: GraphQLResolveInfo @@ -708,11 +762,49 @@ def resolve_genomes( result = grpc_model.get_genome_by_keyword( by_keyword.get("keyword"), by_keyword.get("release_version") ) - genomes = list(result) - if not genomes: - raise GenomeNotFoundError(by_keyword) - genomes = list(map(create_genome_response, genomes)) - return genomes + try: + genomes = list(result) + if not genomes: + raise GenomeNotFoundError(by_keyword) + genomes = [create_genome_response(genome, info) for genome in genomes] + return genomes + + except grpc.RpcError as e: + if e.code() == grpc.StatusCode.UNAVAILABLE: + msg = "Error: Failed to connect to the remote host. Connection refused." + logger.error(msg) + raise FailedToConnectToGrpc(msg) + else: + # Print the default error message for any other gRPC errors + msg = f"gRPC Error: {e.details()}" + logger.error(msg) + raise FailedToConnectToGrpc(msg) + + # print(f"$$$$$$$$$$ genome ---> {genomes}") + + # print("===========================") + # # print(f"{[genome['genome_id'] for genome in genomes]}") + # print(f"{genomes[0]['genome_id']}") + # print("===========================") + # + # for genome in genomes: + # query = {"genome_id": genome["genome_id"]} + # set_db_conn_for_uuid(info, genome["genome_id"]) + # connection_db = get_db_conn(info) + # region_collection = connection_db["region"] + # logger.info( + # "[resolve_genomes] Getting Region from DB: '%s'", connection_db.name + # ) + # + # result = region_collection.find_one(query) + # print("===========================") + # # print(f"{[genome['genome_id'] for genome in genomes]}") + # print(f"{result}") + # print("===========================") + + # return genomes + # print((f"*********^^^^^^^^ genomes[0] ---> {genomes[0]}")) + # return result if by_assembly_accession_id: result = grpc_model.get_genome_by_assembly_acc_id( @@ -741,7 +833,8 @@ def resolve_genome(_, info: GraphQLResolveInfo, by_genome_uuid: Dict[str, str]) return genomes -def create_genome_response(genome): +def create_genome_response(genome, info): + print(f"*********^^^^^^^^ genome ---> {genome}") response = { "genome_id": genome.genome_uuid, "assembly_accession": genome.assembly.accession, @@ -752,6 +845,7 @@ def create_genome_response(genome): "parlance_name": genome.organism.scientific_parlance_name, "genome_tag": genome.assembly.url_name if not None else genome.assembly.tol_id, "is_reference": genome.assembly.is_reference, + "assembly": fetch_assembly_data(genome.genome_uuid, genome.assembly.name, info), } return response From 8d6f18cd7c140160ddc462dc5e25e964be59f89e Mon Sep 17 00:00:00 2001 From: Bilal Date: Tue, 25 Jun 2024 10:01:20 +0100 Subject: [PATCH 02/14] clean up and refactor the newely added feature's code --- common/schemas/genome.graphql | 1 - graphql_service/resolver/exceptions.py | 11 ++ graphql_service/resolver/gene_model.py | 144 +++++++------------------ 3 files changed, 52 insertions(+), 104 deletions(-) diff --git a/common/schemas/genome.graphql b/common/schemas/genome.graphql index 214b9189..61c9a883 100644 --- a/common/schemas/genome.graphql +++ b/common/schemas/genome.graphql @@ -36,6 +36,5 @@ type Genome { parlance_name: String genome_tag: String is_reference: Boolean! -# region: Region! assembly: Assembly } \ No newline at end of file diff --git a/graphql_service/resolver/exceptions.py b/graphql_service/resolver/exceptions.py index ddbbee2b..4eacd1d6 100644 --- a/graphql_service/resolver/exceptions.py +++ b/graphql_service/resolver/exceptions.py @@ -28,6 +28,17 @@ def __init__(self, db_name: str): super().__init__(message, extensions=self.extensions) +class CollectionNotFoundError(GraphQLError): + """ + Custom error to be raised if collection is not found + """ + + def __init__(self, collection_name: str): + self.extensions = {"code": f"COLLECTION_NOT_FOUND"} + message = f"Failed to find collection: {collection_name}" + super().__init__(message, extensions=self.extensions) + + class FieldNotFoundError(GraphQLError): """ Custom error to be raised if a field cannot be found by id diff --git a/graphql_service/resolver/gene_model.py b/graphql_service/resolver/gene_model.py index 346101a6..88906ced 100644 --- a/graphql_service/resolver/gene_model.py +++ b/graphql_service/resolver/gene_model.py @@ -13,12 +13,11 @@ """ import configparser import logging -from typing import Dict, Optional, List, Any +from typing import Dict, Optional, List, Any, Mapping -import grpc from ariadne import QueryType, ObjectType from graphql import GraphQLResolveInfo -from pymongo.database import Database +from pymongo.database import Database, Collection from graphql_service.resolver.data_loaders import BatchLoaders @@ -40,8 +39,7 @@ GenomeNotFoundError, MissingArgumentException, DatabaseNotFoundError, - AssembliesFromGenomeNotFound, - FailedToConnectToGrpc, + CollectionNotFoundError, ) @@ -574,56 +572,6 @@ async def resolve_region_from_slice( return regions[0] -# @GENOME_TYPE.field("assembly") -# async def resolve_assembly_from_genome_uuid( -# genome: Dict, info: GraphQLResolveInfo -# ) -> Optional[Dict]: -# "Fetch an assembly referenced by a genome" -# print("&&&&&&&&& BBBOOOOOOOOOOOM !!!!!! &&&&&&&&&&") -# print(f"*********^^^^^^^^ genome ---> {genome}") -# # data_loader = get_data_loader(info) -# # loader = data_loader.organism_loader -# # assemblies = await loader.load(key=genome["assembly"]) -# # # assemblies = await loader.load(key=genome["assembly"]) -# # if not assemblies: -# # raise AssembliesFromGenomeNotFound(genome["assembly"]) -# # return assemblies[0] -# -# return {"assembly": "yoyoyoyo assembly"} - - -@GENOME_TYPE.field("assembly") -async def fetch_assembly_data( - genome_uuid: str, assembly_id: str, info: GraphQLResolveInfo -): - # query = {"type": "Assembly", "assembly_id": assembly_id} - query = {"type": "Assembly"} - - # connection_db = get_db_conn(info) - # assembly_collection = connection_db["assembly"] - # connection_db = info.context["mongo_db_client"].mongo_client - set_db_conn_for_uuid(info, genome_uuid) - connection_db = get_db_conn(info) - assembly_collection = connection_db.mongo_db["assembly"] - logger.info( - "[fetch_assembly_data] Getting Assembly from DB: '%s'", - connection_db.name, - ) - # assembly = assembly_collection.find_one(query) - try: - assembly = assembly_collection.find_one(query) - except Exception as e: - logging.error("Exception: %s", e) - raise DatabaseNotFoundError(db_name=connection_db.name) - - # print(f"assembly ---> {list(assembly)}") - # print(f"len assembly ---> {len(list(assembly))}") - if not assembly: - raise AssemblyNotFoundError(assembly_id) - # return assembly - return {"assembly": "yoyoyoyo assembly"} - - @REGION_TYPE.field("assembly") async def resolve_assembly_from_region( region: Dict, info: GraphQLResolveInfo @@ -762,49 +710,12 @@ def resolve_genomes( result = grpc_model.get_genome_by_keyword( by_keyword.get("keyword"), by_keyword.get("release_version") ) - try: - genomes = list(result) - if not genomes: - raise GenomeNotFoundError(by_keyword) - genomes = [create_genome_response(genome, info) for genome in genomes] - return genomes - - except grpc.RpcError as e: - if e.code() == grpc.StatusCode.UNAVAILABLE: - msg = "Error: Failed to connect to the remote host. Connection refused." - logger.error(msg) - raise FailedToConnectToGrpc(msg) - else: - # Print the default error message for any other gRPC errors - msg = f"gRPC Error: {e.details()}" - logger.error(msg) - raise FailedToConnectToGrpc(msg) - - # print(f"$$$$$$$$$$ genome ---> {genomes}") - - # print("===========================") - # # print(f"{[genome['genome_id'] for genome in genomes]}") - # print(f"{genomes[0]['genome_id']}") - # print("===========================") - # - # for genome in genomes: - # query = {"genome_id": genome["genome_id"]} - # set_db_conn_for_uuid(info, genome["genome_id"]) - # connection_db = get_db_conn(info) - # region_collection = connection_db["region"] - # logger.info( - # "[resolve_genomes] Getting Region from DB: '%s'", connection_db.name - # ) - # - # result = region_collection.find_one(query) - # print("===========================") - # # print(f"{[genome['genome_id'] for genome in genomes]}") - # print(f"{result}") - # print("===========================") - - # return genomes - # print((f"*********^^^^^^^^ genomes[0] ---> {genomes[0]}")) - # return result + genomes = list(result) + if not genomes: + raise GenomeNotFoundError(by_keyword) + + # Fetch assembly data and combine it with genome data + return fetch_and_combine_genome_data(info, genomes) if by_assembly_accession_id: result = grpc_model.get_genome_by_assembly_acc_id( @@ -813,8 +724,9 @@ def resolve_genomes( genomes = list(result) if not genomes: raise GenomeNotFoundError(by_assembly_accession_id) - genomes = list(map(create_genome_response, genomes)) - return genomes + + # Fetch assembly data and combine it with genome data + return fetch_and_combine_genome_data(info, genomes) return [] @@ -833,8 +745,7 @@ def resolve_genome(_, info: GraphQLResolveInfo, by_genome_uuid: Dict[str, str]) return genomes -def create_genome_response(genome, info): - print(f"*********^^^^^^^^ genome ---> {genome}") +def create_genome_response(genome, assembly_data=None): response = { "genome_id": genome.genome_uuid, "assembly_accession": genome.assembly.accession, @@ -845,11 +756,38 @@ def create_genome_response(genome, info): "parlance_name": genome.organism.scientific_parlance_name, "genome_tag": genome.assembly.url_name if not None else genome.assembly.tol_id, "is_reference": genome.assembly.is_reference, - "assembly": fetch_assembly_data(genome.genome_uuid, genome.assembly.name, info), + "assembly": assembly_data, } return response +def fetch_assembly_data(assembly_collection: Collection, assembly_id: str) -> Mapping: + query = {"assembly_id": assembly_id} + try: + assembly = assembly_collection.find_one(query) + except Exception as e: + logging.error("Exception: %s", e) + raise CollectionNotFoundError(collection_name=assembly_collection.name) + + if not assembly: + raise AssemblyNotFoundError(assembly_id) + return assembly + + +def fetch_and_combine_genome_data(info: GraphQLResolveInfo, genomes: List) -> List: + combined_results = [] + for genome in genomes: + set_db_conn_for_uuid(info, genome.genome_uuid) + connection_db = get_db_conn(info) + # logging.debug("Collections in the database:", connection_db.list_collection_names()) + assembly_collection = connection_db["assembly"] + # logging.debug("assembly_collection.name:", assembly_collection.name) + + assembly_data = fetch_assembly_data(assembly_collection, genome.assembly.name) + combined_results.append(create_genome_response(genome, assembly_data)) + return combined_results + + def get_version_details() -> Dict[str, str]: """ Fetch version details from a 'version_config.ini' file. From b8732ea137cb4c6a11fc276ba38bb3ed7468cf19 Mon Sep 17 00:00:00 2001 From: Bilal Date: Tue, 25 Jun 2024 13:51:15 +0100 Subject: [PATCH 03/14] optimise the script by checking if assembly related data is requested or not --- graphql_service/resolver/gene_model.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/graphql_service/resolver/gene_model.py b/graphql_service/resolver/gene_model.py index 88906ced..c2ba1902 100644 --- a/graphql_service/resolver/gene_model.py +++ b/graphql_service/resolver/gene_model.py @@ -775,6 +775,12 @@ def fetch_assembly_data(assembly_collection: Collection, assembly_id: str) -> Ma def fetch_and_combine_genome_data(info: GraphQLResolveInfo, genomes: List) -> List: + # Check if the assembly field is requested in the query + requested_fields = [ + field.name.value for field in info.field_nodes[0].selection_set.selections + ] + is_assembly_prensent = "assembly" in requested_fields + combined_results = [] for genome in genomes: set_db_conn_for_uuid(info, genome.genome_uuid) @@ -782,9 +788,17 @@ def fetch_and_combine_genome_data(info: GraphQLResolveInfo, genomes: List) -> Li # logging.debug("Collections in the database:", connection_db.list_collection_names()) assembly_collection = connection_db["assembly"] # logging.debug("assembly_collection.name:", assembly_collection.name) - - assembly_data = fetch_assembly_data(assembly_collection, genome.assembly.name) - combined_results.append(create_genome_response(genome, assembly_data)) + logging.debug("is_assembly_prensent:", is_assembly_prensent) + + if not is_assembly_prensent: + # Don't bother getting the assembly data if it's not requested in the query + # TODO: See if this can be approved + combined_results.append(create_genome_response(genome, None)) + else: + assembly_data = fetch_assembly_data( + assembly_collection, genome.assembly.name + ) + combined_results.append(create_genome_response(genome, assembly_data)) return combined_results From 0a50ce4b736bef39623ba6f9836119e4635d4a9a Mon Sep 17 00:00:00 2001 From: Bilal Date: Tue, 25 Jun 2024 14:06:26 +0100 Subject: [PATCH 04/14] fix tests --- .../tests/snapshots/snap_test_gene_name_retrieval.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/graphql_service/tests/snapshots/snap_test_gene_name_retrieval.py b/graphql_service/tests/snapshots/snap_test_gene_name_retrieval.py index 87f6deb9..ca01b9be 100644 --- a/graphql_service/tests/snapshots/snap_test_gene_name_retrieval.py +++ b/graphql_service/tests/snapshots/snap_test_gene_name_retrieval.py @@ -27,7 +27,7 @@ "name": { "accession_id": "A0A1D5TR86", "source": None, - "url": "https://purl.uniprot.org/uniprot/A0A1D5TR86", + "url": "http://purl.uniprot.org/uniprot/A0A1D5TR86", "value": "Sulfotransferase [Source:UniProtKB/TrEMBL;Acc:A0A1D5TR86]", } }, @@ -67,7 +67,7 @@ "release": None, "url": "https://www.uniprot.org/", }, - "url": "https://purl.uniprot.org/uniprot/A0A1D5TR86", + "url": "http://purl.uniprot.org/uniprot/A0A1D5TR86", "value": None, } }, From f8d588542fd6510d024f4f8af7dffd40a064d87a Mon Sep 17 00:00:00 2001 From: Bilal Date: Tue, 25 Jun 2024 14:22:46 +0100 Subject: [PATCH 05/14] fix pylint --- graphql_service/resolver/exceptions.py | 4 ++-- graphql_service/resolver/gene_model.py | 27 +++++++++++++------------- graphql_service/server.py | 2 +- 3 files changed, 17 insertions(+), 16 deletions(-) diff --git a/graphql_service/resolver/exceptions.py b/graphql_service/resolver/exceptions.py index 4eacd1d6..aeed2081 100644 --- a/graphql_service/resolver/exceptions.py +++ b/graphql_service/resolver/exceptions.py @@ -23,7 +23,7 @@ class DatabaseNotFoundError(GraphQLError): """ def __init__(self, db_name: str): - self.extensions = {"code": f"DATABASE_NOT_FOUND"} + self.extensions = {"code": "DATABASE_NOT_FOUND"} message = f"Failed to find database: {db_name}" super().__init__(message, extensions=self.extensions) @@ -34,7 +34,7 @@ class CollectionNotFoundError(GraphQLError): """ def __init__(self, collection_name: str): - self.extensions = {"code": f"COLLECTION_NOT_FOUND"} + self.extensions = {"code": "COLLECTION_NOT_FOUND"} message = f"Failed to find collection: {collection_name}" super().__init__(message, extensions=self.extensions) diff --git a/graphql_service/resolver/gene_model.py b/graphql_service/resolver/gene_model.py index c2ba1902..d88a6f22 100644 --- a/graphql_service/resolver/gene_model.py +++ b/graphql_service/resolver/gene_model.py @@ -99,9 +99,9 @@ def resolve_gene( logger.info("[resolve_gene] Getting Gene from DB: '%s'", connection_db.name) try: result = gene_collection.find_one(query) - except Exception as e: - logging.error("Exception: %s", e) - raise DatabaseNotFoundError(db_name=connection_db.name) + except Exception as db_exp: + logging.error("Exception: %s", db_exp) + raise (DatabaseNotFoundError(db_name=connection_db.name)) from db_exp if not result: raise GeneNotFoundError(by_id=by_id) @@ -126,9 +126,9 @@ def resolve_genes(_, info: GraphQLResolveInfo, by_symbol: Dict[str, str]) -> Lis try: result = gene_collection.find(query) - except Exception as e: - logging.error("Exception: %s", e) - raise DatabaseNotFoundError(db_name=connection_db.name) + except Exception as db_exp: + logging.error("Exception: %s", db_exp) + raise (DatabaseNotFoundError(db_name=connection_db.name)) from db_exp # unpack cursor into a list. We're guaranteed relatively small results result = list(result) @@ -247,9 +247,9 @@ def resolve_transcript( try: transcript = transcript_collection.find_one(query) - except Exception as e: - logging.error("Exception: %s", e) - raise DatabaseNotFoundError(db_name=connection_db.name) + except Exception as db_exp: + logging.error("Exception: %s", db_exp) + raise (DatabaseNotFoundError(db_name=connection_db.name)) from db_exp if not transcript: raise TranscriptNotFoundError(by_symbol=by_symbol, by_id=by_id) @@ -765,9 +765,11 @@ def fetch_assembly_data(assembly_collection: Collection, assembly_id: str) -> Ma query = {"assembly_id": assembly_id} try: assembly = assembly_collection.find_one(query) - except Exception as e: - logging.error("Exception: %s", e) - raise CollectionNotFoundError(collection_name=assembly_collection.name) + except Exception as coll_exp: + logging.error("Exception: %s", coll_exp) + raise ( + CollectionNotFoundError(collection_name=assembly_collection.name) + ) from coll_exp if not assembly: raise AssemblyNotFoundError(assembly_id) @@ -788,7 +790,6 @@ def fetch_and_combine_genome_data(info: GraphQLResolveInfo, genomes: List) -> Li # logging.debug("Collections in the database:", connection_db.list_collection_names()) assembly_collection = connection_db["assembly"] # logging.debug("assembly_collection.name:", assembly_collection.name) - logging.debug("is_assembly_prensent:", is_assembly_prensent) if not is_assembly_prensent: # Don't bother getting the assembly data if it's not requested in the query diff --git a/graphql_service/server.py b/graphql_service/server.py index d02277bc..e6e6a317 100644 --- a/graphql_service/server.py +++ b/graphql_service/server.py @@ -25,13 +25,13 @@ from starlette import applications, middleware from starlette.middleware.cors import CORSMiddleware +from dotenv import load_dotenv from common import crossrefs, db, extensions, utils, logger from grpc_service import grpc_model from graphql_service.ariadne_app import ( prepare_executable_schema, prepare_context_provider, ) -from dotenv import load_dotenv load_dotenv("connections.conf") From 7fb9dadb3ab2c9046cda90fcce6e3eb42819db55 Mon Sep 17 00:00:00 2001 From: Bilal Date: Tue, 25 Jun 2024 14:29:36 +0100 Subject: [PATCH 06/14] remove uneccessary code --- graphql_service/resolver/data_loaders.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/graphql_service/resolver/data_loaders.py b/graphql_service/resolver/data_loaders.py index f193b74c..2714ed03 100644 --- a/graphql_service/resolver/data_loaders.py +++ b/graphql_service/resolver/data_loaders.py @@ -35,9 +35,6 @@ def __init__(self, database_conn): batch_load_fn=self.batch_region_by_assembly_load ) self.organism_loader = DataLoader(batch_load_fn=self.batch_organism_load) - self.assembly_by_genome_loader = DataLoader( - batch_load_fn=self.batch_assembly_by_genome_load - ) self.assembly_by_organism_loader = DataLoader( batch_load_fn=self.batch_assembly_by_organism_load ) @@ -86,11 +83,6 @@ async def batch_organism_load(self, keys: List[str]) -> List[List]: data = await self.query_mongo(query=query, doc_type="organism") return self.collate_dataloader_output("organism_primary_key", keys, data) - async def batch_assembly_by_genome_load(self, keys: List[str]) -> List[List]: - query = {"type": "Assembly", "assembly_id": {"$in": keys}} - data = await self.query_mongo(query=query, doc_type="assembly") - return self.collate_dataloader_output("assembly", keys, data) - async def batch_assembly_by_organism_load(self, keys: List[str]) -> List[List]: query = {"type": "Assembly", "organism_foreign_key": {"$in": keys}} data = await self.query_mongo(query=query, doc_type="assembly") From b11e627eb57235ccd952a8c9b9a4e61494f58133 Mon Sep 17 00:00:00 2001 From: Bilal Date: Mon, 8 Jul 2024 16:27:44 +0100 Subject: [PATCH 07/14] add release_date --- README.md | 11 ++++++++--- common/schemas/genome.graphql | 1 + graphql_service/resolver/gene_model.py | 3 ++- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 31069005..8e038a99 100644 --- a/README.md +++ b/README.md @@ -21,11 +21,16 @@ To install dependencies, run: Rename example_connections.conf to connections.conf and update the config values accordingly. This command will start the server: +``` +uvicorn --workers 1 --host=0.0.0.0 graphql_service.server:APP +``` -```uvicorn --workers 1 --host=0.0.0.0 graphql_service.server:APP``` - +To run a Uvicorn server with automatic reload for development purposes, you can use the --reload flag. This flag will make Uvicorn watch your code for changes and automatically restart the server when it detects any changes. +``` +uvicorn --workers 1 --host 0.0.0.0 --reload graphql_service.server:APP +``` -If you're developing in PyCharm, you will probably find it useful to create a run +Also, if you're developing in PyCharm, you will probably find it useful to create a run configuration so that you can use the debugger. Create a run configuration that looks like this: diff --git a/common/schemas/genome.graphql b/common/schemas/genome.graphql index 61c9a883..56ac5b1f 100644 --- a/common/schemas/genome.graphql +++ b/common/schemas/genome.graphql @@ -31,6 +31,7 @@ type Genome { assembly_accession: String! scientific_name: String! release_number: Float! + release_date: String! taxon_id: Int! tol_id: String parlance_name: String diff --git a/graphql_service/resolver/gene_model.py b/graphql_service/resolver/gene_model.py index d88a6f22..0add8705 100644 --- a/graphql_service/resolver/gene_model.py +++ b/graphql_service/resolver/gene_model.py @@ -751,6 +751,7 @@ def create_genome_response(genome, assembly_data=None): "assembly_accession": genome.assembly.accession, "scientific_name": genome.organism.scientific_name, "release_number": genome.release.release_version, + "release_date": genome.release.release_date, "taxon_id": genome.taxon.taxonomy_id, "tol_id": genome.assembly.tol_id, "parlance_name": genome.organism.scientific_parlance_name, @@ -793,7 +794,7 @@ def fetch_and_combine_genome_data(info: GraphQLResolveInfo, genomes: List) -> Li if not is_assembly_prensent: # Don't bother getting the assembly data if it's not requested in the query - # TODO: See if this can be approved + # TODO: See if this can be improved combined_results.append(create_genome_response(genome, None)) else: assembly_data = fetch_assembly_data( From 1404283031edfd71552835a7a26ecb7880d200ec Mon Sep 17 00:00:00 2001 From: Bilal Date: Mon, 15 Jul 2024 18:19:40 +0100 Subject: [PATCH 08/14] Alter by_keyword arguments to be more specific --- .gitignore | 1 + common/schemas/query.graphql | 17 +++- graphql_service/resolver/gene_model.py | 133 +++++++++++++------------ grpc_service/grpc_model.py | 51 ++++++---- 4 files changed, 119 insertions(+), 83 deletions(-) diff --git a/.gitignore b/.gitignore index 4486dc0a..a9c4459b 100644 --- a/.gitignore +++ b/.gitignore @@ -64,6 +64,7 @@ venv/ ENV/ env.bak/ venv.bak/ +node_modules/ # editor cruft .vscode diff --git a/common/schemas/query.graphql b/common/schemas/query.graphql index c02aa987..cf58bfcf 100644 --- a/common/schemas/query.graphql +++ b/common/schemas/query.graphql @@ -18,8 +18,8 @@ type Query { by_slice: SliceInput): Locus region(by_name: RegionNameInput!): Region - genomes(by_keyword: GenomeByKeywordInput, - by_assembly_accession_id: AssemblyAccessionIDInput): [Genome] + + genomes(by_keyword: GenomeBySpecificKeywordInput): [Genome] genome(by_genome_uuid: GenomeUUIDInput!): Genome @@ -58,9 +58,16 @@ input GenomeUUIDInput { release_version: Float } -input GenomeByKeywordInput { - keyword: String! - release_version: Float +input GenomeBySpecificKeywordInput{ + tolid: String + assembly_accession_id: String + assembly_name: String + ensembl_name: String + common_name: String + scientific_name: String + scientific_parlance_name: String + species_taxonomy_id: String + release_version: Float } input AssemblyAccessionIDInput { diff --git a/graphql_service/resolver/gene_model.py b/graphql_service/resolver/gene_model.py index 0add8705..80dd2bef 100644 --- a/graphql_service/resolver/gene_model.py +++ b/graphql_service/resolver/gene_model.py @@ -16,7 +16,7 @@ from typing import Dict, Optional, List, Any, Mapping from ariadne import QueryType, ObjectType -from graphql import GraphQLResolveInfo +from graphql import GraphQLResolveInfo, GraphQLError from pymongo.database import Database, Collection from graphql_service.resolver.data_loaders import BatchLoaders @@ -686,47 +686,85 @@ async def resolve_region(_, info: GraphQLResolveInfo, by_name: Dict[str, str]) - return result +def fetch_genome_and_combine(info, grpc_model, by_keyword, key): + """ + Fetches genomes by a specific keyword and combines genome data with assembly data if requested. + + Args: + info (GraphQLResolveInfo): The GraphQL resolver information containing the field nodes and other query details. + grpc_model: The gRPC model to fetch genome data. + by_keyword (dict): Dictionary containing the keyword to search genomes by. + key (str): The specific key to use for fetching genomes. + + Returns: + List: A list of combined genome and assembly data objects. If assembly data is not requested, only genome data is included. + + Raises: + GenomeNotFoundError: If no genomes are found for the given keyword. + """ + result = grpc_model.get_genome_by_specific_keyword( + **{key: by_keyword.get(key)}, + release_version=by_keyword.get("release_version"), + ) + genomes = list(result) + if not genomes: + raise GenomeNotFoundError(by_keyword) + + # Check if the assembly field is requested in the query + requested_fields = [ + field.name.value for field in info.field_nodes[0].selection_set.selections + ] + is_assembly_prensent = "assembly" in requested_fields + + combined_results = [] + for genome in genomes: + set_db_conn_for_uuid(info, genome.genome_uuid) + connection_db = get_db_conn(info) + # logging.debug("Collections in the database:", connection_db.list_collection_names()) + assembly_collection = connection_db["assembly"] + # logging.debug("assembly_collection.name:", assembly_collection.name) + + if not is_assembly_prensent: + # Don't bother getting the assembly data if it's not requested in the query + # TODO: See if this can be improved + combined_results.append(create_genome_response(genome, None)) + else: + assembly_data = fetch_assembly_data( + assembly_collection, genome.assembly.name + ) + combined_results.append(create_genome_response(genome, assembly_data)) + return combined_results + + @QUERY_TYPE.field("genomes") def resolve_genomes( - _, - info: GraphQLResolveInfo, - by_keyword: Optional[Dict[str, str]] = None, - by_assembly_accession_id: Optional[Dict[str, str]] = None, + _, info: GraphQLResolveInfo, by_keyword: Optional[Dict[str, str]] = None ) -> List: - # in case the user provides both arguments or none - if sum(map(bool, [by_keyword, by_assembly_accession_id])) != 1: - # ask them to provide at least one argument - if not by_keyword and not by_assembly_accession_id: - raise MissingArgumentException( - "You must provide either 'by_keyword' or 'by_assembly_accession_id' argument." - ) - # or in case they provided both, ask them to provide one only - raise InputFieldArgumentNumberError(1) + # ask them to provide at least one argument + if not by_keyword: + raise MissingArgumentException("You must provide 'by_keyword' argument.") + + # Check if exactly one field is provided + provided_count = sum(1 for value in by_keyword.values() if value) + if provided_count != 1: + raise GraphQLError("Exactly one of the fields must be provided") grpc_model = info.context["grpc_model"] if by_keyword: - result = grpc_model.get_genome_by_keyword( - by_keyword.get("keyword"), by_keyword.get("release_version") - ) - genomes = list(result) - if not genomes: - raise GenomeNotFoundError(by_keyword) - - # Fetch assembly data and combine it with genome data - return fetch_and_combine_genome_data(info, genomes) - - if by_assembly_accession_id: - result = grpc_model.get_genome_by_assembly_acc_id( - by_assembly_accession_id.get("assembly_accession_id") - ) - genomes = list(result) - if not genomes: - raise GenomeNotFoundError(by_assembly_accession_id) - - # Fetch assembly data and combine it with genome data - return fetch_and_combine_genome_data(info, genomes) + for key in [ + "tolid", + "assembly_accession_id", + "assembly_name", + "ensembl_name", + "common_name", + "scientific_name", + "scientific_parlance_name", + "species_taxonomy_id", + ]: + if by_keyword.get(key): + return fetch_genome_and_combine(info, grpc_model, by_keyword, key) return [] @@ -777,33 +815,6 @@ def fetch_assembly_data(assembly_collection: Collection, assembly_id: str) -> Ma return assembly -def fetch_and_combine_genome_data(info: GraphQLResolveInfo, genomes: List) -> List: - # Check if the assembly field is requested in the query - requested_fields = [ - field.name.value for field in info.field_nodes[0].selection_set.selections - ] - is_assembly_prensent = "assembly" in requested_fields - - combined_results = [] - for genome in genomes: - set_db_conn_for_uuid(info, genome.genome_uuid) - connection_db = get_db_conn(info) - # logging.debug("Collections in the database:", connection_db.list_collection_names()) - assembly_collection = connection_db["assembly"] - # logging.debug("assembly_collection.name:", assembly_collection.name) - - if not is_assembly_prensent: - # Don't bother getting the assembly data if it's not requested in the query - # TODO: See if this can be improved - combined_results.append(create_genome_response(genome, None)) - else: - assembly_data = fetch_assembly_data( - assembly_collection, genome.assembly.name - ) - combined_results.append(create_genome_response(genome, assembly_data)) - return combined_results - - def get_version_details() -> Dict[str, str]: """ Fetch version details from a 'version_config.ini' file. diff --git a/grpc_service/grpc_model.py b/grpc_service/grpc_model.py index ff2f59a9..9ff3a887 100644 --- a/grpc_service/grpc_model.py +++ b/grpc_service/grpc_model.py @@ -21,27 +21,44 @@ def get_genome_by_genome_uuid(self, genome_uuid, release_version=None): response = self.grpc_stub.GetGenomeByUUID(request) return response - def get_genome_by_keyword(self, keyword, release_version=None): + def get_genome_by_specific_keyword( + self, + tolid=None, + assembly_accession_id=None, + assembly_name=None, + ensembl_name=None, + common_name=None, + scientific_name=None, + scientific_parlance_name=None, + species_taxonomy_id=None, + release_version=None, + ): logger.debug( - "Received RPC for GetGenomesByKeyword with keyword: '%s', release: %s", - keyword, - release_version, - ) - request = ensembl_metadata_pb2.GenomeByKeywordRequest( - keyword=keyword, release_version=release_version - ) - response = self.grpc_stub.GetGenomesByKeyword(request) - return response - - def get_genome_by_assembly_acc_id(self, assembly_accession_id): - logger.debug( - "Received RPC for GetGenomesByAssemblyAccessionID with assembly_accession_id: '%s'", + "Received RPC for GetGenomesBySpecificKeyword with tolid: '%s', assembly_accession_id: '%s', " + "assembly_name: '%s', ensembl_name: '%s', common_name: '%s', scientific_name: '%s', " + "scientific_parlance_name: '%s', species_taxonomy_id: '%s', release: %s", + tolid, assembly_accession_id, + assembly_name, + ensembl_name, + common_name, + scientific_name, + scientific_parlance_name, + species_taxonomy_id, + release_version, ) - request = ensembl_metadata_pb2.AssemblyAccessionIDRequest( - assembly_accession=assembly_accession_id + request = ensembl_metadata_pb2.GenomeBySpecificKeywordRequest( + tolid=tolid, + assembly_accession_id=assembly_accession_id, + assembly_name=assembly_name, + ensembl_name=ensembl_name, + common_name=common_name, + scientific_name=scientific_name, + scientific_parlance_name=scientific_parlance_name, + species_taxonomy_id=species_taxonomy_id, + release_version=release_version, ) - response = self.grpc_stub.GetGenomesByAssemblyAccessionID(request) + response = self.grpc_stub.GetGenomesBySpecificKeyword(request) return response def get_release_by_genome_uuid(self, genome_uuid): From 2b7d980cb0d825e5d2070eb2059305cfade515f7 Mon Sep 17 00:00:00 2001 From: Bilal Date: Wed, 17 Jul 2024 11:52:24 +0100 Subject: [PATCH 09/14] include dataset info in 'genomes' and 'genome' queries --- common/schemas/dataset.graphql | 11 +++++ common/schemas/genome.graphql | 1 + graphql_service/resolver/gene_model.py | 65 +++++++++++++++++++++----- grpc_service/grpc_model.py | 12 +++++ 4 files changed, 77 insertions(+), 12 deletions(-) create mode 100644 common/schemas/dataset.graphql diff --git a/common/schemas/dataset.graphql b/common/schemas/dataset.graphql new file mode 100644 index 00000000..cfe7de66 --- /dev/null +++ b/common/schemas/dataset.graphql @@ -0,0 +1,11 @@ +type Dataset { + dataset_id: String! + name: String! + release: Float! + type: String! + source: String! + dataset_type: String! + version: String + release_date: String! + release_type: String! +} \ No newline at end of file diff --git a/common/schemas/genome.graphql b/common/schemas/genome.graphql index 56ac5b1f..27df17f9 100644 --- a/common/schemas/genome.graphql +++ b/common/schemas/genome.graphql @@ -38,4 +38,5 @@ type Genome { genome_tag: String is_reference: Boolean! assembly: Assembly + dataset: [Dataset] } \ No newline at end of file diff --git a/graphql_service/resolver/gene_model.py b/graphql_service/resolver/gene_model.py index 80dd2bef..ad272ff9 100644 --- a/graphql_service/resolver/gene_model.py +++ b/graphql_service/resolver/gene_model.py @@ -16,6 +16,7 @@ from typing import Dict, Optional, List, Any, Mapping from ariadne import QueryType, ObjectType +from ensembl.production.metadata.api.models import Genome from graphql import GraphQLResolveInfo, GraphQLError from pymongo.database import Database, Collection @@ -41,7 +42,7 @@ DatabaseNotFoundError, CollectionNotFoundError, ) - +from grpc_service.grpc_model import GRPC_MODEL logger = logging.getLogger(__name__) @@ -686,7 +687,9 @@ async def resolve_region(_, info: GraphQLResolveInfo, by_name: Dict[str, str]) - return result -def fetch_genome_and_combine(info, grpc_model, by_keyword, key): +def fetch_genome_and_combine( + info: GraphQLResolveInfo, grpc_model: GRPC_MODEL, by_keyword: Dict, key: str +) -> List: """ Fetches genomes by a specific keyword and combines genome data with assembly data if requested. @@ -702,6 +705,7 @@ def fetch_genome_and_combine(info, grpc_model, by_keyword, key): Raises: GenomeNotFoundError: If no genomes are found for the given keyword. """ + # Fetch genomes data from metadata using gRPC result = grpc_model.get_genome_by_specific_keyword( **{key: by_keyword.get(key)}, release_version=by_keyword.get("release_version"), @@ -710,11 +714,12 @@ def fetch_genome_and_combine(info, grpc_model, by_keyword, key): if not genomes: raise GenomeNotFoundError(by_keyword) - # Check if the assembly field is requested in the query requested_fields = [ field.name.value for field in info.field_nodes[0].selection_set.selections ] - is_assembly_prensent = "assembly" in requested_fields + # Check if the assembly and/or dataset fields are requested in the query + is_assembly_present = "assembly" in requested_fields + is_dataset_present = "dataset" in requested_fields combined_results = [] for genome in genomes: @@ -724,15 +729,17 @@ def fetch_genome_and_combine(info, grpc_model, by_keyword, key): assembly_collection = connection_db["assembly"] # logging.debug("assembly_collection.name:", assembly_collection.name) - if not is_assembly_prensent: - # Don't bother getting the assembly data if it's not requested in the query - # TODO: See if this can be improved - combined_results.append(create_genome_response(genome, None)) - else: + assembly_data = None + dataset_data = None + if is_assembly_present: assembly_data = fetch_assembly_data( assembly_collection, genome.assembly.name ) - combined_results.append(create_genome_response(genome, assembly_data)) + if is_dataset_present: + dataset_data = fetch_dataset_data(grpc_model, genome.genome_uuid) + combined_results.append( + create_genome_response(genome, assembly_data, dataset_data) + ) return combined_results @@ -779,11 +786,38 @@ def resolve_genome(_, info: GraphQLResolveInfo, by_genome_uuid: Dict[str, str]) ) if not genome.genome_uuid: raise GenomeNotFoundError(by_genome_uuid) - genomes = create_genome_response(genome) + + # fetch dataset info + dataset_data = fetch_dataset_data(grpc_model, genome.genome_uuid) + genomes = create_genome_response( + genome=genome, assembly_data=None, dataset_data=dataset_data + ) return genomes -def create_genome_response(genome, assembly_data=None): +def create_genome_response( + genome: Genome, + assembly_data: Optional[Dict[str, Any]] = None, + dataset_data: Optional[List] = None, +) -> Dict: + + datasets_response = [] + if dataset_data: + for dataset in dataset_data: + datasets_response.append( + { + "dataset_id": dataset.dataset_uuid, + "name": dataset.dataset_label, + "version": dataset.dataset_version, + "release": dataset.release_version, + "type": dataset.dataset_type_topic, + "source": dataset.dataset_source_type, + "dataset_type": dataset.dataset_type_name, + "release_date": dataset.release_date, + "release_type": dataset.release_type, + } + ) + response = { "genome_id": genome.genome_uuid, "assembly_accession": genome.assembly.accession, @@ -796,6 +830,7 @@ def create_genome_response(genome, assembly_data=None): "genome_tag": genome.assembly.url_name if not None else genome.assembly.tol_id, "is_reference": genome.assembly.is_reference, "assembly": assembly_data, + "dataset": datasets_response, } return response @@ -815,6 +850,12 @@ def fetch_assembly_data(assembly_collection: Collection, assembly_id: str) -> Ma return assembly +def fetch_dataset_data(grpc_model: GRPC_MODEL, genome_uuid: str) -> List: + result = grpc_model.get_datasets_list_by_uuid(genome_uuid) + datasets = list(result.datasets) + return datasets + + def get_version_details() -> Dict[str, str]: """ Fetch version details from a 'version_config.ini' file. diff --git a/grpc_service/grpc_model.py b/grpc_service/grpc_model.py index 9ff3a887..85c56d7e 100644 --- a/grpc_service/grpc_model.py +++ b/grpc_service/grpc_model.py @@ -61,6 +61,18 @@ def get_genome_by_specific_keyword( response = self.grpc_stub.GetGenomesBySpecificKeyword(request) return response + def get_datasets_list_by_uuid(self, genome_uuid, release_version=None): + logger.debug( + "Received RPC for GetDatasetsListByUUID with genome_uuid: '%s', release: %s", + genome_uuid, + release_version, + ) + request = ensembl_metadata_pb2.DatasetsRequest( + genome_uuid=genome_uuid, release_version=release_version + ) + response = self.grpc_stub.GetDatasetsListByUUID(request) + return response + def get_release_by_genome_uuid(self, genome_uuid): request = ensembl_metadata_pb2.ReleaseVersionRequest(genome_uuid=genome_uuid) response = self.grpc_stub.GetReleaseVersionByUUID(request) From 97c15329d375c36ec915aa0199a7316b2365680c Mon Sep 17 00:00:00 2001 From: Bilal Date: Wed, 17 Jul 2024 13:46:44 +0100 Subject: [PATCH 10/14] Improve the code logic --- common/utils.py | 24 ++++ graphql_service/resolver/gene_model.py | 161 ++++++++++++++++--------- 2 files changed, 126 insertions(+), 59 deletions(-) diff --git a/common/utils.py b/common/utils.py index b711e431..f3f93e8c 100644 --- a/common/utils.py +++ b/common/utils.py @@ -13,6 +13,9 @@ """ import logging +from typing import List + +from graphql import GraphQLResolveInfo logger = logging.getLogger(__name__) @@ -69,3 +72,24 @@ def get_ensembl_metadata_api_version(): version = line.strip().split("@")[-1] break return version + + +def check_requested_fields(info: GraphQLResolveInfo, fields: List[str]) -> List[bool]: + """ + Check if specific fields are requested in the GraphQL query. + + Args: + info (ResolveInfo): The GraphQL resolve information containing query details. + fields (List[str]): A list of field names to check for in the query. + + Returns: + List[bool]: A list of booleans indicating whether each field is present in the query. + + Usage example: + fields_to_check = ["assembly", "dataset"] + is_assembly_present, is_dataset_present = check_requested_fields(info, fields_to_check) + """ + requested_fields = [ + field.name.value for field in info.field_nodes[0].selection_set.selections + ] + return [field in requested_fields for field in fields] diff --git a/graphql_service/resolver/gene_model.py b/graphql_service/resolver/gene_model.py index ad272ff9..f4cb5a68 100644 --- a/graphql_service/resolver/gene_model.py +++ b/graphql_service/resolver/gene_model.py @@ -20,6 +20,7 @@ from graphql import GraphQLResolveInfo, GraphQLError from pymongo.database import Database, Collection +from common import utils from graphql_service.resolver.data_loaders import BatchLoaders from graphql_service.resolver.exceptions import ( @@ -687,68 +688,30 @@ async def resolve_region(_, info: GraphQLResolveInfo, by_name: Dict[str, str]) - return result -def fetch_genome_and_combine( - info: GraphQLResolveInfo, grpc_model: GRPC_MODEL, by_keyword: Dict, key: str +@QUERY_TYPE.field("genomes") +def resolve_genomes( + _, info: GraphQLResolveInfo, by_keyword: Dict[str, str] = None ) -> List: """ - Fetches genomes by a specific keyword and combines genome data with assembly data if requested. + Resolve the genomes based on provided keyword arguments. + Under the hood, this resolver might execute and combine 3 different queries based on the requested data: + - The default `get_genome_by_specific_keyword()` gRPC call (Metadata DB) + - If `assembly` is requested, `fetch_assembly_data()` is triggered fetching data from Mongo DB + - If `dataset` is requested, `fetch_dataset_data()` is triggered which triggers `get_datasets_list_by_uuid()` + gRPC call to fetch dataset info (Metadata DB) Args: - info (GraphQLResolveInfo): The GraphQL resolver information containing the field nodes and other query details. - grpc_model: The gRPC model to fetch genome data. - by_keyword (dict): Dictionary containing the keyword to search genomes by. - key (str): The specific key to use for fetching genomes. + info (GraphQLResolveInfo): GraphQL resolve information containing query details. + by_keyword (Dict[str, str]): Dictionary containing keyword arguments for fetching genomes. Returns: - List: A list of combined genome and assembly data objects. If assembly data is not requested, only genome data is included. + List: A list of genomes matching the provided keyword. Raises: - GenomeNotFoundError: If no genomes are found for the given keyword. + MissingArgumentException: If 'by_keyword' argument is not provided. + GraphQLError: If not exactly one field in 'by_keyword' is provided. + GenomeNotFoundError: If no genomes are found matching the provided keyword. """ - # Fetch genomes data from metadata using gRPC - result = grpc_model.get_genome_by_specific_keyword( - **{key: by_keyword.get(key)}, - release_version=by_keyword.get("release_version"), - ) - genomes = list(result) - if not genomes: - raise GenomeNotFoundError(by_keyword) - - requested_fields = [ - field.name.value for field in info.field_nodes[0].selection_set.selections - ] - # Check if the assembly and/or dataset fields are requested in the query - is_assembly_present = "assembly" in requested_fields - is_dataset_present = "dataset" in requested_fields - - combined_results = [] - for genome in genomes: - set_db_conn_for_uuid(info, genome.genome_uuid) - connection_db = get_db_conn(info) - # logging.debug("Collections in the database:", connection_db.list_collection_names()) - assembly_collection = connection_db["assembly"] - # logging.debug("assembly_collection.name:", assembly_collection.name) - - assembly_data = None - dataset_data = None - if is_assembly_present: - assembly_data = fetch_assembly_data( - assembly_collection, genome.assembly.name - ) - if is_dataset_present: - dataset_data = fetch_dataset_data(grpc_model, genome.genome_uuid) - combined_results.append( - create_genome_response(genome, assembly_data, dataset_data) - ) - return combined_results - - -@QUERY_TYPE.field("genomes") -def resolve_genomes( - _, info: GraphQLResolveInfo, by_keyword: Optional[Dict[str, str]] = None -) -> List: - - # ask them to provide at least one argument if not by_keyword: raise MissingArgumentException("You must provide 'by_keyword' argument.") @@ -770,8 +733,48 @@ def resolve_genomes( "scientific_parlance_name", "species_taxonomy_id", ]: + # if one of the keys is provided if by_keyword.get(key): - return fetch_genome_and_combine(info, grpc_model, by_keyword, key) + # Fetch genomes data from metadata using gRPC + result = grpc_model.get_genome_by_specific_keyword( + **{key: by_keyword.get(key)}, + release_version=by_keyword.get("release_version"), + ) + genomes = list(result) + + if not genomes: + raise GenomeNotFoundError(by_keyword) + + # Check if the assembly and dataset fields are requested in the query + fields_to_check = ["assembly", "dataset"] + is_assembly_present, is_dataset_present = utils.check_requested_fields( + info, fields_to_check + ) + + combined_results = [] + for genome in genomes: + set_db_conn_for_uuid(info, genome.genome_uuid) + connection_db = get_db_conn(info) + # logging.debug("Collections in the database:", connection_db.list_collection_names()) + assembly_collection = connection_db["assembly"] + # logging.debug("assembly_collection.name:", assembly_collection.name) + + assembly_data = ( + fetch_assembly_data(assembly_collection, genome.assembly.name) + if is_assembly_present + else None + ) + dataset_data = ( + fetch_dataset_data(grpc_model, genome.genome_uuid) + if is_dataset_present + else None + ) + + combined_results.append( + create_genome_response(genome, dataset_data, assembly_data) + ) + + return combined_results return [] @@ -787,20 +790,36 @@ def resolve_genome(_, info: GraphQLResolveInfo, by_genome_uuid: Dict[str, str]) if not genome.genome_uuid: raise GenomeNotFoundError(by_genome_uuid) - # fetch dataset info - dataset_data = fetch_dataset_data(grpc_model, genome.genome_uuid) - genomes = create_genome_response( - genome=genome, assembly_data=None, dataset_data=dataset_data + # Check if the dataset fields is requested in the query + fields_to_check = ["dataset"] + is_dataset_present = utils.check_requested_fields(info, fields_to_check) + + dataset_data = ( + fetch_dataset_data(grpc_model, genome.genome_uuid) + if is_dataset_present + else None ) + + genomes = create_genome_response(genome=genome, dataset_data=dataset_data) return genomes def create_genome_response( genome: Genome, - assembly_data: Optional[Dict[str, Any]] = None, dataset_data: Optional[List] = None, + assembly_data: Optional[Dict[str, Any]] = None, ) -> Dict: + """ + Create a response dictionary for a genome with optional assembly and dataset data. + + Args: + genome (Genome): The genome object containing genome-related information. + assembly_data (Optional[Dict[str, Any]]): Optional dictionary containing assembly data. + dataset_data (Optional[List]): Optional list of dataset objects containing dataset information. + Returns: + Dict: A dictionary containing the genome response data. + """ datasets_response = [] if dataset_data: for dataset in dataset_data: @@ -836,6 +855,20 @@ def create_genome_response( def fetch_assembly_data(assembly_collection: Collection, assembly_id: str) -> Mapping: + """ + Fetch assembly data from a collection using the assembly ID. + + Args: + assembly_collection (Collection): The collection to search for the assembly data. + assembly_id (str): The ID of the assembly to fetch. + + Returns: + Mapping: The assembly data if found. + + Raises: + CollectionNotFoundError: If there is an issue accessing the collection. + AssemblyNotFoundError: If the assembly with the given ID is not found. + """ query = {"assembly_id": assembly_id} try: assembly = assembly_collection.find_one(query) @@ -851,6 +884,16 @@ def fetch_assembly_data(assembly_collection: Collection, assembly_id: str) -> Ma def fetch_dataset_data(grpc_model: GRPC_MODEL, genome_uuid: str) -> List: + """ + Fetch dataset data using a gRPC model based on the genome UUID. + + Args: + grpc_model (GRPC_MODEL): The gRPC model to use for fetching the dataset data. + genome_uuid (str): The UUID of the genome for which to fetch dataset data. + + Returns: + List: A list of datasets associated with the given genome UUID. + """ result = grpc_model.get_datasets_list_by_uuid(genome_uuid) datasets = list(result.datasets) return datasets From b9425726fa77e9880199c4bc6744f2371de84a96 Mon Sep 17 00:00:00 2001 From: Bilal Date: Thu, 18 Jul 2024 09:34:15 +0100 Subject: [PATCH 11/14] Made symbol param optional in genes and transcript resolvers (EA-1228) --- common/schemas/query.graphql | 2 +- graphql_service/resolver/gene_model.py | 10 +++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/common/schemas/query.graphql b/common/schemas/query.graphql index cf58bfcf..dcefe71c 100644 --- a/common/schemas/query.graphql +++ b/common/schemas/query.graphql @@ -26,7 +26,7 @@ type Query { } input SymbolInput { - symbol: String! + symbol: String genome_id: String! } diff --git a/graphql_service/resolver/gene_model.py b/graphql_service/resolver/gene_model.py index f4cb5a68..ad7cf025 100644 --- a/graphql_service/resolver/gene_model.py +++ b/graphql_service/resolver/gene_model.py @@ -113,12 +113,16 @@ def resolve_gene( @QUERY_TYPE.field("genes") def resolve_genes(_, info: GraphQLResolveInfo, by_symbol: Dict[str, str]) -> List: - "Load Genes via potentially ambiguous symbol" + """ + Load Genes via potentially ambiguous symbol + Or + If no Symbol is specified, get all related genes (this feature might be removed later) + """ query = { "genome_id": by_symbol["genome_id"], "type": "Gene", - "symbol": by_symbol["symbol"], + "symbol": by_symbol.get("symbol"), # this makes symbol optional } set_db_conn_for_uuid(info, by_symbol["genome_id"]) @@ -227,7 +231,7 @@ def resolve_transcript( query: Dict[str, Any] = {"type": "Transcript"} genome_id = None if by_symbol: - query["symbol"] = by_symbol["symbol"] + query["symbol"] = by_symbol.get("symbol") # this makes symbol optional query["genome_id"] = by_symbol["genome_id"] genome_id = by_symbol["genome_id"] if by_id: From bc49c44b1947ca85ca0192f7a01806cbb3908d86 Mon Sep 17 00:00:00 2001 From: Bilal Date: Fri, 19 Jul 2024 15:53:16 +0100 Subject: [PATCH 12/14] fix mypy errors --- common/utils.py | 15 ++++++++++----- graphql_service/resolver/gene_model.py | 4 ++-- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/common/utils.py b/common/utils.py index f3f93e8c..70b28603 100644 --- a/common/utils.py +++ b/common/utils.py @@ -15,7 +15,7 @@ import logging from typing import List -from graphql import GraphQLResolveInfo +from graphql import GraphQLResolveInfo, FieldNode logger = logging.getLogger(__name__) @@ -79,7 +79,7 @@ def check_requested_fields(info: GraphQLResolveInfo, fields: List[str]) -> List[ Check if specific fields are requested in the GraphQL query. Args: - info (ResolveInfo): The GraphQL resolve information containing query details. + info (GraphQLResolveInfo): The GraphQL resolve information containing query details. fields (List[str]): A list of field names to check for in the query. Returns: @@ -89,7 +89,12 @@ def check_requested_fields(info: GraphQLResolveInfo, fields: List[str]) -> List[ fields_to_check = ["assembly", "dataset"] is_assembly_present, is_dataset_present = check_requested_fields(info, fields_to_check) """ - requested_fields = [ - field.name.value for field in info.field_nodes[0].selection_set.selections - ] + requested_fields = [] + if info.field_nodes: + selection_set = info.field_nodes[0].selection_set + if selection_set and selection_set.selections: + for field in selection_set.selections: + if isinstance(field, FieldNode) and field.name and field.name.value: + requested_fields.append(field.name.value) + return [field in requested_fields for field in fields] diff --git a/graphql_service/resolver/gene_model.py b/graphql_service/resolver/gene_model.py index 496c846b..829793fd 100644 --- a/graphql_service/resolver/gene_model.py +++ b/graphql_service/resolver/gene_model.py @@ -538,7 +538,7 @@ def resolve_product_by_id( @PRODUCT_TYPE.field("product_generating_context") -def resolve_pgc_for_product(product: Dict, info: GraphQLResolveInfo) -> Dict: +def resolve_pgc_for_product(product: Dict, info: GraphQLResolveInfo) -> Optional[Dict]: pipeline = [ { "$match": { @@ -856,7 +856,7 @@ def resolve_genome(_, info: GraphQLResolveInfo, by_genome_uuid: Dict[str, str]) def create_genome_response( genome: Genome, dataset_data: Optional[List] = None, - assembly_data: Optional[Dict[str, Any]] = None, + assembly_data: Optional[Mapping[Any, Any]] = None, ) -> Dict: """ Create a response dictionary for a genome with optional assembly and dataset data. From 04760e4a7a453cca3a8e3d5fba0b59cef5ed81fe Mon Sep 17 00:00:00 2001 From: Bilal Date: Fri, 19 Jul 2024 15:54:22 +0100 Subject: [PATCH 13/14] add mypy to github actions --- .github/workflows/tests.yml | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index f21e7573..375e8e84 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -1,5 +1,5 @@ # Workflow name -name: PyTest, Black and Pylint +name: PyTest, Black, Pylint and Mypy # Controls when the workflow will run on: @@ -17,7 +17,7 @@ jobs: # This workflow contains a single job called "tests" tests: # The type of runner that the job will run on and timeout in minutes - name: Run Python Tests, Black formatter and Pylint + name: Run Python Tests, Black formatter, Pylint and Mypy runs-on: ubuntu-latest timeout-minutes: 10 @@ -32,7 +32,7 @@ jobs: - name: Check out repository code uses: actions/checkout@v3 - # Set up Python version from the matrix + # Set up Python version from the matrix - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v4 with: @@ -59,3 +59,8 @@ jobs: - name: Run Pylint run: | pylint $(git ls-files '*.py') --fail-under=9.5 + + # Run Mypy + - name: Run Mypy type checker + run: | + mypy graphql_service From 1274e8cac8368001e4b7f3592fb5c7f3808322dd Mon Sep 17 00:00:00 2001 From: Bilal Date: Mon, 22 Jul 2024 09:30:38 +0100 Subject: [PATCH 14/14] update metadata-api tag --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index e940b295..30c06c5c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,6 +5,6 @@ aiodataloader==0.2.1 ariadne==0.19.1 python-dotenv==0.19.2 uvicorn==0.18.1 -ensembl-metadata-api@git+https://github.com/Ensembl/ensembl-metadata-api.git@2.1.0a3 +ensembl-metadata-api@git+https://github.com/Ensembl/ensembl-metadata-api.git@2.2.0a1 grpcio==1.62.0 grpcio-tools==1.62.0