Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add region input queries #97

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions common/schemas/query.graphql
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ type Query {
start: Int @deprecated(reason: "Use `by_slice`"),
end: Int @deprecated(reason: "Use `by_slice`")
by_slice: SliceInput): Locus
region(by_name: RegionNameInput!): Region
regions(by_genome_id: GenomeIdInput!): [Region]
Copy link
Collaborator

@azangru azangru Oct 21, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What I realised we are missing from the CDM — if we allow queries like this — is a field with some kind of ranking information in Region.

Consider an example in which you are requesting all human chromosomes. You will likely want to display them in a specific order. There is no way for the client to know, for example, that human chromosome X goes after human chromosome 22, unless someone has manually curated the list of top-level regions, and the backend tells the client this order.

cc: @bethflint

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is that a CDM thing or a thing we want to support in our GraphQL service which is not part of CDM? And where does this data currently reside?

Copy link
Collaborator

@azangru azangru Oct 21, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It looks like a CDM thing to me. One needs to somehow represent the information that the first chromosome of Saccharomyces cerevisiae is chromosome I, or that human karyotype ends in chromosomes 22, X, Y, MT, in that order.

And where does this data currently reside?

🤷‍♂️

do you know @ens-st3 ?

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

MySQL [homo_sapiens_core_108_38]> select sr.name, sra.value from attrib_type at, seq_region_attrib sra, seq_region sr where at.attrib_type_id = sra.attrib_type_id and sra.seq_region_id = sr.seq_region_id and at.code = 'karyotype_rank' order by sra.value;
+------+-------+
| name | value |
+------+-------+
| 1 | 1 |
| 10 | 10 |
| 11 | 11 |
| 12 | 12 |
| 13 | 13 |
| 14 | 14 |
| 15 | 15 |
| 16 | 16 |
| 17 | 17 |
| 18 | 18 |
| 19 | 19 |
| 2 | 2 |
| 20 | 20 |
| 21 | 21 |
| 22 | 22 |
| X | 23 |
| Y | 24 |
| MT | 25 |
| 3 | 3 |
| 4 | 4 |
| 5 | 5 |
| 6 | 6 |
| 7 | 7 |
| 8 | 8 |
| 9 | 9 |
+------+-------+
25 rows in set (0.003 sec)

}

input SymbolInput {
Expand All @@ -39,4 +41,13 @@ type Locus {
# A dynamically defined part of a Region
genes: [Gene!]!
transcripts: [Transcript!]!
}

input RegionNameInput {
genome_id: String!
name: String!
}

input GenomeIdInput {
genome_id: String!
}
20 changes: 18 additions & 2 deletions graphql_service/resolver/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,13 +74,29 @@ def __init__(self, product_id: str, genome_id: str):
super().__init__("product", {"product_id": product_id, "genome_id": genome_id})


class RegionFromSliceNotFoundError(FieldNotFoundError):
"""Custom error to be raised if we can't find the region for a slice"""

def __init__(self, region_id: str):
super().__init__("region", {"region_id": region_id})


class RegionNotFoundError(FieldNotFoundError):
"""
Custom error to be raised if region is not found
"""

def __init__(self, region_id: str):
super().__init__("region", {"region_id": region_id})
def __init__(self, genome_id: str, name: str):
super().__init__("region", {"genome_id": genome_id, "name": name})


class RegionsNotFoundError(FieldNotFoundError):
"""
Custom error to be raised if regions are not found
"""

def __init__(self, genome_id: str):
super().__init__("regions", {"genome_id": genome_id})


class AssemblyNotFoundError(FieldNotFoundError):
Expand Down
38 changes: 36 additions & 2 deletions graphql_service/resolver/gene_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@
AssembliesFromOrganismNotFound,
SpeciesFromOrganismNotFound,
OrganismsFromSpeciesNotFound,
RegionsNotFoundError,
RegionFromSliceNotFoundError,
)

# Define Query types for GraphQL
Expand Down Expand Up @@ -372,7 +374,9 @@ async def resolve_product_by_pgc(pgc: Dict, info: GraphQLResolveInfo) -> Optiona


@SLICE_TYPE.field("region")
async def resolve_region(slc: Dict, info: GraphQLResolveInfo) -> Optional[Dict]:
async def resolve_region_from_slice(
slc: Dict, info: GraphQLResolveInfo
) -> Optional[Dict]:
"Fetch a region that is referenced by a slice"
if slc["region_id"] is None:
return None
Expand All @@ -383,7 +387,7 @@ async def resolve_region(slc: Dict, info: GraphQLResolveInfo) -> Optional[Dict]:
regions = await loader.load(key=region_id)

if not regions:
raise RegionNotFoundError(region_id)
raise RegionFromSliceNotFoundError(region_id=region_id)
return regions[0]


Expand Down Expand Up @@ -465,3 +469,33 @@ async def resolve_organisms_from_species(
if not organisms:
raise OrganismsFromSpeciesNotFound(species_id=species["species_primary_key"])
return organisms


@QUERY_TYPE.field("region")
async def resolve_region(_, info: GraphQLResolveInfo, by_name: Dict[str, str]) -> Dict:
query = {
"type": "Region",
"genome_id": by_name["genome_id"],
"name": by_name["name"],
}
collection = info.context["mongo_db"]
result = collection.find_one(query)
if not result:
raise RegionNotFoundError(genome_id=by_name["genome_id"], name=by_name["name"])
return result


@QUERY_TYPE.field("regions")
async def resolve_regions(
_, info: GraphQLResolveInfo, by_genome_id: Dict[str, str]
) -> List[Dict]:
query = {
"type": "Region",
"genome_id": by_genome_id["genome_id"],
"code": "chromosome", # We filter on chromosomes to avoid returning an overwhelming number of regions
Copy link
Collaborator

@azangru azangru Oct 21, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, do you have an overwhelming number of regions? Then this is likely another of our "long lists of things" cases, and we should give the client an option to request different kinds of regions. Possibly making it an optional query parameter, in the absence of which Thoas will respond with top-level regions.

By the way, not all organisms in our databases will have top-level regions that are chromosomes. @ens-st3 has chosen an organism (Chacoan peccary, I believe) whose genome has only been sequenced to a very provisional level (primary assemblies, is it?) for inclusion in Thoas as part of the next batch of species, to confirm that we are able to handle organisms like that. The code above won't be able to find top-level regions for such organisms.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was because it was mentioned that there would potentially be millions of regions for some assemblies. I think a decision needs to be made about how we handle pagination before we can handle those very large numbers.

Do we know how many non-chromosomal regions the assemblies we know we're going to be handling in the short and mid term are going to have?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There's an attribute in the "attrib_type" table called "Top level" with description "Top Level Non-Redundant Sequence Region" which looks like it could be a better candidate to filter on than "chromosome"

Copy link

@ens-st3 ens-st3 Oct 21, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

MySQL [catagonus_wagneri_core_108_2]> select count() from seq_region sr, seq_region_attrib sra, attrib_type at where sr.seq_region_id = sra.seq_region_id and sra.attrib_type_id = at.attrib_type_id and at.code = 'toplevel';
+----------+
| count(
) |
+----------+
| 1183730 |
+----------+

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note I've added it to sta-6 just in case we choose to support these now. If there are similar genomes in rapid, which is prob unlikely then we will need to, but if not it's our choice.
Another 'interesting' example is very long top level sequences, eg
MySQL [monodelphis_domestica_core_108_1]> select max(sr.length) from seq_region sr, seq_region_attrib sra, attrib_type at where sr.seq_region_id = sra.seq_region_id and sra.attrib_type_id = at.attrib_type_id and at.code = 'toplevel';
+----------------+
| max(sr.length) |
+----------------+
| 748055161 |
+----------------+

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Steve's first query shows that filtering on "toplevel" wouldn't product us from getting back an overwhelming number of regions, right?

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It'll exclude the constituent parts of assemblies (contigs, clones etc) which will help massively. There should never be any annotation on these - they should only ever be on toplevel regions - and I suggest there's no reason for them to be in Thoas.
Is 1.2M toplevel sequences overwhelming ?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure if 1.2M would be overwhelming. Users can already overwhelm the service if they want to by sending very deep queries. I agree with @bethflint , I think we should make a decision on pagination. That would give us a way to limit queries, and then we won't have to worry about them being too heavy.

}
collection = info.context["mongo_db"]
results = list(collection.find(query))
if not results:
raise RegionsNotFoundError(genome_id=by_genome_id["genome_id"])
return results
86 changes: 83 additions & 3 deletions graphql_service/resolver/tests/test_resolvers.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,11 +127,15 @@ def fixture_region_data():
"type": "Region",
"region_id": "plasmodium_falciparum_GCA_000002765_2_13",
"name": "13",
"genome_id": "plasmodium_falciparum_GCA_000002765_2",
"code": "chromosome",
},
{
"type": "Region",
"region_id": "plasmodium_falciparum_GCA_000002765_2_14",
"name": "14",
"genome_id": "plasmodium_falciparum_GCA_000002765_2",
"code": "chromosome",
},
]
)
Expand Down Expand Up @@ -475,7 +479,7 @@ async def test_resolve_region_happy_case(region_data):
"default": True,
}
info = create_info(region_data)
result = await model.resolve_region(slc, info)
result = await model.resolve_region_from_slice(slc, info)
assert result["region_id"] == "plasmodium_falciparum_GCA_000002765_2_13"


Expand All @@ -486,8 +490,8 @@ async def test_resolve_region_region_not_exist(region_data):
"region_id": "some_non_existing_region_id",
}
result = None
with pytest.raises(model.RegionNotFoundError) as region_error:
result = await model.resolve_region(slc, info)
with pytest.raises(model.RegionFromSliceNotFoundError) as region_error:
result = await model.resolve_region_from_slice(slc, info)
assert not result
assert region_error.value.extensions["region_id"] == "some_non_existing_region_id"

Expand Down Expand Up @@ -811,6 +815,82 @@ async def test_resolve_organisms_from_species_not_exists(genome_data):
assert organisms_not_found_error.value.extensions["species_id"] == "blah blah"


@pytest.mark.asyncio
async def test_resolve_region(region_data):
info = create_info(region_data)

result = await model.resolve_region(
None,
info,
by_name={"genome_id": "plasmodium_falciparum_GCA_000002765_2", "name": "14"},
)
assert remove_ids(result) == {
"genome_id": "plasmodium_falciparum_GCA_000002765_2",
"name": "14",
"region_id": "plasmodium_falciparum_GCA_000002765_2_14",
"code": "chromosome",
"type": "Region",
}


@pytest.mark.asyncio
async def test_resolve_region_no_results(region_data):
info = create_info(region_data)

result = None
with pytest.raises(model.RegionNotFoundError) as region_not_found_error:
result = await model.resolve_region(
None, info, by_name={"genome_id": "yeti_genome", "name": "14"}
)
assert not result
assert region_not_found_error.value.extensions == {
"code": "REGION_NOT_FOUND",
"genome_id": "yeti_genome",
"name": "14",
}


@pytest.mark.asyncio
async def test_resolve_regions(region_data):
info = create_info(region_data)

result = await model.resolve_regions(
None, info, by_genome_id={"genome_id": "plasmodium_falciparum_GCA_000002765_2"}
)
assert remove_ids(result) == [
{
"code": "chromosome",
"genome_id": "plasmodium_falciparum_GCA_000002765_2",
"name": "13",
"region_id": "plasmodium_falciparum_GCA_000002765_2_13",
"type": "Region",
},
{
"code": "chromosome",
"genome_id": "plasmodium_falciparum_GCA_000002765_2",
"name": "14",
"region_id": "plasmodium_falciparum_GCA_000002765_2_14",
"type": "Region",
},
]


@pytest.mark.asyncio
async def test_resolve_regions_no_result(region_data):
info = create_info(region_data)

result = None
with pytest.raises(model.RegionsNotFoundError) as regions_not_found_error:
result = await model.resolve_regions(
None, info, by_genome_id={"genome_id": "yeti_genome"}
)
assert not result
assert regions_not_found_error.value.extensions == {
"code": "REGIONS_NOT_FOUND",
"genome_id": "yeti_genome",
}


def remove_ids(test_output):
if isinstance(test_output, dict):
del test_output["_id"]
Expand Down
1 change: 1 addition & 0 deletions graphql_service/tests/fixtures/human_brca2.py
Original file line number Diff line number Diff line change
Expand Up @@ -319,6 +319,7 @@ def build_region():
return {
"type": "Region",
"region_id": "homo_sapiens_GCA_000001405_28_13_chromosome",
"genome_id": "homo_sapiens_GCA_000001405_28",
"name": "13",
"length": 114364328,
"code": "chromosome",
Expand Down
16 changes: 16 additions & 0 deletions graphql_service/tests/snapshots/snap_test_region_retrieval.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# -*- coding: utf-8 -*-
# snapshottest: v1 - https://goo.gl/zC4yUc
from __future__ import unicode_literals

from snapshottest import Snapshot


snapshots = Snapshot()

snapshots["test_region_retrieval_by_name 1"] = {
"region": {"code": "chromosome", "length": 114364328, "name": "13"}
}

snapshots["test_regions_retrieval_by_genome_id 1"] = {
"regions": [{"code": "chromosome", "length": 114364328, "name": "13"}]
}
56 changes: 56 additions & 0 deletions graphql_service/tests/test_region_retrieval.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
"""
See the NOTICE file distributed with this work for additional information
regarding copyright ownership.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

import pytest
from ariadne import graphql

from .snapshot_utils import setup_test, add_loaders_to_context

executable_schema, context = setup_test()


@pytest.mark.asyncio
async def test_region_retrieval_by_name(snapshot):
query = """{
region (by_name: {genome_id: "homo_sapiens_GCA_000001405_28", name: "13"}) {
name
code
length
}
}"""

query_data = {"query": query}
(success, result) = await graphql(
executable_schema, query_data, context_value=add_loaders_to_context(context)
)
assert success
snapshot.assert_match(result["data"])


@pytest.mark.asyncio
async def test_regions_retrieval_by_genome_id(snapshot):
query = """{
regions (by_genome_id: {genome_id: "homo_sapiens_GCA_000001405_28"}) {
name
code
length
}
}
"""
query_data = {"query": query}
(success, result) = await graphql(
executable_schema, query_data, context_value=add_loaders_to_context(context)
)
assert success
snapshot.assert_match(result["data"])