diff --git a/.gitignore b/.gitignore index ef0830c6..89405359 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,5 @@ vetur.config.js *.sql **.local** build/** +.idea/ +node_modules/ \ No newline at end of file diff --git a/nmdc_server/api.py b/nmdc_server/api.py index 4bc28fa2..7b6b7ad4 100644 --- a/nmdc_server/api.py +++ b/nmdc_server/api.py @@ -1,9 +1,14 @@ +from collections import Counter +from datetime import date from io import BytesIO from typing import Any, Dict, List, Optional from uuid import UUID +import bson.json_util from fastapi import APIRouter, Depends, Header, HTTPException, Response from fastapi.responses import JSONResponse +import json +from pymongo import MongoClient, DESCENDING from sqlalchemy.orm import Session from starlette.requests import Request from starlette.responses import RedirectResponse, StreamingResponse @@ -23,9 +28,13 @@ from nmdc_server.ingest.envo import nested_envo_trees from nmdc_server.models import IngestLock, SubmissionMetadata, User from nmdc_server.pagination import Pagination +from nmdc_server.query import Operation router = APIRouter() +BIOSAMPLE_SEARCH_COLLECTION = "biosample_denormalized" +OMICS_PROCESSING_SEARCH_COLLECTION = "omics_processing_denormalized" + # get application settings @router.get("/settings", name="Get application settings") @@ -62,6 +71,81 @@ async def get_database_summary(db: Session = Depends(get_db)): return crud.get_database_summary(db) +@router.get( + "/mongo_summary", + tags=["aggregation"], +) +async def mongo_get_database_summary(db: Session = Depends(get_db)): + return { + "biosample": { + "attributes": { + "depth.has_numeric_value": { + "min": 0, + "max": 2000, + "type": "float", + }, + "geo_loc_name.has_raw_value": { + "type": "string", + }, + "gold_classification": { + "type": "sankey-tree", + }, + "env_broad_scale.term.id": { + "type": "tree", + }, + "env_local_scale.term.id": { + "type": "tree", + }, + "env_medium.term.id": { + "type": "tree", + }, + "lat_lon.latitude": { + "type": "float", + "min": -90, + "max": 90, + }, + "lat_lon.longitude": { + "type": "float", + "min": -180, + "max": 180, + }, + "collection_date.has_date_value": { + "type": "date", + "min": "2000-03-15T00:00:00", + "max": "2022-08-12T00:00:00", + }, + }, + }, + "gene_function": { + "attributes": { + "id": { + "type": "kegg_search", + }, + }, + }, + "omics_processing": { + "attributes": { + "omics_type.has_raw_value": { + "type": "string", + }, + "instrument_name": { + "type": "string", + }, + "processing_institution": { + "type": "string", + }, + }, + }, + "study": { + "attributes": { + "principal_investigator.has_raw_value": { + "type": "string", + }, + }, + }, + } + + @router.get( "/stats", response_model=schemas.AggregationSummary, @@ -83,6 +167,55 @@ async def get_environmental_sankey( return crud.get_environmental_sankey(db, query) +@router.post( + "/environment/mongo_sankey", + tags=["aggregation"], +) +async def mongo_get_environmental_sankey( + query: query.BiosampleQuerySchema = query.BiosampleQuerySchema(), + db: Session = Depends(get_db), +): + settings = Settings() + client = MongoClient( + host=settings.mongo_host, + username=settings.mongo_user, + password=settings.mongo_password, + port=settings.mongo_port, + directConnection=True, + ) + mongo_filter = conditions_to_mongo_filter(query.conditions) + results = client.nmdc[BIOSAMPLE_SEARCH_COLLECTION].aggregate([ + { + "$match": mongo_filter, + }, + { + "$group": { + "_id": { + "ecosystem": "$ecosystem", + "ecosystem_category": "$ecosystem_category", + "ecosystem_subtype": 
"$ecosystem_subtype", + "ecosystem_type": "$ecosystem_type", + "specific_ecosystem": "$specific_ecosystem", + }, + "count": { + "$count": {}, + }, + }, + }, + { + "$set": { + "_id.count": "$count", + }, + }, + { + "$replaceRoot": { + "newRoot": "$_id", + }, + }, + ]) + return json.loads(bson.json_util.dumps(results)) + + @router.post( "/environment/geospatial", response_model=List[schemas.EnvironmentGeospatialAggregation], @@ -94,6 +227,87 @@ async def get_environmental_geospatial( return crud.get_environmental_geospatial(db, query) +@router.post( + "/environment/mongo_geospatial", + tags=["aggregation"], +) +async def mongo_get_environmental_geospatial( + query: query.BiosampleQuerySchema = query.BiosampleQuerySchema() +): + settings = Settings() + client = MongoClient( + host=settings.mongo_host, + username=settings.mongo_user, + password=settings.mongo_password, + port=settings.mongo_port, + directConnection=True, + ) + mongo_filter = conditions_to_mongo_filter(query.conditions) + results = client.nmdc[BIOSAMPLE_SEARCH_COLLECTION].aggregate([ + { + "$match": mongo_filter, + }, + { + "$group": { + "_id": { + "latitude": "$lat_lon.latitude", + "longitude": "$lat_lon.longitude", + "ecosystem": "$ecosystem", + "ecosystem_category": "$ecosystem_category", + }, + "count": { + "$count": {}, + }, + }, + }, + { + "$set": { + "_id.count": "$count", + }, + }, + { + "$replaceRoot": { + "newRoot": "$_id", + }, + }, + ]) + return json.loads(bson.json_util.dumps(results)) + + +def facet_value_to_key(value): + if type(value) is list: + return ";".join(value) + return value + + +def conditions_to_mongo_filter(conditions, base_type="biosample"): + mongo_filter = dict() + for condition in conditions: + if condition.table.name == base_type: + field_name = condition.field + else: + field_name = f"{condition.table.name}.{condition.field}" + + if condition.op == Operation.equal: + if not mongo_filter.get(field_name): + mongo_filter[field_name] = {"$in": []} + mongo_filter[field_name]["$in"].append(condition.value) + elif condition.op == Operation.less: + mongo_filter[field_name] = {"$lt": condition.value} + elif condition.op == Operation.less_equal: + mongo_filter[field_name] = {"$lte": condition.value} + elif condition.op == Operation.greater: + mongo_filter[field_name] = {"$gt": condition.value} + elif condition.op == Operation.greater_equal: + mongo_filter[field_name] = {"$gte": condition.value} + elif condition.op == "between": + mongo_filter[field_name] = {"$gte": condition.value[0], "$lte": condition.value[1]} + elif condition.op == "has": + mongo_filter[field_name] = {"$all": condition.value} + + return mongo_filter + + # biosample @router.post( "/biosample/search", @@ -129,6 +343,57 @@ def insert_selected(biosample: schemas.Biosample) -> schemas.Biosample: ) +@router.post( + "/biosample/mongo_search", + tags=["biosample"], + name="Search for biosamples", + description="Faceted search of biosample data.", +) +async def mongo_search_biosample( + query: query.BiosampleSearchQuery = query.BiosampleSearchQuery(), + pagination: Pagination = Depends(), +): + settings = Settings() + client = MongoClient( + host=settings.mongo_host, + username=settings.mongo_user, + password=settings.mongo_password, + port=settings.mongo_port, + directConnection=True, + ) + mongo_filter = conditions_to_mongo_filter(query.conditions) + aggregation = [ + { + "$match": mongo_filter, + }, + { + "$unset": "gene_function", + }, + { + "$sort": {"multiomics_count": -1}, + }, + ] + print("mongo_search_biosample:") + print(aggregation) 
+ + def data_object_is_selected(data_object): + for condition in query.data_object_filter: + if (data_object.get("data_object_type") == condition.file_type and + data_object.get("activity_type") == condition.workflow.value): + return True + return False + + def add_data_object_selection(sample): + for data_object in sample["data_object"]: + data_object["selected"] = data_object_is_selected(data_object) + return sample + + return json.loads(bson.json_util.dumps({ + "count": list(client.nmdc[BIOSAMPLE_SEARCH_COLLECTION].aggregate([*aggregation, {"$count": "count"}]))[0]["count"], + "results": [add_data_object_selection(doc) for doc in client.nmdc[BIOSAMPLE_SEARCH_COLLECTION].aggregate([*aggregation, {"$skip": pagination.offset}, {"$limit": pagination.limit}])], + })) + + @router.post( "/biosample/facet", response_model=query.FacetResponse, @@ -139,6 +404,37 @@ async def facet_biosample(query: query.FacetQuery, db: Session = Depends(get_db) return crud.facet_biosample(db, query.attribute, query.conditions) +@router.post( + "/biosample/mongo_facet", + tags=["biosample"], + name="Get all values of an attribute", +) +async def mongo_facet_biosample(query: query.FacetQuery): + settings = Settings() + client = MongoClient( + host=settings.mongo_host, + username=settings.mongo_user, + password=settings.mongo_password, + port=settings.mongo_port, + directConnection=True, + ) + aggregation = [ + { + "$match": conditions_to_mongo_filter(query.conditions), + }, + { + "$sortByCount": f"${query.attribute}", + }, + ] + + print("mongo_facet_biosample:") + print(aggregation) + + return json.loads(bson.json_util.dumps({ + "facets": { facet_value_to_key(facet["_id"]): facet["count"] for facet in client.nmdc[BIOSAMPLE_SEARCH_COLLECTION].aggregate(aggregation) }, + })) + + @router.post( "/biosample/binned_facet", response_model=query.BinnedFacetResponse, @@ -149,6 +445,61 @@ async def binned_facet_biosample(query: query.BinnedFacetQuery, db: Session = De return crud.binned_facet_biosample(db, **query.dict()) +@router.post( + "/biosample/mongo_binned_facet", + tags=["biosample"], + name="Get all values of an attribute", +) +async def mongo_binned_facet_biosample(query: query.FacetQuery): + settings = Settings() + client = MongoClient( + host=settings.mongo_host, + username=settings.mongo_user, + password=settings.mongo_password, + port=settings.mongo_port, + directConnection=True, + ) + aggregation = [ + { + "$match": conditions_to_mongo_filter(query.conditions), + }, + { + "$group": { + "_id": { "year": { "$year": "$collection_date.has_date_value" }, "month": { "$month": "$collection_date.has_date_value" } }, + "count": { "$count": {} } + }, + }, + ] + print("mongo_binned_facet_biosample:") + print(aggregation) + + date_string = lambda d: f"{d['_id']['year']}-{str(d['_id']['month']).zfill(2)}-01" + binned_data = list(client.nmdc[BIOSAMPLE_SEARCH_COLLECTION].aggregate(aggregation)) + binned_data.sort(key=date_string) + binned_data = [d for d in binned_data if d["_id"]["year"] is not None] + + # Fill in missing months with zero counts + def next_month(d): + if d["month"] < 12: + return {"month": d["month"] + 1, "year": d["year"]} + return {"month": 1, "year": d["year"] + 1} + full_binned_data = [] + for d in binned_data: + if len(full_binned_data) == 0: + full_binned_data.append(d) + continue + while full_binned_data[-1]["_id"]["year"] != d["_id"]["year"] or full_binned_data[-1]["_id"]["month"] != d["_id"]["month"]: + full_binned_data.append({"_id": next_month(full_binned_data[-1]["_id"]), "count": 0}) + 
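+            # The while loop above pads zero-count placeholders up to and
+            # including d's own month, and the line below then replaces the
+            # last placeholder with the real bin. E.g. (hypothetical data)
+            # bins for 2022-01 and 2022-04 yield 2022-02 -> 0 and
+            # 2022-03 -> 0, and the 2022-04 placeholder is overwritten by
+            # the real 2022-04 document.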
full_binned_data[-1] = d + + # Add one more month with zero count so we have a bin end boundary for the last bin + full_binned_data.append({"_id": next_month(full_binned_data[-1]["_id"]), "count": 0}) + return json.loads(bson.json_util.dumps({ + "bins": [date_string(d) for d in full_binned_data], + "facets": [d["count"] for d in full_binned_data][:-1], + })) + + @router.get( "/biosample/{biosample_id}", response_model=schemas.Biosample, @@ -161,6 +512,27 @@ async def get_biosample(biosample_id: str, db: Session = Depends(get_db)): return db_biosample +@router.get( + "/mongo_biosample/{biosample_id}", + tags=["biosample"], +) +async def mongo_get_biosample(biosample_id: str, db: Session = Depends(get_db)): + settings = Settings() + client = MongoClient( + host=settings.mongo_host, + username=settings.mongo_user, + password=settings.mongo_password, + port=settings.mongo_port, + directConnection=True, + ) + + biosamples = list(client.nmdc[BIOSAMPLE_SEARCH_COLLECTION].find({"id": biosample_id})) + if len(biosamples) == 0: + raise HTTPException(status_code=404, detail="Biosample not found") + + return json.loads(bson.json_util.dumps(biosamples[0])) + + @router.get( "/envo/tree", response_model=schemas.EnvoTreeResponse, @@ -216,6 +588,49 @@ async def search_study( return pagination.response(crud.search_study(db, query.conditions)) +@router.post( + "/study/mongo_search", + tags=["study"], + name="Search for studies", + description="Faceted search of study data.", +) +async def mongo_search_study( + query: query.BiosampleSearchQuery = query.BiosampleSearchQuery(), + pagination: Pagination = Depends(), +): + settings = Settings() + client = MongoClient( + host=settings.mongo_host, + username=settings.mongo_user, + password=settings.mongo_password, + port=settings.mongo_port, + directConnection=True, + ) + + aggregation = [ + { + "$match": conditions_to_mongo_filter(query.conditions), + }, + { + "$unwind": { "path": "$study" }, + }, + { + "$group": { + "_id": "$study.id", + "study": { "$first": "$study" }, + }, + }, + { + "$replaceRoot": { "newRoot": "$study" }, + }, + ] + + return json.loads(bson.json_util.dumps({ + "count": list(client.nmdc[BIOSAMPLE_SEARCH_COLLECTION].aggregate([*aggregation, {"$count": "count"}]))[0]["count"], + "results": [doc for doc in client.nmdc[BIOSAMPLE_SEARCH_COLLECTION].aggregate([*aggregation, {"$skip": pagination.offset}, {"$limit": pagination.limit}])], + })) + + @router.post( "/study/facet", response_model=query.FacetResponse, @@ -226,6 +641,46 @@ async def facet_study(query: query.FacetQuery, db: Session = Depends(get_db)): return crud.facet_study(db, query.attribute, query.conditions) +@router.post( + "/study/mongo_facet", + tags=["study"], + name="Get all values of an attribute", +) +async def mongo_facet_study(query: query.FacetQuery): + settings = Settings() + client = MongoClient( + host=settings.mongo_host, + username=settings.mongo_user, + password=settings.mongo_password, + port=settings.mongo_port, + directConnection=True, + ) + aggregation = [ + { + "$match": conditions_to_mongo_filter(query.conditions), + }, + { + "$unwind": { "path": "$study" }, + }, + { + "$group": { + "_id": "$study.id", + "study": { "$first": "$study" }, + }, + }, + { + "$replaceRoot": { "newRoot": "$study" }, + }, + { + "$sortByCount": f"${query.attribute}", + }, + ] + + return json.loads(bson.json_util.dumps({ + "facets": { facet_value_to_key(facet["_id"]): facet["count"] for facet in client.nmdc[BIOSAMPLE_SEARCH_COLLECTION].aggregate(aggregation) }, + })) + + @router.post( 
"/study/binned_facet", response_model=query.BinnedFacetResponse, @@ -248,6 +703,27 @@ async def get_study(study_id: str, db: Session = Depends(get_db)): return db_study +@router.get( + "/mongo_study/{study_id}", + tags=["study"], +) +async def mongo_get_study(study_id: str, db: Session = Depends(get_db)): + settings = Settings() + client = MongoClient( + host=settings.mongo_host, + username=settings.mongo_user, + password=settings.mongo_password, + port=settings.mongo_port, + directConnection=True, + ) + + studies = list(client.nmdc.study_transformed.find({"id": study_id})) + if len(studies) == 0: + raise HTTPException(status_code=404, detail="Study not found") + + return json.loads(bson.json_util.dumps(studies[0])) + + @router.get("/study/{study_id}/image", tags=["study"]) async def get_study_image(study_id: str, db: Session = Depends(get_db)): image = crud.get_study_image(db, study_id) @@ -282,6 +758,41 @@ async def facet_omics_processing(query: query.FacetQuery, db: Session = Depends( return crud.facet_omics_processing(db, query.attribute, query.conditions) +@router.post( + "/omics_processing/mongo_facet", + tags=["omics_processing"], + name="Get all values of an attribute", +) +async def mongo_facet_omics_processing(query: query.FacetQuery): + settings = Settings() + client = MongoClient( + host=settings.mongo_host, + username=settings.mongo_user, + password=settings.mongo_password, + port=settings.mongo_port, + directConnection=True, + ) + + aggregation = [ + { + "$match": conditions_to_mongo_filter(query.conditions, "omics_processing"), + }, + ] + + aggregation += [ + { + "$sortByCount": f"${query.attribute}", + }, + ] + + print("mongo_facet_omics_processing:") + print(aggregation) + + return json.loads(bson.json_util.dumps({ + "facets": { facet_value_to_key(facet["_id"]): facet["count"] for facet in client.nmdc[OMICS_PROCESSING_SEARCH_COLLECTION].aggregate(aggregation) }, + })) + + @router.post( "/omics_processing/binned_facet", response_model=query.BinnedFacetResponse, @@ -373,6 +884,63 @@ def data_object_aggregation( return crud.aggregate_data_object_by_workflow(db, query.conditions) +@router.post( + "/data_object/mongo_workflow_summary", + tags=["data_object"], + name="Aggregate data objects by workflow", +) +def mongo_data_object_aggregation( + query: query.DataObjectQuerySchema = query.DataObjectQuerySchema(), +): + settings = Settings() + client = MongoClient( + host=settings.mongo_host, + username=settings.mongo_user, + password=settings.mongo_password, + port=settings.mongo_port, + directConnection=True, + ) + aggregation = [ + { + "$match": conditions_to_mongo_filter(query.conditions), + }, + { + "$unwind": { "path": "$data_object" }, + }, + { + "$group": { + "_id": "$data_object.id", + "data_object": { "$first": "$data_object" }, + }, + }, + { + "$replaceRoot": { "newRoot": "$data_object" }, + }, + { + "$set": { + "combined_type": { + "data_object_type": "$data_object_type", + "activity_type": "$activity_type", + }, + }, + }, + { + "$sortByCount": "$combined_type", + }, + ] + + result = dict() + for facet in client.nmdc[BIOSAMPLE_SEARCH_COLLECTION].aggregate(aggregation): + if "data_object_type" not in facet["_id"]: + # We are not considering data_objects without a data_object_type + continue + if result.get(facet["_id"]["activity_type"]) is None: + result[facet["_id"]["activity_type"]] = { "count": 0, "file_types": dict() } + result[facet["_id"]["activity_type"]]["file_types"][facet["_id"]["data_object_type"]] = facet["count"] + 
result[facet["_id"]["activity_type"]]["count"] += facet["count"] + + return json.loads(bson.json_util.dumps(result)) + @router.get("/principal_investigator/{principal_investigator_id}", tags=["principal_investigator"]) async def get_pi_image(principal_investigator_id: UUID, db: Session = Depends(get_db)): image = crud.get_pi_image(db, principal_investigator_id) diff --git a/nmdc_server/config.py b/nmdc_server/config.py index 11019cea..7b58e417 100644 --- a/nmdc_server/config.py +++ b/nmdc_server/config.py @@ -11,7 +11,7 @@ class Settings(BaseSettings): # Several different database urls are configured for different # environments. In production, only database_uri and ingest_database_uri # are used. - database_uri: str = "postgresql:///nmdc" + database_uri: str = "postgresql:///nmdc_testing" ingest_database_uri: str = "postgresql:///nmdc_testing" testing_database_uri: str = "postgresql:///nmdc_testing" diff --git a/nmdc_server/ingest/data_object.py b/nmdc_server/ingest/data_object.py index 14573302..c303476c 100644 --- a/nmdc_server/ingest/data_object.py +++ b/nmdc_server/ingest/data_object.py @@ -52,6 +52,9 @@ def load(db: Session, cursor: Cursor, file_types: List[Dict[str, Any]]): else: objects_without_type += 1 + obj["file_size_bytes"] = obj.get("file_size_bytes", 0) + if not obj["file_size_bytes"]: + logger.error("null value for file_size_bytes") db.add(DataObject(**obj)) if objects_without_type: diff --git a/nmdc_server/ingest/denormalize.py b/nmdc_server/ingest/denormalize.py new file mode 100644 index 00000000..ebb9581a --- /dev/null +++ b/nmdc_server/ingest/denormalize.py @@ -0,0 +1,367 @@ +from pymongo import MongoClient +import time +from typing import List, Dict +import envo + +start = time.time() + +client = MongoClient() + +envo.mongo_load(client) + +omics_types = [ + "Metagenome", + "Organic Matter Characterization", + "Metatranscriptome", + "Proteomics", + "Metabolomics", +] + +study_transformed_aggregation: List[Dict] = [ + { + "$lookup": { + "from": "biosample_transformed", + "localField": "id", + "foreignField": "part_of", + "as": "biosample", + }, + }, + { + "$lookup": { + "from": "omics_processing_set", + "localField": "biosample.id", + "foreignField": "has_input", + "as": "omics_processing", + }, + }, + # Count the number of each omics_type and get it in the form [{"type": "Metagenome", "count": 100}, ...]. + # This one is a kind of beast in mongo aggregation programming. + # + # The pipeline in procedural code is roughly: + # + # omics_type_count_map = reduce_to_count_of_each_omics_type(omics_processing_array) (e.g. {"Metagenome": 100, "Metabolomics": 50, ...}) + # key_value_array = object_to_key_value_array(omics_type_count_map) (e.g. [{"k": "Metagenome", "v": 100}, {"k": "Metabolomics", "v": 50}, ...]) + # type_count_array = rename_key_value_to_type_count(key_value_array) (e.g. [{"type": "Metagenome", "count": 100}, {"type": "Metabolomics", "count": 50}, ...]) + # omics_processing_counts = sort_by_omics_type_name(type_count_array) (e.g. 
[{"type": "Metabolomics", "count": 50}, {"type": "Metagenome", "count": 100}, ...]) + { + "$set": { + "omics_processing_counts": { + "$sortArray": { + "input": { + "$map": { + "input": { + "$objectToArray": { + "$reduce": { + "input": "$omics_processing", + "initialValue": { omics_type: 0 for omics_type in omics_types }, + "in": { + omics_type: { + "$cond": { + "if": {"$eq": ["$$this.omics_type.has_raw_value", omics_type] }, + "then": {"$add": [f"$$value.{omics_type}", 1]}, + "else": f"$$value.{omics_type}", + }, + } + for omics_type in omics_types + }, + }, + }, + }, + "in": { + "type": "$$this.k", + "count": "$$this.v", + }, + }, + }, + "sortBy": {"type": 1}, + }, + }, + }, + }, + { + "$unset": "omics_processing", + }, + # Count the number of biosamples + { + "$set": { + "sample_count": { + "$size": "$biosample" + } + } + }, + { + "$unset": "biosample", + }, + { + "$out": "study_transformed", + }, +] +print("Generating study_transformed...") +q = client.nmdc.study_set.aggregate(study_transformed_aggregation) +print("...done") + + + + +# A couple transforms needed for optimal queries +biosample_transformed_aggregation: List[Dict] = [ + # To filter by dates we need actual dates in the database + { + "$set": { + "collection_date.has_date_value": { + "$dateFromString": { + "dateString": "$collection_date.has_raw_value", + }, + }, + }, + }, + # Lookup related omics_processing temporarily to derive some summary properties + { + "$lookup": { + "from": "omics_processing_set", + "localField": "id", + "foreignField": "has_input", + "as": "omics_processing", + }, + }, + # Create an array of all the omics_processing types associated with this sample + { + "$set": { + "multiomics": { + "$sortArray": { + "input": { + # This set difference removes duplicate omics types and removes lipidomics + "$setDifference": [ + "$omics_processing.omics_type.has_raw_value", + ["Lipidomics"], + ], + }, + "sortBy": 1, + }, + }, + }, + }, + # Add a count so we can sort by the number of types of omics_processing each sample has + { + "$set": { + "multiomics_count": { + "$size": "$multiomics" + } + } + }, + # We don't want to actually store the related omics_processing + { + "$unset": "omics_processing", + }, + # Save the result + { + "$out": "biosample_transformed", + }, +] + +print("Generating biosample_transformed...") +q = client.nmdc.biosample_set.aggregate(biosample_transformed_aggregation) +print("...done") + +def denormalize_analysis_aggregation(base_type): + aggregation = [] + + activity_types = { + "mags_activity": "nmdc:MAGsAnalysisActivity", + "metabolomics_analysis_activity": "nmdc:MetabolomicsAnalysisActivity", + "metagenome_annotation_activity": "nmdc:MetagenomeAnnotation", + "metagenome_assembly": "nmdc:MetagenomeAssembly", + "metaproteomics_analysis_activity": "nmdc:MetaProteomicAnalysis", + "metatranscriptome_activity": "nmdc:metaT", + "nom_analysis_activity": "nmdc:NomAnalysisActivity", + } + + for activity_type in activity_types: + # Pull in activities and data_objects associated with each omics_processing + aggregation.extend([ + { + "$lookup": { + "from": f"{activity_type}_set", + "localField": "id" if base_type == "omics_processing" else "omics_processing.id", + "foreignField": "was_informed_by", + "as": activity_type, + }, + }, + # Move this to after all analyses are concatenated - can be one step to get all data_object records + { + "$lookup": { + "from": "data_object_set", + "localField": f"{activity_type}.has_output", + "foreignField": "id", + "as": f"{activity_type}_data_object", + "pipeline": 
[ + {"$set": {"activity_type": activity_types[activity_type]}}, + ], + }, + }, + ]) + + aggregation.extend([ + # Lookup metagenome annotations + { + "$lookup": { + "from": "functional_annotation_agg", + "localField": "metagenome_annotation_activity.id", + "foreignField": "metagenome_annotation_id", + "as": "metagenome_annotation", + "pipeline": [ + { + "$set": { + "id": "$gene_function_id", + "activity_id": "$metagenome_annotation_id", + }, + }, + {"$unset": ["_id", "metagenome_annotation_id", "gene_function_id"]}, + ], + }, + }, + # Lookup metaproteomics annotations + { + "$lookup": { + "from": "metap_gene_function_aggregation", + "localField": "metaproteomics_analysis_activity.id", + "foreignField": "metaproteomic_analysis_id", + "as": "metaproteomics_annotation", + "pipeline": [ + { + "$set": { + "id": "$gene_function_id", + "activity_id": "$metaproteomic_analysis_id", + }, + }, + {"$unset": ["_id", "metaproteomic_analysis_id", "gene_function_id"]}, + ], + }, + }, + # Combine annotations into a single annotation array + { + "$set": { + "gene_function": { + "$concatArrays": ["$metagenome_annotation", "$metaproteomics_annotation"] + } + }, + }, + { + "$unset": ["metagenome_annotation", "metaproteomics_annotation"], + }, + # Combine all activities into a single activity array + { + "$set": { + "activity": { + "$concatArrays": [f"${activity_type}" for activity_type in activity_types] + } + } + }, + # Remove the monstrous has_peptide_quantifications array to speed search + { + "$set": { + "activity": { + "$map": { + "input": "$activity", + "as": "d", + "in": { + "$setField": { + "field": "has_peptide_quantifications", + "value": "$$REMOVE", + "input": "$$d" + } + } + } + }, + } + }, + # We are done with the separate activity types since they are all in the activity array now + { + "$unset": list(activity_types.keys()), + }, + ]) + + aggregation.extend([ + # Combine all data objects into a single data_object array + { + "$set": { + "data_object": { + "$concatArrays": [f"${activity_type}_data_object" for activity_type in activity_types] + } + } + }, + # We no longer need the individual data_object fields + { + "$unset": [f"{activity_type}_data_object" for activity_type in activity_types] + }, + ]) + + return aggregation + + +biosample_denormalized_aggregation: List[Dict] = [ + { + "$lookup": { + "from": "study_transformed", + "localField": "part_of", + "foreignField": "id", + "as": "study", + }, + }, + { + "$lookup": { + "from": "omics_processing_set", + "localField": "id", + "foreignField": "has_input", + "as": "omics_processing", + }, + }, +] + +biosample_denormalized_aggregation += denormalize_analysis_aggregation("biosample") + +biosample_denormalized_aggregation += [ + { + "$out": "biosample_denormalized", + }, +] + +print("Generating biosample_denormalized...") +q = client.nmdc.biosample_transformed.aggregate(biosample_denormalized_aggregation) +print("...done") + + + +omics_processing_denormalized_aggregation: List[Dict] = [ + { + "$lookup": { + "from": "biosample_transformed", + "localField": "has_input", + "foreignField": "id", + "as": "biosample", + }, + }, + { + "$lookup": { + "from": "study_transformed", + "localField": "biosample.part_of", + "foreignField": "id", + "as": "study", + }, + }, +] + +omics_processing_denormalized_aggregation += denormalize_analysis_aggregation("omics_processing") + +omics_processing_denormalized_aggregation += [ + { + "$out": "omics_processing_denormalized", + }, +] + +print("Generating omics_processing_denormalized...") +q = 
client.nmdc.omics_processing_set.aggregate(omics_processing_denormalized_aggregation) +print("...done") + +end = time.time() +print(f"Completed in {end - start}s") diff --git a/nmdc_server/ingest/envo.py b/nmdc_server/ingest/envo.py index c718489b..027dbd46 100644 --- a/nmdc_server/ingest/envo.py +++ b/nmdc_server/ingest/envo.py @@ -4,6 +4,7 @@ from collections import defaultdict from dataclasses import dataclass from typing import Dict, List, Optional, Set +from pymongo import MongoClient from urllib import request from sqlalchemy.dialects.postgresql import insert @@ -295,3 +296,201 @@ def load(db: Session): populate_envo_ancestor(db, node, node, direct_ancestors, ids, True, set()) db.commit() + + + + + + +# -------------------------------------------------------------- +# mongo +# -------------------------------------------------------------- + + +def mongo_populate_envo_ancestor( + db: MongoClient, + term_id: str, + node: str, + edges: Dict[str, Set[str]], + all_nodes: Set[str], + direct: bool, + visited: Set[str], +): + if node in visited: + raise Exception(f"Cyclic graph detected ({node})") + if node not in edges: + return + + visited = visited.copy() + visited.add(node) + + for parent in edges[node]: + if parent not in all_nodes: + continue # skip ancestors outside the simplified hierarchy + + ancestor = dict(_id=dict(id=term_id, ancestor_id=parent), direct=direct) + db.nmdc.envo_ancestor.update_one({"_id": ancestor["_id"]}, {"$set": ancestor}, upsert=True) + + for parent in edges[node]: + if parent not in all_nodes: + continue # skip ancestors outside the simplified hierarchy + + mongo_populate_envo_ancestor(db, term_id, parent, edges, all_nodes, False, visited) + + +def mongo_get_biosample_roots(db: MongoClient) -> Dict[str, Set[str]]: + """ + Find all reachable envo root terms from each biosample envo facet. + + Returns a dict mapping facet name to the set of reachable roots. + """ + parents: Dict[str, str] = {} + query = db.nmdc.envo_ancestor.find(dict(direct=True)) + for ancestor in query: + parents[ancestor["_id"]["id"]] = ancestor["_id"]["ancestor_id"] + + def reachable_roots(attr: str) -> Set[str]: + query = db.nmdc.biosample_set.distinct(attr) + terms = set(r[0] for r in query) - {None} + roots = set() + + for term in terms: + # traverse up the ancestors until reaching a root + if term not in parents: + roots.add(term) + else: + parent = parents[term] + while parent in parents: + parent = parents[parent] + roots.add(parent) + return roots + + # TODO should we store these results in the database? + return { + key: reachable_roots(key) + for key in ["env_broad_scale.term.id", "env_local_scale.term.id", "env_medium.term.id"] + } + + +def _mongo_build_envo_subtree(db: MongoClient, parent_id: str) -> None: + db.nmdc.envo_ancestor.find({"_id.ancestor_id": parent_id, "direct": True}) + for node in query: + db.nmdc.envo_tree.insert_one(dict(_id=node["_id"]["id"], parent_id=parent_id)) + _mongo_build_envo_subtree(db, node.id) + + +def mongo_build_envo_trees(db: Session) -> None: + """ + Convert the envo_ancestors graph into trees, and store them (normalized). + + If a node is encountered more than once, we arbitrarily choose its first + encountered location in the graph. + + This should only be called after biosamples have been ingested. 
+ """ + db.nmdc.envo_tree.drop() + + roots = mongo_get_biosample_roots(db) + root_set = set(itertools.chain(*roots.values())) + for root in root_set: + db.nmdc.nmdc_tree.insert_one( + { + "_id": root, + "parent_id": None, # null parent_id indicates root node(s) + } + ) + _mongo_build_envo_subtree(db, root) + + mongo_nested_envo_trees.cache_clear() + + +def _mongo_get_trees_for_facet( + db: MongoClient, + facet: str, + tree_nodes: Dict[str, _NodeInfo], + tree_children: TreeChildren, +) -> List[EnvoTreeNode]: + """ + Get the pruned trees for each facet. + + This is a pure function. + """ + query = db.nmdc.biosample_set.distinct(facet) + present_terms = set(r[0] for r in query) - {None} + reachable: Set[str] = set() + + # Find all nodes that are reachable from the set of present terms + for term in present_terms: + node = tree_nodes[term] + reachable.add(node.id) + while node.parent_id is not None: + node = tree_nodes[node.parent_id] + reachable.add(node.id) + + # Recursively build the tree structure + root_nodes = _nested_envo_subtree(tree_children, reachable) + + # Prune useless internal nodes from the roots + for root in root_nodes: + # TODO I don't have a mathematical proof, but I think we might not need + # this loop; because of the algorithm and tree structure, it might always get + # pruned in a single pass. Not sure though. + while _prune_useless_nodes(root, present_terms): + pass + + return [_prune_useless_roots(root, present_terms) for root in root_nodes] + + +@functools.lru_cache(maxsize=None) +def mongo_nested_envo_trees() -> Dict[str, List[EnvoTreeNode]]: + tree_children = defaultdict(list) + tree_nodes: Dict[str, _NodeInfo] = {} + + with SessionLocal() as session: + query = session.query(EnvoTerm, EnvoTree).filter(EnvoTerm.id == EnvoTree.id) + for term, edge in query: + node = _NodeInfo(id=edge.id, parent_id=edge.parent_id, label=term.label) + tree_children[edge.parent_id].append(node) + tree_nodes[edge.id] = node + + return { + facet: _get_trees_for_facet(session, facet, tree_nodes, tree_children) + for facet in ["env_broad_scale_id", "env_local_scale_id", "env_medium_id"] + } + + +def mongo_load(db: MongoClient): + db.nmdc.envo_term.drop() + db.nmdc.envo_ancestor.drop() + + with request.urlopen(envo_url) as r: + envo_data = json.load(r) + + for graph in envo_data["graphs"]: + direct_ancestors: Dict[str, Set[str]] = defaultdict(set) + for edge in graph["edges"]: + if edge["pred"] != "is_a": + continue + + id = edge["sub"].split("/")[-1].replace("_", ":") + parent = edge["obj"].split("/")[-1].replace("_", ":") + if id != parent: + direct_ancestors[id].add(parent) + + ids: Set[str] = set() + for node in graph["nodes"]: + if not node["id"].startswith("http://purl.obolibrary.org/obo/"): + continue + + id = node["id"].split("/")[-1].replace("_", ":") + label = node.pop("lbl", "") + data = node.get("meta", {}) + envo_data = dict(_id=id, label=label, data=data) + db.nmdc.envo_term.insert_one(envo_data) + ids.add(id) + + for node in ids: + ancestor_data = dict(_id=dict(id=node, ancestor_id=node), direct=False) + db.nmdc.envo_ancestor.insert_one(ancestor_data) + mongo_populate_envo_ancestor(db, node, node, direct_ancestors, ids, True, set()) + diff --git a/nmdc_server/ingest/mongo_data_portal.md b/nmdc_server/ingest/mongo_data_portal.md new file mode 100644 index 00000000..56587f06 --- /dev/null +++ b/nmdc_server/ingest/mongo_data_portal.md @@ -0,0 +1,106 @@ +# MongoDB-based Data Portal + +There are now API endpoints and UI changes to support retrieving data directly from MongoDB, 
eliminating the need to ingest into Postgres.
+This new experimental mode requires additional denormalized collections in MongoDB for efficient search.
+Note that this does not remove the dependency on Postgres, which is still used for the following:
+* User login
+* Download counts
+* Submission portal submissions
+
+This is implemented as a feature flag that can be changed at runtime to compare Postgres-based and MongoDB-based search and UI.
+
+The following are still required for feature parity with the existing data portal:
+- [x] Study detail page
+- [ ] Biosample detail page
+  - [x] Initial page
+  - [ ] Add depth
+  - [ ] Add lat/long
+  - [ ] Add ENVO terms
+  - [ ] Add study ID
+  - [ ] Add collection date
+  - [ ] Add open in GOLD
+  - [ ] Add more alternate identifiers
+  - [ ] Remove type
+  - [ ] Remove community
+  - [ ] Remove habitat
+  - [ ] Remove NCBI taxonomy
+  - [ ] Remove sample collection site
+  - [ ] Remove samp name
+  - [ ] Remove location
+  - [ ] Remove multiomics count
+- [x] Sankey
+- [ ] Dynamic ENVO term trees in field flyout
+- [ ] Make `envo.py` not rely on the Postgres biosample table
+- [ ] Autocomplete search
+- [ ] Associating omics processing through analytic samples (borrow ingest code)
+- [ ] Associating `M` and `MAP` terms in KEGG search
+- [ ] Robust performance comparison
+- [ ] Show studies with no samples
+- [ ] Fix names in the UI to not include `has_raw_value`
+- [ ] Fix permalink serialization
+- [ ] Upgrade to the latest data
+- [ ] Merge in recent changes
+- [ ] Cache the Mongo connection
+
+# Denormalized MongoDB
+
+To run denormalization, execute:
+
+```
+python denormalize.py
+```
+
+This will create the following new Mongo collections to support faceted search in the data portal:
+
+## `study_transformed`
+This collection is required to efficiently summarize a study. Its documents are embedded in the `biosample_denormalized` and `omics_processing_denormalized` collections, and it is used in the following endpoints:
+* `/mongo_study/{study_id}` (used on the study details page)
+
+This collection is identical to the `study_set` collection with the following additional fields:
+* `omics_processing_counts`: An array of objects with `type` and `count` fields for each omics processing type associated with samples in the study, e.g. `[{"type": "Metabolomics", "count": 50}, {"type": "Metagenome", "count": 100}, ...]`
+* `sample_count`: The number of biosamples associated with the study
+
+## `biosample_transformed`
+This collection is required to efficiently query biosamples. Its documents are embedded in the `biosample_denormalized` and `omics_processing_denormalized` collections.
+
+This collection is identical to the `biosample_set` collection with the following additional fields:
+* `collection_date.has_date_value`: The collection date converted to a `Date` object for binning and search
+* `multiomics`: An array of strings containing the types of `omics_processing` objects associated with the biosample (e.g. `["Metagenome", "Proteomics"]`)
+* `multiomics_count`: The length of the `multiomics` field, so that biosamples can be sorted in "more-omics-types-first" order
+
+## `biosample_denormalized`
+
+This collection is required to efficiently look up sample objects and counts without costly MongoDB joins (i.e. `$lookup` aggregations); a query sketch follows below.
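+
+As an illustration, a faceted query against this collection is a single aggregation with no `$lookup` stages. A minimal sketch (assuming a local `mongod` holding the `nmdc` database, with `denormalize.py` already run; the filter values are hypothetical):
+
+```python
+from pymongo import MongoClient
+
+client = MongoClient()
+
+# Count biosamples per ecosystem_category among environmental samples,
+# mirroring what /biosample/mongo_facet does server-side.
+pipeline = [
+    {"$match": {"ecosystem": "Environmental"}},
+    {"$sortByCount": "$ecosystem_category"},
+]
+for facet in client.nmdc.biosample_denormalized.aggregate(pipeline):
+    print(facet["_id"], facet["count"])
+```
+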
+It is used in the following endpoints:
+* `/environment/mongo_geospatial`
+* `/biosample/mongo_search`
+* `/biosample/mongo_facet`
+* `/biosample/mongo_binned_facet`
+* `/study/mongo_search`
+* `/study/mongo_facet`
+* `/data_object/mongo_workflow_summary`
+
+This collection is identical to the `biosample_transformed` collection with the following additional fields:
+* `study`: The study records from `study_set` whose `id` matches the biosample `part_of`
+* `omics_processing`: An array of omics processing records from `omics_processing_set` whose `has_input` matches the biosample `id`
+* `analysis`: An array of analysis records from the following collections whose `was_informed_by` field matches the `id` of one of the `omics_processing` records associated with this biosample
+  * `mags_activity_set`
+  * `metabolomics_analysis_activity_set`
+  * `metagenome_annotation_activity_set`
+  * `metagenome_assembly_set`
+  * `metaproteomics_analysis_activity_set`
+  * `metatranscriptome_activity_set`
+  * `nom_analysis_activity_set`
+* `data_object`: An array of data object records whose `id` matches the `has_output` field of one of the `analysis` records associated with this biosample
+* `gene_function`: An array of gene function records from `functional_annotation_agg` or `metap_gene_function_aggregation` whose `metagenome_annotation_id` or `metaproteomic_analysis_id` field matches the `id` of one of the `analysis` records associated with this biosample
+
+## `omics_processing_denormalized`
+
+This collection is required to efficiently look up omics processing counts without costly MongoDB joins (i.e. `$lookup` aggregations). It is used in the following endpoints:
+* `/omics_processing/mongo_facet`
+
+This collection is identical to the `omics_processing_set` collection with the following additional fields:
+* `biosample`: The biosample records from `biosample_transformed` whose `id` matches the `has_input` field of this omics processing
+* `study`: The study records from `study_set` whose `id` matches the `part_of` field of the `biosample` record associated with this omics processing
+* `analysis`: An array of analysis records from the analysis collections (see `biosample_denormalized` above) whose `was_informed_by` field matches the `id` of this omics processing
+* `data_object`: Derived similarly to `biosample_denormalized` above
+* `gene_function`: Derived similarly to `biosample_denormalized` above
diff --git a/nmdc_server/query.py b/nmdc_server/query.py
index 4326178c..4d726feb 100644
--- a/nmdc_server/query.py
+++ b/nmdc_server/query.py
@@ -234,7 +234,7 @@ def compare(self) -> ClauseElement:
 # A special condition type on multiomics bitstrings
 class MultiomicsConditionSchema(BaseConditionSchema):
     table: Table
-    value: int
+    value: Union[int, List[str]]
     field: Literal["multiomics"]
     op: Literal["has"]
diff --git a/setup.py b/setup.py
index 43522b8c..8df93ebd 100644
--- a/setup.py
+++ b/setup.py
@@ -12,7 +12,8 @@
         "authlib==0.15.5",
         "celery[redis]",
         "click",
-        "cryptography<3.4",  # https://github.com/pyca/cryptography/issues/5771
+        # "cryptography<3.4",  # https://github.com/pyca/cryptography/issues/5771
+        "cryptography",
         "fastapi==0.71.0",
         "factory-boy==3.2.1",
         "httpx<=0.18.2",
diff --git a/web/.nvmrc b/web/.nvmrc
new file mode 100644
index 00000000..19c7bdba
--- /dev/null
+++ b/web/.nvmrc
@@ -0,0 +1 @@
+16
\ No newline at end of file
diff --git a/web/src/components/ClusterMap.vue b/web/src/components/ClusterMap.vue
index 1e3bbaf7..8a1974fc 100644
--- a/web/src/components/ClusterMap.vue
+++ 
b/web/src/components/ClusterMap.vue @@ -93,11 +93,13 @@ export default defineComponent({ const values: any[] = []; data.forEach((cluster, index) => { for (let i = 0; i < cluster.count; i += 1) { - values.push({ - ...cluster, - key: `${index}_${i}`, - latLng: L.latLng(cluster.latitude, cluster.longitude), - }); + if (cluster.latitude !== undefined && cluster.longitude !== undefined) { + values.push({ + ...cluster, + key: `${index}_${i}`, + latLng: L.latLng(cluster.latitude, cluster.longitude), + }); + } } }); mapData.value = values; @@ -108,7 +110,7 @@ export default defineComponent({ if (bounds) { emit('selected', [ { - field: 'latitude', + field: 'lat_lon.latitude', op: 'between', value: [ // @ts-ignore @@ -117,7 +119,7 @@ export default defineComponent({ table: 'biosample', }, { - field: 'longitude', + field: 'lat_lon.longitude', op: 'between', value: [ // @ts-ignore diff --git a/web/src/components/DataObjectTable.vue b/web/src/components/DataObjectTable.vue index 0fbef5c3..bb783473 100644 --- a/web/src/components/DataObjectTable.vue +++ b/web/src/components/DataObjectTable.vue @@ -6,7 +6,7 @@ import { flattenDeep } from 'lodash'; import { DataTableHeader } from 'vuetify'; import { humanFileSize } from '@/data/utils'; -import { OmicsProcessingResult } from '@/data/api'; +import { BiosampleSearchResult, OmicsProcessingResult } from '@/data/api'; import { stateRefs, acceptTerms } from '@/store'; import DownloadDialog from './DownloadDialog.vue'; @@ -29,8 +29,8 @@ export default defineComponent({ components: { DownloadDialog }, props: { - omicsProcessing: { - type: Array as PropType, + biosample: { + type: Object as PropType, required: true, }, omicsType: { @@ -86,16 +86,23 @@ export default defineComponent({ }); const items = computed(() => flattenDeep( - flattenDeep(props.omicsProcessing.map((p) => (p.omics_data))) - .map((omics_data) => omics_data.outputs - .filter((data) => data.file_type && data.file_type_description) - .map((data_object, i) => ({ - ...data_object, - omics_data, - /* TODO Hack to replace metagenome with omics type name */ - group_name: omics_data.name.replace('Metagenome', props.omicsType), - newgroup: i === 0, - }))), + flattenDeep(props.biosample.omics_processing.filter((o) => o.omics_type.has_raw_value === props.omicsType).map( + (omicsProcessing) => props.biosample.activity.filter( + (activity) => activity.was_informed_by === omicsProcessing.id, + ), + )).map( + (activity) => activity.has_output.map( + (dataObjectId) => props.biosample.data_object.find( + (dataObject) => dataObject.id === dataObjectId, + ), + ).map((dataObject, index) => ({ + ...dataObject, + activity, + /* TODO Hack to replace metagenome with omics type name */ + group_name: activity.name ? 
activity.name.replace('Metagenome', props.omicsType) : '', + newgroup: index === 0, + })), + ), )); function download(item: OmicsProcessingResult) { @@ -168,10 +175,10 @@ export default defineComponent({ This file is included in the currently selected bulk download - {{ item.file_type }} - {{ item.file_type_description }} + {{ item.data_object_type }} + TODO: lookup from schema {{ humanFileSize(item.file_size_bytes ) }} - {{ item.downloads }} + TODO { - let t = stateRefs.treeData.value?.trees[`${props.field}_id`]; + let t = stateRefs.treeData.value?.trees[`${props.field.split('.')[0]}_id`]; /* Eliminate nodes with only one child from the top */ while (t && t?.length === 1 && t[0].children?.length) { t = t[0].children; @@ -72,7 +72,7 @@ export default defineComponent({ c.push({ op: '==', field: field.value, - value: unreactive.nodeMapId[value].label, + value: unreactive.nodeMapId[value].id, table: table.value, }); }); @@ -87,8 +87,19 @@ export default defineComponent({ }; } + function facetCount(node: EnvoNode) { + let count = facetSummaryMap.value[node.id] || 0; + if (!node.children) { + return count; + } + node.children.forEach((child) => { + count += facetCount(child); + }); + return count; + } + return { - tree, selected, loading, facetSummaryMap, setSelected, normalizer, + tree, selected, loading, facetSummaryMap, setSelected, normalizer, facetCount, }; }, }); @@ -112,7 +123,7 @@ export default defineComponent({ @input="setSelected" > diff --git a/web/src/components/InvestigatorBio.vue b/web/src/components/InvestigatorBio.vue index 69d75c7b..c9f53338 100644 --- a/web/src/components/InvestigatorBio.vue +++ b/web/src/components/InvestigatorBio.vue @@ -35,9 +35,9 @@ export default defineComponent({ offset="1" > @@ -58,7 +58,7 @@ export default defineComponent({ >
- {{ item.principal_investigator_name }} + {{ item.principal_investigator.name || item.principal_investigator.has_raw_value }}
Principal investigator @@ -75,7 +75,7 @@ export default defineComponent({ /> props.item.alternate_identifiers + const alternateIdentifiers = computed(() => props.item.alternative_identifiers .map((id) => ({ name: id, target: `https://identifiers.org/${id}` }))); return { diff --git a/web/src/components/Presentation/ConditionChips.vue b/web/src/components/Presentation/ConditionChips.vue index 8e996c4c..301541b7 100644 --- a/web/src/components/Presentation/ConditionChips.vue +++ b/web/src/components/Presentation/ConditionChips.vue @@ -4,7 +4,6 @@ import Vue from 'vue'; import { groupBy } from 'lodash'; import { opMap } from '@/data/api'; import { fieldDisplayName } from '@/util'; -import { makeSetsFromBitmask } from '@/encoding'; export default Vue.extend({ props: { @@ -45,7 +44,7 @@ export default Vue.extend({ valueTransform(val, field, type) { // Special handling for multiomics if (field === 'multiomics' && type === 'biosample') { - return Array.from(makeSetsFromBitmask(val)).join(', '); + return val.join(', '); } // If it's not primitive if (val && typeof val === 'object') { diff --git a/web/src/components/Presentation/SearchResults.vue b/web/src/components/Presentation/SearchResults.vue index 7c9283d4..b8ca2ab8 100644 --- a/web/src/components/Presentation/SearchResults.vue +++ b/web/src/components/Presentation/SearchResults.vue @@ -16,9 +16,9 @@ export default Vue.extend({ type: Number, required: true, }, - titleKey: { - type: String, - default: 'name', + titleKeys: { + type: Array as PropType, + default: () => ['title', 'name'], }, subtitleKey: { type: String, @@ -41,6 +41,16 @@ export default Vue.extend({ default: false, }, }, + methods: { + getTitle(result: BaseSearchResult) { + for (let i = 0; i < this.$props.titleKeys.length; i += 1) { + if (result[this.$props.titleKeys[i]]) { + return result[this.$props.titleKeys[i]]; + } + } + return 'Untitled'; + }, + }, }); @@ -84,7 +94,7 @@ export default Vue.extend({ - {{ result[titleKey] }} + {{ getTitle(result) }} d) + .text((d) => multiomicsAbbreviations[d]) .attr('fill', 'black') .append('svg:title') .text((s) => props.tooltips[s]); @@ -153,16 +153,12 @@ export default defineComponent({ .attr('height', y.bandwidth()) .attr('fill', root.$vuetify.theme.currentTheme.blue) .classed('upset-bar-clickable', true) - .on('click', (event, values) => { - const value = values.sets.reduce((prev, cur) => { - const next = prev | MultiomicsValue[cur]; //eslint-disable-line no-bitwise - return next; - }, 0); + .on('click', (_event, values) => { const conditions = [{ field: 'multiomics', table: 'biosample', op: 'has', - value, + value: values.sets, }]; emit('select', { conditions }); }); diff --git a/web/src/components/SampleListExpansion.vue b/web/src/components/SampleListExpansion.vue index 239628f5..6ffbcb20 100644 --- a/web/src/components/SampleListExpansion.vue +++ b/web/src/components/SampleListExpansion.vue @@ -45,8 +45,8 @@ export default defineComponent({ const filteredOmicsProcessing = computed(() => Object.entries(groupBy( props.result.omics_processing - .filter((p) => hiddenOmicsTypes.indexOf(p.annotations.omics_type.toLowerCase()) === -1), - (p) => p.annotations.omics_type, + .filter((p) => hiddenOmicsTypes.indexOf(p.omics_type.has_raw_value.toLowerCase()) === -1), + (p) => p.omics_type.has_raw_value, )).sort(([agroup], [bgroup]) => { const ai = buttonOrder.indexOf(agroup.toLowerCase()); const bi = buttonOrder.indexOf(bgroup.toLowerCase()); @@ -86,7 +86,7 @@ export default defineComponent({ v-if="isOpen(projects[0].id)" :key="projects[0].id" 
class="flex-row mt-2" - :omics-processing="projects" + :biosample="result" :omics-type="omicsType" :logged-in-user="loggedInUser" /> diff --git a/web/src/data/api.ts b/web/src/data/api.ts index 438bd920..79b71581 100644 --- a/web/src/data/api.ts +++ b/web/src/data/api.ts @@ -1,4 +1,3 @@ -import { merge } from 'lodash'; import axios from 'axios'; import { setupCache } from 'axios-cache-adapter'; import NmdcSchema from 'nmdc-schema/jsonschema/nmdc.schema.json'; @@ -43,8 +42,7 @@ export interface BaseSearchResult { id: string; name: string; description: string; - alternate_identifiers: string[]; - annotations: Record; + alternative_identifiers: string[]; [key: string]: unknown; // possibly other things. } @@ -62,23 +60,22 @@ export interface DataObjectSearchResult extends BaseSearchResult { selected: boolean; } -export interface DerivedDataResult extends BaseSearchResult { +export interface AnalysisResult extends BaseSearchResult { type: string; git_url: string; started_at_time: string; ended_at_time: string; execution_resource: string; - omics_processing_id: string; - outputs: DataObjectSearchResult[]; + was_informed_by: string; + has_output: string[]; } export interface OmicsProcessingResult extends BaseSearchResult { - study_id: string; add_date: string; mod_date: string; open_in_gold: string; - omics_data: DerivedDataResult[]; - outputs: DataObjectSearchResult[]; // RAW outputs + omics_type: {has_raw_value: string}; + has_output: string[]; // RAW outputs } export interface BiosampleSearchResult extends BaseSearchResult { @@ -107,27 +104,32 @@ export interface BiosampleSearchResult extends BaseSearchResult { label: string; data: string; }; - omics_processing: OmicsProcessingResult[]; + geo_loc_name: { + has_raw_value: string; + } emsl_biosample_identifiers: string[]; + omics_processing: OmicsProcessingResult[]; + activity: AnalysisResult[]; + data_object: DataObjectSearchResult[]; } interface PrincipalInvestigator { name?: string; + has_raw_value?: string; email?: string; orcid?: string; + profile_image_url?: string; } export interface StudySearchResults extends BaseSearchResult { - principal_investigator_websites: string[]; - principal_investigator_name: string; - principal_investigator_image_url: string; - image_url: string; + websites: string[]; + study_image: {url: string}[]; principal_investigator: PrincipalInvestigator; - doi: string; + doi: {has_raw_value: string}; doi_map: Record, - publication_dois: string[]; + publications: string[]; omics_counts: { type: string; count: number; @@ -157,25 +159,25 @@ export interface StudySearchResults extends BaseSearchResult { }[]; } -export interface ReadsQCResult extends DerivedDataResult { +export interface ReadsQCResult extends AnalysisResult { stats: object; has_inputs: string[]; has_output: string[]; } -export interface MetagenomeAssembyResult extends DerivedDataResult { +export interface MetagenomeAssembyResult extends AnalysisResult { stats: object; has_inputs: string[]; has_output: string[]; } -export interface MetagenomeAnnotationResult extends DerivedDataResult { +export interface MetagenomeAnnotationResult extends AnalysisResult { stats: object; has_inputs: string[]; has_output: string[]; } -export type MetaproteomicAnalysisResult = DerivedDataResult; +export type MetaproteomicAnalysisResult = AnalysisResult; export interface UnitSchema { /* https://github.com/microbiomedata/nmdc-server/pull/350 */ @@ -459,38 +461,7 @@ async function getBinnedFacet( async function getDatabaseSummary(): Promise { const { data } = await 
client.get('summary'); - // TODO: fix this on the server - // merge this object with summary response - const mergeSummary = { - biosample: { - attributes: { - gold_classification: { - type: 'sankey-tree', - count: -1, - }, - env_broad_scale: { - type: 'tree', - count: -1, - }, - env_local_scale: { - type: 'tree', - count: -1, - }, - env_medium: { - type: 'tree', - count: -1, - }, - }, - }, - gene_function: { - attributes: { - id: { - type: 'kegg_search', - }, - }, - }, - }; - return merge(data, mergeSummary); + return data; } async function getDatabaseStats() { diff --git a/web/src/encoding.ts b/web/src/encoding.ts index 3cef4b29..cad2576b 100644 --- a/web/src/encoding.ts +++ b/web/src/encoding.ts @@ -18,7 +18,8 @@ export interface FieldsData { group?: string; hideAttr?: boolean; schemaName?: string; // Match the field to the nmsc schema property - encode?: (input: string) => string, + encode?: (input: string) => string; + has_raw_value?: string; } const KeggPrefix = { @@ -66,29 +67,6 @@ function stringIsKegg(v: string) { return Object.values(KeggPrefix).find((item) => v.match(item.pattern)); } -function makeSetsFromBitmask(mask_str: string) { - const mask = parseInt(mask_str, 10); // the bitmask comes in as a string - const sets = []; - - /* eslint-disable no-bitwise */ - if (1 & mask) { - sets.push('NOM'); - } - if ((1 << 4) & mask) { - sets.push('MB'); - } - if ((1 << 2) & mask) { - sets.push('MP'); - } - if ((1 << 1) & mask) { - sets.push('MT'); - } - if ((1 << 3) & mask) { - sets.push('MG'); - } - return sets; -} - const types: Record = { study: { icon: 'mdi-book', @@ -411,20 +389,19 @@ function getField(name: string, table?: entityType): FieldsData { return {}; } -const MultiomicsValue = { - MB: 0b10000, - MG: 0b01000, - MP: 0b00100, - MT: 0b00010, - NOM: 0b00001, +const multiomicsAbbreviations = { + Metagenome: 'MG', + Metatranscriptome: 'MT', + Proteomics: 'MP', + Metabolomics: 'MB', + 'Organic Matter Characterization': 'NOM', }; export { types, ecosystems, - MultiomicsValue, + multiomicsAbbreviations, getField, keggEncode, stringIsKegg, - makeSetsFromBitmask, }; diff --git a/web/src/plugins/utils.ts b/web/src/plugins/utils.ts index 16a60114..b734401b 100644 --- a/web/src/plugins/utils.ts +++ b/web/src/plugins/utils.ts @@ -36,6 +36,7 @@ function parseQuery(q: string) { const u8a = new Uint8Array(atob(b64).split('').map((c) => c.charCodeAt(0))); const msg = QueryParams.decode(u8a); const obj = QueryParams.toObject(msg, { enums: String }); + obj.conditions = obj.conditions ?? []; obj.conditions.forEach((c: Condition) => { // @ts-ignore // eslint-disable-next-line no-param-reassign diff --git a/web/src/util.js b/web/src/util.js index 44c2086b..c9d0cbfb 100644 --- a/web/src/util.js +++ b/web/src/util.js @@ -48,6 +48,9 @@ export function valueDisplayName(field, value) { if (field === 'file_size') { return filesize(value); } + if (value.has_raw_value) { + return `${value.has_raw_value}`; + } return `${value}`; } diff --git a/web/src/views/IndividualResults/IndividualTitle.vue b/web/src/views/IndividualResults/IndividualTitle.vue index 4f12004a..58adbc43 100644 --- a/web/src/views/IndividualResults/IndividualTitle.vue +++ b/web/src/views/IndividualResults/IndividualTitle.vue @@ -36,7 +36,7 @@ export default defineComponent({
- {{ item.annotations.title || item.name }} + {{ item.title || item.name }}
{ - const doiMap = _item?.doi_map; - if (doiMap) { - data.doiCitation = null; - data.publications = []; - data.doiCitation = CitationOverrides[_item.doi] || formatAPA(new Cite(_item.doi)); - data.publications = _item.publication_dois - .filter((doi) => doi in doiMap) - .map((doi) => formatAPA(new Cite(doiMap[doi]))); + data.doiCitation = null; + data.publications = []; + if (_item) { + data.doiCitation = formatAPA(new Cite(_item.doi.has_raw_value)); + data.publications = _item.publications.map((doi) => formatAPA(new Cite(doi))); } }); return { - CitationOverrides, GoldStudyLinkBase, goldLinks, data, @@ -181,7 +169,7 @@ export default defineComponent({