From a12d84391893faccce628b5ae7d47d79b854a52d Mon Sep 17 00:00:00 2001 From: Jing Date: Sat, 11 May 2024 14:19:26 -0400 Subject: [PATCH 01/14] add notesbooks for mongo validation and RDF gen --- .../notebooks/ghissue_401_sparql.ipynb | 623 ++++++++++++++++++ ...ion_referential_integrity-1715162638.ipynb | 491 ++++++++++++++ 2 files changed, 1114 insertions(+) create mode 100644 metadata-translation/notebooks/ghissue_401_sparql.ipynb create mode 100644 metadata-translation/notebooks/repl_validation_referential_integrity-1715162638.ipynb diff --git a/metadata-translation/notebooks/ghissue_401_sparql.ipynb b/metadata-translation/notebooks/ghissue_401_sparql.ipynb new file mode 100644 index 00000000..bb7d6b60 --- /dev/null +++ b/metadata-translation/notebooks/ghissue_401_sparql.ipynb @@ -0,0 +1,623 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "51ea05af-7579-43ad-aa9c-3bf8b6da8fdb", + "metadata": {}, + "source": [ + "# Pipeline to transform the set of nmdc-schema-compliant mongodb collections to an RDF dataset amenable to SPARQL queries." + ] + }, + { + "cell_type": "markdown", + "id": "ae2673a5-560b-47b0-9608-656aa3854466", + "metadata": {}, + "source": [ + "Ensure that changes to the code will be import-able in this notebook without needing restart the kernel and thus lose state." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "a362b42f-7ae0-40cf-91d4-8f19ca1087cf", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "markdown", + "id": "0b8b1fb7-2357-46ef-8d86-69cd1dce228d", + "metadata": {}, + "source": [ + "Connect to local dockerized dev environment." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "55932d03-802f-4efe-bceb-e1036cd35567", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MONGO_HOST=mongodb://localhost:27018\n" + ] + } + ], + "source": [ + "!env | grep MONGO_HOST" + ] + }, + { + "cell_type": "markdown", + "id": "3a146763-f03a-4d65-baa0-81ca15cba689", + "metadata": {}, + "source": [ + "Initialize a db connection." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "bc72113f-5044-4646-a273-0692d2e650ea", + "metadata": {}, + "outputs": [], + "source": [ + "import os" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "edb1bb42-005c-49ca-ba59-18c24833f93f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mongodb://localhost:27018\n", + "success\n" + ] + } + ], + "source": [ + "from nmdc_runtime.api.db.mongo import get_mongo_db\n", + "print(os.getenv(\"MONGO_HOST\"))\n", + "# start 12:23\n", + "mdb = get_mongo_db()\n", + "print(\"success\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "114d9ffa-a22a-48de-9001-d04cbab175eb", + "metadata": {}, + "outputs": [], + "source": [ + "from unittest.mock import patch\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "37dbc9a8-8cac-4798-8d4f-ccbd9c3560e9", + "metadata": {}, + "source": [ + "Get all populated nmdc-schema collections with entity `id`s." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3a0dd489-74cc-47c4-b3e0-c97dd88f5b5f", + "metadata": {}, + "outputs": [], + "source": [ + "from nmdc_runtime.util import schema_collection_names_with_id_field\n", + "\n", + "populated_collections = sorted([\n", + " name for name in set(schema_collection_names_with_id_field()) & set(mdb.list_collection_names())\n", + " if mdb[name].estimated_document_count() > 0\n", + "])" + ] + }, + { + "cell_type": "markdown", + "id": "f9a45de7-ba27-4b18-8ff4-9ba44eeb1091", + "metadata": {}, + "source": [ + "Get a JSON-LD context for the NMDC Schema, to serialize documents to RDF." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9ed72826-b552-4429-8ab5-9f7126821822", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "import json\n", + "from pprint import pprint\n", + "\n", + "from linkml.generators.jsonldcontextgen import ContextGenerator\n", + "from nmdc_schema.nmdc_data import get_nmdc_schema_definition\n", + "\n", + "context = ContextGenerator(get_nmdc_schema_definition())\n", + "context = json.loads(context.serialize())[\"@context\"]\n", + "\n", + "for k, v in list(context.items()):\n", + " if isinstance(v, dict): #and v.get(\"@type\") == \"@id\":\n", + " v.pop(\"@id\", None) # use nmdc uri, not e.g. MIXS uri" + ] + }, + { + "cell_type": "markdown", + "id": "0800c5b9-d09e-4be1-899d-62fcf40a2c0e", + "metadata": {}, + "source": [ + "Ensure `nmdc:type` has a `URIRef` range, i.e. `nmdc:type a owl:ObjectProperty`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "62a68c07-0706-4300-a48d-0ab628af87b1", + "metadata": {}, + "outputs": [], + "source": [ + "context['type'] = {'@type': '@id'}" + ] + }, + { + "cell_type": "markdown", + "id": "63fe4d54-0a41-4170-9310-45e5f47a6cb5", + "metadata": {}, + "source": [ + "Initialize an in-memory graph to store triples, prior to serializing to disk." 
def split_chunk(seq, n: int):
    """
    Yield successive chunks of ``seq``, each of length ``n``.

    The final chunk is NOT padded, so it may be shorter than ``n``.

    >>> list(split_chunk(list(range(10)), 3))
    [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]
    """
    start = 0
    while start < len(seq):
        yield seq[start : start + n]
        start += n
from nmdc_runtime.util import collection_name_to_class_names

def ensure_type(doc, collection_name):
    """Return `doc` guaranteed to carry a "type" field.

    If "type" is absent, infer it from the single schema class associated
    with `collection_name`. Raises if the collection maps to more than one
    class, since the inference would be ambiguous.
    """
    if "type" in doc:
        return doc

    class_names = collection_name_to_class_names[collection_name]
    if len(class_names) > 1:
        raise Exception("cannot unambiguously infer class of document")
    return assoc(doc, "type", class_names[0])


from toolz import assoc, dissoc
from tqdm.notebook import tqdm

# Ingest mongo docs into the in-memory graph via rdflib's JSON-LD parser,
# one chunk at a time. One progress-bar tick per chunk.
chunk_size = 2_000
# Use `chunk_size` for the estimate rather than repeating the literal 2_000.
total = sum((1 + mdb[name].estimated_document_count() // chunk_size) for name in populated_collections)

pbar = tqdm(total=total)

for collection_name in populated_collections:
    print(collection_name)
    docs = [dissoc(doc, "_id") for doc in mdb[collection_name].find()]
    for chunk in split_chunk(docs, chunk_size):
        typed_chunk = [ensure_type(doc, collection_name) for doc in chunk]
        # BUG FIX: the original serialized the untyped `chunk`, so the
        # `typed_chunk` (with inferred "type" fields) never reached the graph.
        doc_jsonld = {"@context": context, "@graph": typed_chunk}
        g.parse(data=json.dumps(doc_jsonld), format="json-ld")
        pbar.update(1)
print(f"{len(g):,} triples loaded")
from rdflib import Namespace, RDF, Literal, URIRef

NMDC = Namespace("https://w3id.org/nmdc/")

# Repair URIs that end with newline characters, which break graph
# serialization.
#
# Fixes over the original:
#   * use rstrip("\n") instead of slicing with [:-2] — "\n" is ONE character,
#     so [:-2] also chopped off the last legitimate character of the URI;
#   * collect the changes during the scan and apply them afterwards, since
#     mutating the graph while iterating over it is unsafe;
#   * fix subject and object in a single pass per triple, so a triple broken
#     in both positions is not re-added with the still-broken subject.
to_remove = []
to_add = []
for s, p, o in tqdm(g, total=len(g)):
    s_fixed = URIRef(str(s).rstrip("\n")) if str(s).endswith("\n") else s
    o_fixed = o
    if isinstance(o, URIRef) and str(o).endswith("\n"):
        o_fixed = URIRef(str(o).rstrip("\n"))
    if s_fixed is not s or o_fixed is not o:
        to_remove.append((s, p, o))
        to_add.append((s_fixed, p, o_fixed))

for triple in to_remove:
    g.remove(triple)
for triple in to_add:
    g.add(triple)


# Gather all schema classes that type a schema-collection entity, together
# with those classes' ancestors ("top-level" classes), so we can find every
# slot that connects one top-level entity to another.
from linkml_runtime.utils.schemaview import SchemaView

from nmdc_runtime.util import nmdc_schema_view, nmdc_database_collection_instance_class_names

schema_view = nmdc_schema_view()
toplevel_classes = set()
for name in nmdc_database_collection_instance_class_names():
    toplevel_classes |= set(schema_view.class_ancestors(name))
# Determine which slots have a "top-level" class as their range, i.e. the
# slots that connect one schema-collection entity to another. Only @id-typed
# JSON-LD context entries denote object references; nmdc:Database-domain
# slots are excluded because they are containment, not connection.
slots = schema_view.all_slots()

toplevel_entity_connectors = set()
for slot_name, ctx_entry in context.items():
    if not (isinstance(ctx_entry, dict) and ctx_entry.get("@type") == "@id"):
        continue
    slot = slots[slot_name]
    if slot.range in toplevel_classes and slot.domain != "Database":
        toplevel_entity_connectors.add(slot_name)
print(toplevel_entity_connectors)


# (Kept for reference) Mermaid classDiagram of entity relationships:
# print("classDiagram\n")
# for slot_name in toplevel_entity_connectors:
#     slot = slots[slot_name]
#     domain = slot.domain or "NamedThing"
#     range = slot.range
#     print(f"{domain} --> {range} : {slot_name}")
#
# print()
#
# inheritance_links = set()
# for cls in toplevel_classes:
#     ancestors = schema_view.class_ancestors(cls)
#     for a in ancestors:
#         if a != cls:
#             inheritance_links.add(f"{a} <|-- {cls}")
#
# for link in inheritance_links:
#     print(link)
from rdflib import PROV  # NOTE(review): imported by the original cell but currently unused

# Assert a generic nmdc:depends_on edge for every connector slot, so the graph
# of top-level entities can be traversed without naming specific slots.
# `has_output` is inverted so dependency always points consumer -> producer.
#
# BUG FIX: the original added triples to `g` while iterating over it, which
# mutates the underlying store mid-iteration; iterate over a snapshot instead.
for s, p, o in tqdm(list(g)):
    connector = p.removeprefix(str(NMDC))
    if connector in toplevel_entity_connectors:
        if connector == "has_output":
            g.add((o, NMDC.depends_on, s))
        else:
            g.add((s, NMDC.depends_on, o))

print(f"{len(g):,} triples in total")


# Materialize superclass relations: for every nmdc:type assertion whose object
# is a top-level class, also assert each of that class's ancestors.
schema_view = nmdc_schema_view()
toplevel_classes = set()
for name in nmdc_database_collection_instance_class_names():
    toplevel_classes |= {getattr(NMDC, a) for a in schema_view.class_ancestors(name)}

nmdc_prefix = str(NMDC)
# Same mid-iteration-mutation fix as above: snapshot before adding.
for s, p, o in tqdm(list(g)):
    if p.removeprefix(nmdc_prefix) != "type":
        continue
    if o not in toplevel_classes:
        continue
    for ancestor in schema_view.class_ancestors(o.removeprefix(nmdc_prefix)):
        g.add((s, NMDC.type, getattr(NMDC, ancestor)))


# Sanity check: count of subjects now typed as nmdc:Activity.
len(list(g.subjects(NMDC.type, NMDC.Activity)))
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "125d2ad4-8433-45d8-86c4-d6a619ea5280", + "metadata": {}, + "outputs": [], + "source": [ + "import gzip\n", + "\n", + "with gzip.open('data/nmdc-db.nt.gz', 'wb') as f:\n", + " f.write(g.serialize(format='nt').encode())" + ] + }, + { + "cell_type": "markdown", + "id": "c5d8c0a2-6f75-4dac-9bb9-ac48838ad2b8", + "metadata": {}, + "source": [ + "Wipe any existing persisted data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ea0bdeee-6b3a-4074-bd73-cc9424569346", + "metadata": {}, + "outputs": [], + "source": [ + "!docker compose up fuseki -d\n", + "!docker exec fuseki rm -rf /fuseki-base/nmdc-db.tdb" + ] + }, + { + "cell_type": "markdown", + "id": "79284de7-ef52-47c6-aeb1-1453bd4b5f59", + "metadata": {}, + "source": [ + "Ensure data is present to load." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9037026c-2653-43e3-bb92-2a0eea85b213", + "metadata": {}, + "outputs": [], + "source": [ + "!docker cp data/nmdc-db.nt.gz fuseki:/fuseki-base/" + ] + }, + { + "cell_type": "markdown", + "id": "4dca86f8-6752-4aba-8d3c-656810f3af3f", + "metadata": {}, + "source": [ + "Take server down in order to bulk-load data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "16f0621c-cf98-4a27-9165-7a0a8711db77", + "metadata": {}, + "outputs": [], + "source": [ + "!docker compose down fuseki" + ] + }, + { + "cell_type": "markdown", + "id": "fa4f9843-d5c0-4f8d-bcaf-ad2cf50c0264", + "metadata": {}, + "source": [ + "Bulk-load data." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a490caff-af8a-4537-8c0b-e4a4752645bc", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "!docker compose run fuseki ./apache-jena-4.9.0/bin/tdbloader --loc /fuseki-base/nmdc-db.tdb /fuseki-base/nmdc-db.nt.gz" + ] + }, + { + "cell_type": "markdown", + "id": "69d0e50c-102a-4a8e-9bcd-ef23600afd66", + "metadata": {}, + "source": [ + "Start up server." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1a0bfb4b-e694-40b1-88af-4446e3fcc888", + "metadata": {}, + "outputs": [], + "source": [ + "!docker compose up fuseki -d" + ] + }, + { + "cell_type": "markdown", + "id": "8e528d6a-76b1-4629-82a1-58793ad6a481", + "metadata": {}, + "source": [ + "Now go to and SPARQL it up." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8695001d-9722-48a0-98e8-9ac5000551ea", + "metadata": {}, + "outputs": [], + "source": [ + "# 2024-03-14T09:40 : took <4min to run all the above." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/metadata-translation/notebooks/repl_validation_referential_integrity-1715162638.ipynb b/metadata-translation/notebooks/repl_validation_referential_integrity-1715162638.ipynb new file mode 100644 index 00000000..a4dca006 --- /dev/null +++ b/metadata-translation/notebooks/repl_validation_referential_integrity-1715162638.ipynb @@ -0,0 +1,491 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "2a66b2dc", + "metadata": {}, + "source": [ + "# imports" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "f1c8bdb5", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "f7ff0664-1881-4eca-b018-4c5856dc2489", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "from linkml_runtime.utils.schemaview import SchemaView\n", + "from toolz import dissoc, assoc\n", + "from tqdm.notebook import tqdm\n", + "\n", + "from nmdc_runtime.api.db.mongo import get_mongo_db, nmdc_schema_collection_names\n", + "from nmdc_runtime.util import collection_name_to_class_names, nmdc_schema_view, nmdc_database_collection_instance_class_names\n", + "from nmdc_schema.nmdc_schema_accepting_legacy_ids import Database as NMDCDatabase\n", + "from nmdc_schema.get_nmdc_view import ViewGetter\n", + "\n", + "mdb = get_mongo_db()" + ] + }, + { + "cell_type": "markdown", + "id": "bcb5802b-8205-49b7-8784-dc137baff1a0", + "metadata": {}, + "source": [ + "# \"pre-cleaning\"" + ] + }, + { + "cell_type": "markdown", + 
"id": "8ecb1950-eaec-469c-b7ac-949650825093", + "metadata": {}, + "source": [ + "Only consider populated collections with `id` field." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "dde4c77e-5e06-4751-930a-95906cdf89c5", + "metadata": {}, + "outputs": [], + "source": [ + "collection_names = sorted(nmdc_schema_collection_names(mdb))\n", + "collection_names = [n for n in collection_names if mdb[n].find_one({\"id\": {\"$exists\": True}})]" + ] + }, + { + "cell_type": "markdown", + "id": "cddaaa54-262d-4549-a9a9-4c280a6a6341", + "metadata": {}, + "source": [ + "Remove null-valued optional properties" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "b71ba7d2-ebd2-487d-a5cc-2a85ee14cb95", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "7c9c772648214f1faec08df226b7b44b", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/18 [00:00 10\u001b[0m nmdcdb \u001b[38;5;241m=\u001b[39m \u001b[43mNMDCDatabase\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43m{\u001b[49m\u001b[43mcoll_name\u001b[49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43m[\u001b[49m\u001b[43mdissoc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmdb\u001b[49m\u001b[43m[\u001b[49m\u001b[43mcoll_name\u001b[49m\u001b[43m]\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfind_one\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43m_id\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m]\u001b[49m\u001b[43m}\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 11\u001b[0m \u001b[38;5;66;03m# except ValueError as e:\u001b[39;00m\n\u001b[1;32m 12\u001b[0m \u001b[38;5;66;03m# print(f\"no {coll_name}!\")\u001b[39;00m\n\u001b[1;32m 13\u001b[0m \u001b[38;5;66;03m# raise e\u001b[39;00m\n\u001b[1;32m 14\u001b[0m exemplar 
\u001b[38;5;241m=\u001b[39m \u001b[38;5;28mgetattr\u001b[39m(nmdcdb, coll_name)[\u001b[38;5;241m0\u001b[39m]\n", + "File \u001b[0;32m:28\u001b[0m, in \u001b[0;36m__init__\u001b[0;34m(self, planned_process_set, functional_annotation_agg, activity_set, biosample_set, collecting_biosamples_from_site_set, data_object_set, extraction_set, field_research_site_set, functional_annotation_set, genome_feature_set, library_preparation_set, mags_activity_set, metabolomics_analysis_activity_set, metagenome_annotation_activity_set, metagenome_assembly_set, metagenome_sequencing_activity_set, metaproteomics_analysis_activity_set, metatranscriptome_activity_set, nom_analysis_activity_set, omics_processing_set, pooling_set, processed_sample_set, read_based_taxonomy_analysis_activity_set, read_qc_analysis_activity_set, study_set, **_kwargs)\u001b[0m\n", + "File \u001b[0;32m~/nmdc/nmdc-runtime/venv/lib/python3.10/site-packages/nmdc_schema/nmdc_schema_accepting_legacy_ids.py:595\u001b[0m, in \u001b[0;36mDatabase.__post_init__\u001b[0;34m(self, *_, **kwargs)\u001b[0m\n\u001b[1;32m 591\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_normalize_inlined_as_list(slot_name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcollecting_biosamples_from_site_set\u001b[39m\u001b[38;5;124m\"\u001b[39m, slot_type\u001b[38;5;241m=\u001b[39mCollectingBiosamplesFromSite, key_name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mid\u001b[39m\u001b[38;5;124m\"\u001b[39m, keyed\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[1;32m 593\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_normalize_inlined_as_list(slot_name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdata_object_set\u001b[39m\u001b[38;5;124m\"\u001b[39m, slot_type\u001b[38;5;241m=\u001b[39mDataObject, key_name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mid\u001b[39m\u001b[38;5;124m\"\u001b[39m, 
keyed\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[0;32m--> 595\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_normalize_inlined_as_list\u001b[49m\u001b[43m(\u001b[49m\u001b[43mslot_name\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mextraction_set\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mslot_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mExtraction\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkey_name\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mid\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkeyed\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[1;32m 597\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_normalize_inlined_as_list(slot_name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfield_research_site_set\u001b[39m\u001b[38;5;124m\"\u001b[39m, slot_type\u001b[38;5;241m=\u001b[39mFieldResearchSite, key_name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mid\u001b[39m\u001b[38;5;124m\"\u001b[39m, keyed\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[1;32m 599\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfunctional_annotation_agg, \u001b[38;5;28mlist\u001b[39m):\n", + "File \u001b[0;32m~/nmdc/nmdc-runtime/venv/lib/python3.10/site-packages/linkml_runtime/utils/yamlutils.py:97\u001b[0m, in \u001b[0;36mYAMLRoot._normalize_inlined_as_list\u001b[0;34m(self, slot_name, slot_type, key_name, keyed)\u001b[0m\n\u001b[1;32m 96\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m 
\u001b[38;5;21m_normalize_inlined_as_list\u001b[39m(\u001b[38;5;28mself\u001b[39m, slot_name: \u001b[38;5;28mstr\u001b[39m, slot_type: Type, key_name: \u001b[38;5;28mstr\u001b[39m, keyed: \u001b[38;5;28mbool\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m---> 97\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_normalize_inlined\u001b[49m\u001b[43m(\u001b[49m\u001b[43mslot_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mslot_type\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkey_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkeyed\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/nmdc/nmdc-runtime/venv/lib/python3.10/site-packages/linkml_runtime/utils/yamlutils.py:182\u001b[0m, in \u001b[0;36mYAMLRoot._normalize_inlined\u001b[0;34m(self, slot_name, slot_type, key_name, keyed, is_list)\u001b[0m\n\u001b[1;32m 179\u001b[0m form_1(list_entry)\n\u001b[1;32m 180\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 181\u001b[0m \u001b[38;5;66;03m# **kwargs\u001b[39;00m\n\u001b[0;32m--> 182\u001b[0m cooked_obj \u001b[38;5;241m=\u001b[39m \u001b[43mslot_type\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mas_dict\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlist_entry\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 183\u001b[0m order_up(cooked_obj[key_name], cooked_obj)\n\u001b[1;32m 184\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(list_entry, \u001b[38;5;28mlist\u001b[39m):\n\u001b[1;32m 185\u001b[0m \u001b[38;5;66;03m# *args\u001b[39;00m\n", + "File \u001b[0;32m:23\u001b[0m, in \u001b[0;36m__init__\u001b[0;34m(self, id, name, description, alternative_identifiers, designated_class, end_date, has_input, has_output, 
processing_institution, protocol_link, start_date, instrument_name, qc_status, qc_comment, has_failure_categorization, extractant, extraction_method, extraction_target, input_mass, volume, **_kwargs)\u001b[0m\n", + "File \u001b[0;32m~/nmdc/nmdc-runtime/venv/lib/python3.10/site-packages/nmdc_schema/nmdc_schema_accepting_legacy_ids.py:3932\u001b[0m, in \u001b[0;36mExtraction.__post_init__\u001b[0;34m(self, *_, **kwargs)\u001b[0m\n\u001b[1;32m 3929\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mvolume \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mvolume, QuantityValue):\n\u001b[1;32m 3930\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mvolume \u001b[38;5;241m=\u001b[39m QuantityValue(\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mas_dict(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mvolume))\n\u001b[0;32m-> 3932\u001b[0m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m__post_init__\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3933\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdesignated_class \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mstr\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mclass_class_curie)\n", + "File \u001b[0;32m~/nmdc/nmdc-runtime/venv/lib/python3.10/site-packages/nmdc_schema/nmdc_schema_accepting_legacy_ids.py:3849\u001b[0m, in \u001b[0;36mPlannedProcess.__post_init__\u001b[0;34m(self, *_, **kwargs)\u001b[0m\n\u001b[1;32m 3846\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhas_failure_categorization 
\u001b[38;5;241m=\u001b[39m [\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhas_failure_categorization] \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhas_failure_categorization \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m []\n\u001b[1;32m 3847\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhas_failure_categorization \u001b[38;5;241m=\u001b[39m [v \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(v, FailureCategorization) \u001b[38;5;28;01melse\u001b[39;00m FailureCategorization(\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mas_dict(v)) \u001b[38;5;28;01mfor\u001b[39;00m v \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhas_failure_categorization]\n\u001b[0;32m-> 3849\u001b[0m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m__post_init__\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3850\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdesignated_class \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mstr\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mclass_class_curie)\n", + "File \u001b[0;32m~/nmdc/nmdc-runtime/venv/lib/python3.10/site-packages/nmdc_schema/nmdc_schema_accepting_legacy_ids.py:828\u001b[0m, in \u001b[0;36mNamedThing.__post_init__\u001b[0;34m(self, *_, **kwargs)\u001b[0m\n\u001b[1;32m 825\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39malternative_identifiers \u001b[38;5;241m=\u001b[39m [\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39malternative_identifiers] \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39malternative_identifiers 
\u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m []\n\u001b[1;32m 826\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39malternative_identifiers \u001b[38;5;241m=\u001b[39m [v \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(v, URIorCURIE) \u001b[38;5;28;01melse\u001b[39;00m URIorCURIE(v) \u001b[38;5;28;01mfor\u001b[39;00m v \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39malternative_identifiers]\n\u001b[0;32m--> 828\u001b[0m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m__post_init__\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/nmdc/nmdc-runtime/venv/lib/python3.10/site-packages/linkml_runtime/utils/yamlutils.py:48\u001b[0m, in \u001b[0;36mYAMLRoot.__post_init__\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 46\u001b[0m v \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mrepr\u001b[39m(kwargs[k])[:\u001b[38;5;241m40\u001b[39m]\u001b[38;5;241m.\u001b[39mreplace(\u001b[38;5;124m'\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;130;01m\\\\\u001b[39;00m\u001b[38;5;124mn\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m 47\u001b[0m messages\u001b[38;5;241m.\u001b[39mappend(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mTypedNode\u001b[38;5;241m.\u001b[39myaml_loc(k)\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m Unknown argument: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mk\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m = \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mv\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m---> 48\u001b[0m 
# Rebuild the `alldocs` collection: one document per entity, with "type"
# replaced by the entity's full class-hierarchy list so type queries can
# match any ancestor class.
mdb.alldocs.drop()

n_docs_total = sum(mdb[name].estimated_document_count() for name in collection_names)
pbar = tqdm(total=n_docs_total)

for coll_name in collection_names:
    pbar.set_description(f"processing {coll_name}...")
    # Round-trip one exemplar doc through the (legacy-id-tolerant) schema
    # Database to resolve the collection's class.
    exemplar_doc = mdb[coll_name].find_one()
    if exemplar_doc is None:
        # Collection emptied since `collection_names` was computed — skip it
        # rather than crash on dissoc(None, ...).
        continue
    nmdcdb = NMDCDatabase(**{coll_name: [dissoc(exemplar_doc, "_id")]})
    exemplar = getattr(nmdcdb, coll_name)[0]
    # assumes class_hierarchy_as_list is defined/imported elsewhere in the
    # notebook session — TODO confirm
    newdoc_type = class_hierarchy_as_list(exemplar)
    mdb.alldocs.insert_many(
        [assoc(dissoc(doc, "type", "_id"), "type", newdoc_type) for doc in mdb[coll_name].find()]
    )
    pbar.update(mdb[coll_name].estimated_document_count())

pbar.close()
# NOTE: duplicate id nmdc:0078a0f981ad3f92693c2bc3b6470791 prevents
# create_index("id", unique=True) here.
mdb.alldocs.create_index("id")
print("refreshed `alldocs` collection")
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "a2dbaf22-46e9-4de7-8288-05bc8cd2e5f8", + "metadata": {}, + "outputs": [], + "source": [ + "nmdc_view = nmdc_schema_view()\n", + "toplevel_classes = set()\n", + "for name in nmdc_database_collection_instance_class_names():\n", + " toplevel_classes |= set(nmdc_view.class_ancestors(name))" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "969e13e0-25c0-4623-bcab-93097132924b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'Activity',\n", + " 'Biosample',\n", + " 'BiosampleProcessing',\n", + " 'CollectingBiosamplesFromSite',\n", + " 'DataObject',\n", + " 'Extraction',\n", + " 'FieldResearchSite',\n", + " 'FunctionalAnnotation',\n", + " 'FunctionalAnnotationAggMember',\n", + " 'GenomeFeature',\n", + " 'LibraryPreparation',\n", + " 'MagsAnalysisActivity',\n", + " 'MaterialEntity',\n", + " 'MetabolomicsAnalysisActivity',\n", + " 'MetagenomeAnnotationActivity',\n", + " 'MetagenomeAssembly',\n", + " 'MetagenomeSequencingActivity',\n", + " 'MetaproteomicsAnalysisActivity',\n", + " 'MetatranscriptomeActivity',\n", + " 'NamedThing',\n", + " 'NomAnalysisActivity',\n", + " 'OmicsProcessing',\n", + " 'PlannedProcess',\n", + " 'Pooling',\n", + " 'ProcessedSample',\n", + " 'ReadBasedTaxonomyAnalysisActivity',\n", + " 'ReadQcAnalysisActivity',\n", + " 'Site',\n", + " 'Study',\n", + " 'WorkflowExecutionActivity'}" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "toplevel_classes" + ] + }, + { + "cell_type": "markdown", + "id": "8645690e-7a9d-4f1e-8e62-0cbdde825890", + "metadata": {}, + "source": [ + "Referential integrity checking:\n", + "- \"naive\" errors collected in `not_found` list\n", + "- (hierarchy-aware) type errors (doc found, but of invalid type) collected in `invalid_type` list" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "103d70b6-24ab-41bd-8b7f-d2faaa028bdf", + 
"metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "errors = {\"not_found\": [], \"invalid_type\": []}\n", + "\n", + "n_docs_total = sum(mdb[name].estimated_document_count() for name in collection_names)\n", + "pbar = tqdm(total=n_docs_total)\n", + "\n", + "for name in sorted(collection_names):\n", + " cls_name = collection_name_to_class_names[name][0]\n", + " slot_map = {\n", + " slot.name: slot\n", + " for slot in nmdc_view.class_induced_slots(cls_name)\n", + " }\n", + " pbar.set_description(f\"processing {name}...\")\n", + " for doc in mdb[name].find():\n", + " doc = dissoc(doc, \"_id\")\n", + " for field, value in doc.items():\n", + " assert field in slot_map, f\"{name} doc {doc['id']}: field {field} not a valid slot\"\n", + " slot_range = str(slot_map[field].range)\n", + " assert slot_range, type(slot_range)\n", + " if not slot_range in toplevel_classes:\n", + " continue\n", + " if not isinstance(value, list):\n", + " value = [value]\n", + " for v in value:\n", + " if mdb.alldocs.find_one({\"id\": v}, [\"_id\"]) is None:\n", + " errors[\"not_found\"].append(f\"{name} doc {doc['id']}: field {field} referenced doc {v} not found\")\n", + " elif mdb.alldocs.find_one({\"id\": v, \"type\": slot_range}, [\"_id\"]) is None:\n", + " errors[\"invalid_type\"].append(f\"{name} doc {doc['id']}: field {field} referenced doc {v} not of type {slot_range}\")\n", + " pbar.update(1)\n", + "pbar.close() " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e01450d1-3369-4fc5-80be-9787e00a6597", + "metadata": {}, + "outputs": [], + "source": [ + "len(errors[\"not_found\"]), len(errors[\"invalid_type\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a25857f4-e26e-4896-9e5f-607e7b4bb07c", + "metadata": {}, + "outputs": [], + "source": [ + "errors[\"not_found\"][:5]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "855e232d-0e94-428e-96eb-0535c5135bee", + "metadata": {}, + "outputs": [], + 
"source": [ + "mdb.alldocs.find_one({\"id\": \"nmdc:mga0vx38\"}) is None" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "33516e3c-f10d-4c30-942b-0d01d06082f9", + "metadata": {}, + "outputs": [], + "source": [ + "errors[\"invalid_type\"][:5]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "29ec7e82-d079-4525-bd7b-d770fd69d788", + "metadata": {}, + "outputs": [], + "source": [ + "# OmicsProcessing is not subclass of Activity (!)\n", + "mdb.alldocs.find_one({\"id\": \"emsl:570856\"})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "690ea8f8-05be-4d0a-aaa4-5c04aa4c640c", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 71eec27a9baf424e23c3e4758aacc33524a22e38 Mon Sep 17 00:00:00 2001 From: Jing Date: Sat, 11 May 2024 14:26:54 -0400 Subject: [PATCH 02/14] add .tar .agz to gitignore --- .gitignore | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.gitignore b/.gitignore index ed102ea7..a15a9316 100644 --- a/.gitignore +++ b/.gitignore @@ -37,6 +37,10 @@ MANIFEST pip-log.txt pip-delete-this-directory.txt +# mongo-restore +*.tar +*.agz + # Unit test / coverage reports htmlcov/ .tox/ @@ -55,6 +59,8 @@ coverage.xml *.mo *.pot + + # Django stuff: *.log local_settings.py @@ -103,6 +109,7 @@ celerybeat.pid # Environments .env +.env.localhost .venv env/ venv/ From 3e039733341db0a6b65497646b697fe11d64cec3 Mon Sep 17 00:00:00 2001 From: Jing Date: Sat, 11 May 2024 14:51:13 -0400 Subject: [PATCH 03/14] add setup instructions --- 
...ion_referential_integrity-1715162638.ipynb | 96 ++++++++----------- 1 file changed, 40 insertions(+), 56 deletions(-) diff --git a/metadata-translation/notebooks/repl_validation_referential_integrity-1715162638.ipynb b/metadata-translation/notebooks/repl_validation_referential_integrity-1715162638.ipynb index a4dca006..e422ca42 100644 --- a/metadata-translation/notebooks/repl_validation_referential_integrity-1715162638.ipynb +++ b/metadata-translation/notebooks/repl_validation_referential_integrity-1715162638.ipynb @@ -8,6 +8,18 @@ "# imports" ] }, + { + "cell_type": "markdown", + "id": "f52d1cd4-ca97-4f43-8923-a10847e86d4b", + "metadata": {}, + "source": [ + "Before running this notebook, make sure you have done the following:\n", + "- `make up-dev` has been run and mongo is mapped to `localhost:27018`\n", + "- a recent dump of the production mongo database has been loaded to `localhost:27018` (see `make mongorestore-nmdc-dev` for an example)\n", + "- .env has updated `MONGO_HOST` to `mongodb://localhost:27018`\n", + "- `export $(grep -v '^#' .env | xargs)` has been run in the shell before running `jupyter notebook`\n" + ] + }, { "cell_type": "code", "execution_count": 1, @@ -15,6 +27,7 @@ "metadata": {}, "outputs": [], "source": [ + "# enable automatic reloading of modules before executing code\n", "%load_ext autoreload\n", "%autoreload 2" ] @@ -84,7 +97,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "7c9c772648214f1faec08df226b7b44b", + "model_id": "2b29e7fd07ac46a1965108fe9b1f4531", "version_major": 2, "version_minor": 0 }, @@ -181,38 +194,7 @@ }, { "cell_type": "code", - "execution_count": 10, - "id": "ae8a6da2-6194-4aa5-aa36-8b21f5942b40", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'_id': ObjectId('64b59413fe178b5f0339ca41'),\n", - " 'end_date': '2018-05-08',\n", - " 'has_input': ['nmdc:procsm-11-dha8mw20'],\n", - " 'has_output': ['nmdc:procsm-11-xb11xa62'],\n", - " 'id': 
'nmdc:extrp-11-k5fecy41',\n", - " 'processing_institution': 'Battelle',\n", - " 'quality_control_report': {'status': 'pass'},\n", - " 'start_date': '2017-06-07T20:26Z',\n", - " 'extraction_target': 'DNA',\n", - " 'input_mass': {'has_numeric_value': 0.25, 'has_unit': 'g'}}" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "mdb.extraction_set.estimated_document_count()\n", - "mdb.extraction_set.find_one()" - ] - }, - { - "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "id": "b2e618f3-78b9-42b6-8ea9-63d080b1b0f6", "metadata": { "scrolled": true @@ -221,35 +203,22 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "a55f9c6b3933449897439966b9e5b1b7", + "model_id": "dcc57739dcdf47058fb4246f3e929aef", "version_major": 2, "version_minor": 0 }, "text/plain": [ - " 0%| | 0/171332 [00:00 10\u001b[0m nmdcdb \u001b[38;5;241m=\u001b[39m \u001b[43mNMDCDatabase\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43m{\u001b[49m\u001b[43mcoll_name\u001b[49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43m[\u001b[49m\u001b[43mdissoc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmdb\u001b[49m\u001b[43m[\u001b[49m\u001b[43mcoll_name\u001b[49m\u001b[43m]\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfind_one\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43m_id\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m]\u001b[49m\u001b[43m}\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 11\u001b[0m \u001b[38;5;66;03m# except ValueError as e:\u001b[39;00m\n\u001b[1;32m 12\u001b[0m \u001b[38;5;66;03m# print(f\"no {coll_name}!\")\u001b[39;00m\n\u001b[1;32m 13\u001b[0m \u001b[38;5;66;03m# raise e\u001b[39;00m\n\u001b[1;32m 14\u001b[0m exemplar \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mgetattr\u001b[39m(nmdcdb, 
coll_name)[\u001b[38;5;241m0\u001b[39m]\n", - "File \u001b[0;32m:28\u001b[0m, in \u001b[0;36m__init__\u001b[0;34m(self, planned_process_set, functional_annotation_agg, activity_set, biosample_set, collecting_biosamples_from_site_set, data_object_set, extraction_set, field_research_site_set, functional_annotation_set, genome_feature_set, library_preparation_set, mags_activity_set, metabolomics_analysis_activity_set, metagenome_annotation_activity_set, metagenome_assembly_set, metagenome_sequencing_activity_set, metaproteomics_analysis_activity_set, metatranscriptome_activity_set, nom_analysis_activity_set, omics_processing_set, pooling_set, processed_sample_set, read_based_taxonomy_analysis_activity_set, read_qc_analysis_activity_set, study_set, **_kwargs)\u001b[0m\n", - "File \u001b[0;32m~/nmdc/nmdc-runtime/venv/lib/python3.10/site-packages/nmdc_schema/nmdc_schema_accepting_legacy_ids.py:595\u001b[0m, in \u001b[0;36mDatabase.__post_init__\u001b[0;34m(self, *_, **kwargs)\u001b[0m\n\u001b[1;32m 591\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_normalize_inlined_as_list(slot_name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcollecting_biosamples_from_site_set\u001b[39m\u001b[38;5;124m\"\u001b[39m, slot_type\u001b[38;5;241m=\u001b[39mCollectingBiosamplesFromSite, key_name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mid\u001b[39m\u001b[38;5;124m\"\u001b[39m, keyed\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[1;32m 593\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_normalize_inlined_as_list(slot_name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdata_object_set\u001b[39m\u001b[38;5;124m\"\u001b[39m, slot_type\u001b[38;5;241m=\u001b[39mDataObject, key_name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mid\u001b[39m\u001b[38;5;124m\"\u001b[39m, 
keyed\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[0;32m--> 595\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_normalize_inlined_as_list\u001b[49m\u001b[43m(\u001b[49m\u001b[43mslot_name\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mextraction_set\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mslot_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mExtraction\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkey_name\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mid\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkeyed\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[1;32m 597\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_normalize_inlined_as_list(slot_name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfield_research_site_set\u001b[39m\u001b[38;5;124m\"\u001b[39m, slot_type\u001b[38;5;241m=\u001b[39mFieldResearchSite, key_name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mid\u001b[39m\u001b[38;5;124m\"\u001b[39m, keyed\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[1;32m 599\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfunctional_annotation_agg, \u001b[38;5;28mlist\u001b[39m):\n", - "File \u001b[0;32m~/nmdc/nmdc-runtime/venv/lib/python3.10/site-packages/linkml_runtime/utils/yamlutils.py:97\u001b[0m, in \u001b[0;36mYAMLRoot._normalize_inlined_as_list\u001b[0;34m(self, slot_name, slot_type, key_name, keyed)\u001b[0m\n\u001b[1;32m 96\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m 
\u001b[38;5;21m_normalize_inlined_as_list\u001b[39m(\u001b[38;5;28mself\u001b[39m, slot_name: \u001b[38;5;28mstr\u001b[39m, slot_type: Type, key_name: \u001b[38;5;28mstr\u001b[39m, keyed: \u001b[38;5;28mbool\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m---> 97\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_normalize_inlined\u001b[49m\u001b[43m(\u001b[49m\u001b[43mslot_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mslot_type\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkey_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkeyed\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/nmdc/nmdc-runtime/venv/lib/python3.10/site-packages/linkml_runtime/utils/yamlutils.py:182\u001b[0m, in \u001b[0;36mYAMLRoot._normalize_inlined\u001b[0;34m(self, slot_name, slot_type, key_name, keyed, is_list)\u001b[0m\n\u001b[1;32m 179\u001b[0m form_1(list_entry)\n\u001b[1;32m 180\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 181\u001b[0m \u001b[38;5;66;03m# **kwargs\u001b[39;00m\n\u001b[0;32m--> 182\u001b[0m cooked_obj \u001b[38;5;241m=\u001b[39m \u001b[43mslot_type\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mas_dict\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlist_entry\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 183\u001b[0m order_up(cooked_obj[key_name], cooked_obj)\n\u001b[1;32m 184\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(list_entry, \u001b[38;5;28mlist\u001b[39m):\n\u001b[1;32m 185\u001b[0m \u001b[38;5;66;03m# *args\u001b[39;00m\n", - "File \u001b[0;32m:23\u001b[0m, in \u001b[0;36m__init__\u001b[0;34m(self, id, name, description, alternative_identifiers, designated_class, end_date, has_input, has_output, 
processing_institution, protocol_link, start_date, instrument_name, qc_status, qc_comment, has_failure_categorization, extractant, extraction_method, extraction_target, input_mass, volume, **_kwargs)\u001b[0m\n", - "File \u001b[0;32m~/nmdc/nmdc-runtime/venv/lib/python3.10/site-packages/nmdc_schema/nmdc_schema_accepting_legacy_ids.py:3932\u001b[0m, in \u001b[0;36mExtraction.__post_init__\u001b[0;34m(self, *_, **kwargs)\u001b[0m\n\u001b[1;32m 3929\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mvolume \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mvolume, QuantityValue):\n\u001b[1;32m 3930\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mvolume \u001b[38;5;241m=\u001b[39m QuantityValue(\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mas_dict(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mvolume))\n\u001b[0;32m-> 3932\u001b[0m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m__post_init__\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3933\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdesignated_class \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mstr\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mclass_class_curie)\n", - "File \u001b[0;32m~/nmdc/nmdc-runtime/venv/lib/python3.10/site-packages/nmdc_schema/nmdc_schema_accepting_legacy_ids.py:3849\u001b[0m, in \u001b[0;36mPlannedProcess.__post_init__\u001b[0;34m(self, *_, **kwargs)\u001b[0m\n\u001b[1;32m 3846\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhas_failure_categorization 
\u001b[38;5;241m=\u001b[39m [\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhas_failure_categorization] \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhas_failure_categorization \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m []\n\u001b[1;32m 3847\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhas_failure_categorization \u001b[38;5;241m=\u001b[39m [v \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(v, FailureCategorization) \u001b[38;5;28;01melse\u001b[39;00m FailureCategorization(\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mas_dict(v)) \u001b[38;5;28;01mfor\u001b[39;00m v \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhas_failure_categorization]\n\u001b[0;32m-> 3849\u001b[0m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m__post_init__\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3850\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdesignated_class \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mstr\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mclass_class_curie)\n", - "File \u001b[0;32m~/nmdc/nmdc-runtime/venv/lib/python3.10/site-packages/nmdc_schema/nmdc_schema_accepting_legacy_ids.py:828\u001b[0m, in \u001b[0;36mNamedThing.__post_init__\u001b[0;34m(self, *_, **kwargs)\u001b[0m\n\u001b[1;32m 825\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39malternative_identifiers \u001b[38;5;241m=\u001b[39m [\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39malternative_identifiers] \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39malternative_identifiers 
\u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m []\n\u001b[1;32m 826\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39malternative_identifiers \u001b[38;5;241m=\u001b[39m [v \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(v, URIorCURIE) \u001b[38;5;28;01melse\u001b[39;00m URIorCURIE(v) \u001b[38;5;28;01mfor\u001b[39;00m v \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39malternative_identifiers]\n\u001b[0;32m--> 828\u001b[0m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m__post_init__\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/nmdc/nmdc-runtime/venv/lib/python3.10/site-packages/linkml_runtime/utils/yamlutils.py:48\u001b[0m, in \u001b[0;36mYAMLRoot.__post_init__\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 46\u001b[0m v \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mrepr\u001b[39m(kwargs[k])[:\u001b[38;5;241m40\u001b[39m]\u001b[38;5;241m.\u001b[39mreplace(\u001b[38;5;124m'\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;130;01m\\\\\u001b[39;00m\u001b[38;5;124mn\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m 47\u001b[0m messages\u001b[38;5;241m.\u001b[39mappend(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mTypedNode\u001b[38;5;241m.\u001b[39myaml_loc(k)\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m Unknown argument: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mk\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m = \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mv\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m---> 48\u001b[0m 
\u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124m'\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;241m.\u001b[39mjoin(messages))\n", - "\u001b[0;31mValueError\u001b[0m: Unknown argument: quality_control_report = {'status': 'pass'}" + "name": "stdout", + "output_type": "stream", + "text": [ + "refreshed `alldocs` collection\n" ] } ], @@ -296,7 +265,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "id": "a2dbaf22-46e9-4de7-8288-05bc8cd2e5f8", "metadata": {}, "outputs": [], @@ -309,7 +278,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "id": "969e13e0-25c0-4623-bcab-93097132924b", "metadata": {}, "outputs": [ @@ -348,7 +317,7 @@ " 'WorkflowExecutionActivity'}" ] }, - "execution_count": 9, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -374,7 +343,22 @@ "metadata": { "scrolled": true }, - "outputs": [], + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "5b6ac6cb87b44c28aa65e77f28e5900f", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/224995 [00:00 Date: Sat, 11 May 2024 15:19:41 -0400 Subject: [PATCH 04/14] add comments and formatting --- ...ion_referential_integrity-1715162638.ipynb | 159 +++++++++++++----- 1 file changed, 116 insertions(+), 43 deletions(-) diff --git a/metadata-translation/notebooks/repl_validation_referential_integrity-1715162638.ipynb b/metadata-translation/notebooks/repl_validation_referential_integrity-1715162638.ipynb index e422ca42..44e2f661 100644 --- a/metadata-translation/notebooks/repl_validation_referential_integrity-1715162638.ipynb +++ b/metadata-translation/notebooks/repl_validation_referential_integrity-1715162638.ipynb @@ -110,6 +110,7 @@ } ], "source": [ + "# check these slots for null values for all docs in collection_names\n", "props = [\"used\", \"git_url\", \"was_associated_with\", 
\"was_generated_by\", \"compression_type\",]\n", "\n", "pbar = tqdm(total=len(collection_names))\n", @@ -158,7 +159,7 @@ "id": "5ed95ee0-03b7-4dff-80e7-92a2b24bccf4", "metadata": {}, "source": [ - "Define helper function." + "Define a helper function that takes a document and returns its class and all parent classes as a list" ] }, { @@ -194,7 +195,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 12, "id": "b2e618f3-78b9-42b6-8ea9-63d080b1b0f6", "metadata": { "scrolled": true @@ -203,7 +204,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "dcc57739dcdf47058fb4246f3e929aef", + "model_id": "e69c4fd820114e33b11ebae47f9f3e4d", "version_major": 2, "version_minor": 0 }, @@ -223,22 +224,28 @@ } ], "source": [ + "# drop any previously generated alldocs collection\n", "mdb.alldocs.drop()\n", "\n", + "# progress bar set-up\n", "n_docs_total = sum(mdb[name].estimated_document_count() for name in collection_names)\n", "pbar = tqdm(total=n_docs_total)\n", "\n", - "#- for each collection name\n", + "# for each collection name\n", "for coll_name in collection_names:\n", " pbar.set_description(f\"processing {coll_name}...\")\n", - " # try:\n", - " nmdcdb = NMDCDatabase(**{coll_name: [dissoc(mdb[coll_name].find_one(), '_id')]})\n", - " # except ValueError as e:\n", - " # print(f\"no {coll_name}!\")\n", - " # raise e\n", + " # for each doc in collection dissociate mongo-generated '_id' field\n", + " try:\n", + " nmdcdb = NMDCDatabase(**{coll_name: [dissoc(mdb[coll_name].find_one(), '_id')]})\n", + " except ValueError as e:\n", + " print(f\"no {coll_name}!\")\n", + " raise e\n", + " # calculate class_hierarchy_as_list once per collection \n", " exemplar = getattr(nmdcdb, coll_name)[0]\n", " newdoc_type = class_hierarchy_as_list(exemplar)\n", " # for each doc in collection\n", + " # replace string value for 'type' with a class_hierarchy_as_list\n", + " # and insert modified doc into materialized alldocs collection\n", " 
mdb.alldocs.insert_many([assoc(dissoc(doc, 'type', '_id'), 'type', newdoc_type) for doc in mdb[coll_name].find()])\n", " pbar.update(mdb[coll_name].estimated_document_count())\n", "\n", @@ -265,22 +272,9 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 14, "id": "a2dbaf22-46e9-4de7-8288-05bc8cd2e5f8", "metadata": {}, - "outputs": [], - "source": [ - "nmdc_view = nmdc_schema_view()\n", - "toplevel_classes = set()\n", - "for name in nmdc_database_collection_instance_class_names():\n", - " toplevel_classes |= set(nmdc_view.class_ancestors(name))" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "969e13e0-25c0-4623-bcab-93097132924b", - "metadata": {}, "outputs": [ { "data": { @@ -317,12 +311,17 @@ " 'WorkflowExecutionActivity'}" ] }, - "execution_count": 10, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "nmdc_view = nmdc_schema_view()\n", + "toplevel_classes = set()\n", + "for name in nmdc_database_collection_instance_class_names():\n", + " toplevel_classes |= set(nmdc_view.class_ancestors(name))\n", + "\n", "toplevel_classes" ] }, @@ -331,7 +330,7 @@ "id": "8645690e-7a9d-4f1e-8e62-0cbdde825890", "metadata": {}, "source": [ - "Referential integrity checking:\n", + "## Referential integrity checking:\n", "- \"naive\" errors collected in `not_found` list\n", "- (hierarchy-aware) type errors (doc found, but of invalid type) collected in `invalid_type` list" ] @@ -391,64 +390,138 @@ "pbar.close() " ] }, + { + "cell_type": "markdown", + "id": "9d2ce4a3-fb33-4b47-9c7f-a7919405ab65", + "metadata": {}, + "source": [ + "## Results" + ] + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "id": "e01450d1-3369-4fc5-80be-9787e00a6597", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "(4857, 23503)" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ 
"len(errors[\"not_found\"]), len(errors[\"invalid_type\"])" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "id": "a25857f4-e26e-4896-9e5f-607e7b4bb07c", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "['mags_activity_set doc nmdc:fdefb3fa15098906cf788f5cadf17bb3: field part_of referenced doc nmdc:mga0vx38 not found',\n", + " 'mags_activity_set doc nmdc:78f8bf24916f01d053378b1bd464cd8a: field has_input referenced doc nmdc:9003278a200d1e7921e978d4c59233c3 not found',\n", + " 'mags_activity_set doc nmdc:a57ecfc4dee4e6938a5517ad0961dcd8: field part_of referenced doc nmdc:mga08x19 not found',\n", + " 'mags_activity_set doc nmdc:3e0d8aae3b16d5bba2b3faec04391929: field part_of referenced doc nmdc:mga06z11 not found',\n", + " 'mags_activity_set doc nmdc:4417090e8ce0e96ff2867b85823d4b26: field part_of referenced doc nmdc:mga07m45 not found']" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "errors[\"not_found\"][:5]" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "id": "855e232d-0e94-428e-96eb-0535c5135bee", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "mdb.alldocs.find_one({\"id\": \"nmdc:mga0vx38\"}) is None" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 18, "id": "33516e3c-f10d-4c30-942b-0d01d06082f9", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "['data_object_set doc emsl:output_570856: field was_generated_by referenced doc emsl:570856 not of type Activity',\n", + " 'data_object_set doc emsl:output_570991: field was_generated_by referenced doc emsl:570991 not of type Activity',\n", + " 'data_object_set doc emsl:output_570998: field was_generated_by referenced doc emsl:570998 not of type 
Activity',\n", + " 'data_object_set doc emsl:output_570855: field was_generated_by referenced doc emsl:570855 not of type Activity',\n", + " 'data_object_set doc emsl:output_570823: field was_generated_by referenced doc emsl:570823 not of type Activity']" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "errors[\"invalid_type\"][:5]" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "id": "29ec7e82-d079-4525-bd7b-d770fd69d788", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "{'_id': ObjectId('663fbef9ba64633177320f59'),\n", + " 'id': 'emsl:570856',\n", + " 'name': 'Rachael_21T_04-15A_M_14Mar17_leopard_Infuse',\n", + " 'instrument_name': '21T Agilent',\n", + " 'has_input': ['emsl:2f71038a-5dd1-11ec-bf63-0242ac130002'],\n", + " 'has_output': ['emsl:output_570856'],\n", + " 'omics_type': {'has_raw_value': 'Organic Matter Characterization'},\n", + " 'part_of': ['gold:Gs0110138'],\n", + " 'description': 'High resolution MS spectra only',\n", + " 'processing_institution': 'EMSL',\n", + " 'gold_sequencing_project_identifiers': [],\n", + " 'type': ['OmicsProcessing', 'PlannedProcess', 'NamedThing']}" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# OmicsProcessing is not subclass of Activity (!)\n", "mdb.alldocs.find_one({\"id\": \"emsl:570856\"})" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "690ea8f8-05be-4d0a-aaa4-5c04aa4c640c", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { From f81dde6742b7ccfd2226b33c561aec3268101499 Mon Sep 17 00:00:00 2001 From: Jing Date: Tue, 14 May 2024 14:48:56 -0400 Subject: [PATCH 05/14] more comments --- .../notebooks/ghissue_401_sparql.ipynb | 340 ++++++++++++++---- ...ion_referential_integrity-1715162638.ipynb | 3 +- 2 files changed, 270 insertions(+), 73 deletions(-) diff --git 
a/metadata-translation/notebooks/ghissue_401_sparql.ipynb b/metadata-translation/notebooks/ghissue_401_sparql.ipynb index bb7d6b60..2a5cac60 100644 --- a/metadata-translation/notebooks/ghissue_401_sparql.ipynb +++ b/metadata-translation/notebooks/ghissue_401_sparql.ipynb @@ -10,10 +10,16 @@ }, { "cell_type": "markdown", - "id": "ae2673a5-560b-47b0-9608-656aa3854466", + "id": "0675b9ba-c8be-478a-8c72-6edf10f56d8b", "metadata": {}, "source": [ - "Ensure that changes to the code will be import-able in this notebook without needing restart the kernel and thus lose state." + "## Setup\n", + "\n", + "Before running this notebook, make sure you have done the following:\n", + "- `make up-dev` has been run and mongo is mapped to `localhost:27018`\n", + "- a recent dump of the production mongo database has been loaded to `localhost:27018` (see `make mongorestore-nmdc-dev` for an example)\n", + "- .env has updated `MONGO_HOST` to `mongodb://localhost:27018`\n", + "- `export $(grep -v '^#' .env | xargs)` has been run in the shell before running `jupyter notebook`" ] }, { @@ -23,13 +29,14 @@ "metadata": {}, "outputs": [], "source": [ + "# Ensure code changes in this notebook will be import-able without needing to restart the kernel and lose state\n", "%load_ext autoreload\n", "%autoreload 2" ] }, { "cell_type": "markdown", - "id": "0b8b1fb7-2357-46ef-8d86-69cd1dce228d", + "id": "3a456470-920d-4fd4-8040-e0bd3dcabff0", "metadata": {}, "source": [ "Connect to local dockerized dev environment." 
@@ -63,17 +70,7 @@ }, { "cell_type": "code", - "execution_count": 4, - "id": "bc72113f-5044-4646-a273-0692d2e650ea", - "metadata": {}, - "outputs": [], - "source": [ - "import os" - ] - }, - { - "cell_type": "code", - "execution_count": 5, + "execution_count": 7, "id": "edb1bb42-005c-49ca-ba59-18c24833f93f", "metadata": {}, "outputs": [ @@ -81,30 +78,16 @@ "name": "stdout", "output_type": "stream", "text": [ - "mongodb://localhost:27018\n", "success\n" ] } ], "source": [ "from nmdc_runtime.api.db.mongo import get_mongo_db\n", - "print(os.getenv(\"MONGO_HOST\"))\n", - "# start 12:23\n", "mdb = get_mongo_db()\n", "print(\"success\")" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "114d9ffa-a22a-48de-9001-d04cbab175eb", - "metadata": {}, - "outputs": [], - "source": [ - "from unittest.mock import patch\n", - "\n" - ] - }, { "cell_type": "markdown", "id": "37dbc9a8-8cac-4798-8d4f-ccbd9c3560e9", @@ -115,7 +98,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "id": "3a0dd489-74cc-47c4-b3e0-c97dd88f5b5f", "metadata": {}, "outputs": [], @@ -133,12 +116,12 @@ "id": "f9a45de7-ba27-4b18-8ff4-9ba44eeb1091", "metadata": {}, "source": [ - "Get a JSON-LD context for the NMDC Schema, to serialize documents to RDF." + "## Get a JSON-LD context for the NMDC Schema, to serialize documents to RDF" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "id": "9ed72826-b552-4429-8ab5-9f7126821822", "metadata": { "scrolled": true @@ -169,7 +152,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "id": "62a68c07-0706-4300-a48d-0ab628af87b1", "metadata": {}, "outputs": [], @@ -182,12 +165,12 @@ "id": "63fe4d54-0a41-4170-9310-45e5f47a6cb5", "metadata": {}, "source": [ - "Initialize an in-memory graph to store triples, prior to serializing to disk." 
+ "## Initialize an in-memory graph to store triples, prior to serializing to disk" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "id": "648b4f70-34d6-4c70-8d0a-ef76e7e5d96d", "metadata": {}, "outputs": [], @@ -207,7 +190,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "id": "4d802017-2a7e-4614-b662-6a0cc027b8bc", "metadata": {}, "outputs": [], @@ -228,12 +211,12 @@ "id": "dfd91d37-b1c7-46ab-b30d-de80132ec091", "metadata": {}, "source": [ - "Use `rdflib` JSON-LD parsing to ingest mongo docs to in-memory graph." + "Define a helper function to ensure each doc has exactly one type." ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "id": "86ff7261-e255-415d-a589-67637292dbdd", "metadata": {}, "outputs": [], @@ -245,33 +228,83 @@ " return doc\n", "\n", " class_names = collection_name_to_class_names[collection_name]\n", + " \n", " if len(class_names) > 1:\n", " raise Exception(\"cannot unambiguously infer class of document\")\n", + " \n", " return assoc(doc, \"type\", class_names[0])" ] }, + { + "cell_type": "markdown", + "id": "7eedd442-0f26-4829-a878-cf066b3a3912", + "metadata": {}, + "source": [ + "## Ingest mongo docs to in-memory graph \n", + "Uses `rdflib` JSON-LD parsing" + ] + }, { "cell_type": "code", "execution_count": null, "id": "4251e0b1-35dc-4f40-91e7-b9bc0d9d79e1", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "d99c33f951874aea9a4f325086bde0d0", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/124 [00:00 Activity : was_generated_by\n", + "Activity --> Activity : was_informed_by\n", + "FunctionalAnnotationAggMember --> WorkflowExecutionActivity : metagenome_annotation_id\n", + "NamedThing --> NamedThing : has_output\n", + "NamedThing --> NamedThing : part_of\n", + "Biosample --> FieldResearchSite : collected_from\n", + "NamedThing --> NamedThing : 
has_input\n", + "\n", + "MaterialEntity <|-- FieldResearchSite\n", + "Activity <|-- MetaproteomicsAnalysisActivity\n", + "NamedThing <|-- Site\n", + "NamedThing <|-- DataObject\n", + "NamedThing <|-- FieldResearchSite\n", + "MaterialEntity <|-- Site\n", + "Activity <|-- MetatranscriptomeActivity\n", + "NamedThing <|-- LibraryPreparation\n", + "WorkflowExecutionActivity <|-- MagsAnalysisActivity\n", + "NamedThing <|-- PlannedProcess\n", + "WorkflowExecutionActivity <|-- ReadBasedTaxonomyAnalysisActivity\n", + "Activity <|-- MetagenomeAssembly\n", + "WorkflowExecutionActivity <|-- NomAnalysisActivity\n", + "PlannedProcess <|-- Extraction\n", + "PlannedProcess <|-- LibraryPreparation\n", + "PlannedProcess <|-- Pooling\n", + "MaterialEntity <|-- ProcessedSample\n", + "BiosampleProcessing <|-- LibraryPreparation\n", + "NamedThing <|-- Biosample\n", + "NamedThing <|-- Pooling\n", + "NamedThing <|-- Extraction\n", + "Activity <|-- MagsAnalysisActivity\n", + "NamedThing <|-- MaterialEntity\n", + "MaterialEntity <|-- Biosample\n", + "WorkflowExecutionActivity <|-- ReadQcAnalysisActivity\n", + "NamedThing <|-- ProcessedSample\n", + "WorkflowExecutionActivity <|-- MetagenomeAnnotationActivity\n", + "NamedThing <|-- CollectingBiosamplesFromSite\n", + "NamedThing <|-- BiosampleProcessing\n", + "Activity <|-- NomAnalysisActivity\n", + "WorkflowExecutionActivity <|-- MetagenomeSequencingActivity\n", + "WorkflowExecutionActivity <|-- MetagenomeAssembly\n", + "WorkflowExecutionActivity <|-- MetatranscriptomeActivity\n", + "Activity <|-- ReadBasedTaxonomyAnalysisActivity\n", + "Activity <|-- MetagenomeAnnotationActivity\n", + "Activity <|-- WorkflowExecutionActivity\n", + "Site <|-- FieldResearchSite\n", + "BiosampleProcessing <|-- Pooling\n", + "PlannedProcess <|-- CollectingBiosamplesFromSite\n", + "Activity <|-- MetagenomeSequencingActivity\n", + "PlannedProcess <|-- BiosampleProcessing\n", + "WorkflowExecutionActivity <|-- MetabolomicsAnalysisActivity\n", + 
"WorkflowExecutionActivity <|-- MetaproteomicsAnalysisActivity\n", + "NamedThing <|-- OmicsProcessing\n", + "Activity <|-- MetabolomicsAnalysisActivity\n", + "NamedThing <|-- Study\n", + "Activity <|-- ReadQcAnalysisActivity\n", + "PlannedProcess <|-- OmicsProcessing\n" + ] + } + ], "source": [ - "# print(\"classDiagram\\n\")\n", - "# for slot_name in toplevel_entity_connectors:\n", - "# slot = slots[slot_name]\n", - "# domain = slot.domain or \"NamedThing\"\n", - "# range = slot.range\n", - "# print(f\"{domain} --> {range} : {slot_name}\")\n", + "print(\"classDiagram\\n\")\n", + "for slot_name in toplevel_entity_connectors:\n", + " slot = slots[slot_name]\n", + " domain = slot.domain or \"NamedThing\"\n", + " range = slot.range\n", + " print(f\"{domain} --> {range} : {slot_name}\")\n", "\n", - "# print()\n", + "print()\n", "\n", - "# inheritance_links = set()\n", - "# for cls in toplevel_classes:\n", - "# ancestors = schema_view.class_ancestors(cls)\n", - "# for a in ancestors:\n", - "# if a != cls:\n", - "# inheritance_links.add(f\"{a} <|-- {cls}\")\n", + "inheritance_links = set()\n", + "for cls in toplevel_classes:\n", + " ancestors = schema_view.class_ancestors(cls)\n", + " for a in ancestors:\n", + " if a != cls:\n", + " inheritance_links.add(f\"{a} <|-- {cls}\")\n", "\n", - "# for link in inheritance_links:\n", - "# print(link)" + "for link in inheritance_links:\n", + " print(link)" ] }, { @@ -404,15 +526,38 @@ "id": "63cb2cc8-ef99-4d5f-9ddf-9eb2949e9c06", "metadata": {}, "source": [ - "Now, let's assert a common `depends_on` relation for all entities connected by these slots so that we can traverse the graph of top-level entities without needing to specify any specific slot names." + "### Assert a common `depends_on` relation for all entities connected by `toplevel_entity_connectors`\n", + "This allows us to traverse the graph of top-level entities without needing to specify any specific slot names." 
] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 24, "id": "cc830d77-5ac2-482e-a4f9-dc2eed3f2ef9", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "d7bb9d2404eb41159d8d03d895fa66ed", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/15851994 [00:00 Date: Tue, 14 May 2024 15:08:36 -0400 Subject: [PATCH 06/14] add comments --- .../notebooks/ghissue_401_sparql.ipynb | 40 ++++++++++++++----- 1 file changed, 30 insertions(+), 10 deletions(-) diff --git a/metadata-translation/notebooks/ghissue_401_sparql.ipynb b/metadata-translation/notebooks/ghissue_401_sparql.ipynb index 2a5cac60..26f8194b 100644 --- a/metadata-translation/notebooks/ghissue_401_sparql.ipynb +++ b/metadata-translation/notebooks/ghissue_401_sparql.ipynb @@ -665,20 +665,32 @@ "id": "91171cf6-f435-4815-970f-a67f51254997", "metadata": {}, "source": [ - "## Serialize and store as gzipped N-Triples file." + "## Serialize and store as gzipped N-Triples file.\n", + "This can take a few minutes..." ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 35, "id": "125d2ad4-8433-45d8-86c4-d6a619ea5280", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "serializing Graph and writing to file...\n", + "success!\n" + ] + } + ], "source": [ "import gzip\n", "\n", "with gzip.open('data/nmdc-db.nt.gz', 'wb') as f:\n", - " f.write(g.serialize(format='nt').encode())" + " print(\"Serializing graph and writing to file...\") \n", + " f.write(g.serialize(format='nt').encode())\n", + " print(\"Success!\")" ] }, { @@ -686,17 +698,25 @@ "id": "c5d8c0a2-6f75-4dac-9bb9-ac48838ad2b8", "metadata": {}, "source": [ - "## Load data into a fuseki server\n", - "\n", - "Wipe any existing persisted data." + "## Load data into a dockerized fuseki server\n", + "Spin up a `fuseki` container and wipe any existing persisted data." 
] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 38, "id": "ea0bdeee-6b3a-4074-bd73-cc9424569346", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "no such service: fuseki\n", + "Error response from daemon: No such container: fuseki\n" + ] + } + ], "source": [ "!docker compose up fuseki -d\n", "!docker exec fuseki rm -rf /fuseki-base/nmdc-db.tdb" @@ -707,7 +727,7 @@ "id": "79284de7-ef52-47c6-aeb1-1453bd4b5f59", "metadata": {}, "source": [ - "Ensure data is present to load." + "Copy data into the `fuseki` container." ] }, { From 267777e4eff16c8e614cb4436270fdfe840bc710 Mon Sep 17 00:00:00 2001 From: Jing Date: Tue, 14 May 2024 15:49:35 -0400 Subject: [PATCH 07/14] add instructions for fuseki container --- .../notebooks/ghissue_401_sparql.ipynb | 169 ++++++++++++++++-- 1 file changed, 159 insertions(+), 10 deletions(-) diff --git a/metadata-translation/notebooks/ghissue_401_sparql.ipynb b/metadata-translation/notebooks/ghissue_401_sparql.ipynb index 26f8194b..f2d87d3d 100644 --- a/metadata-translation/notebooks/ghissue_401_sparql.ipynb +++ b/metadata-translation/notebooks/ghissue_401_sparql.ipynb @@ -693,18 +693,157 @@ " print(\"Success!\")" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "9777151b-ddcb-472f-a71b-48af0224de53", + "metadata": {}, + "outputs": [], + "source": [ + "## Load data into a dockerized fuseki server\n", + "\n", + "1. 
Add the following to `/nmdc-runtime/docker-compose.yaml`.\n", + "\n", + "```yml\n", + " fuseki:\n", + " container_name: fuseki\n", + " build:\n", + " dockerfile: nmdc_runtime/fuseki.Dockerfile\n", + " context: .\n", + " ports:\n", + " - \"3030:3030\"\n", + " volumes:\n", + " - ./nmdc_runtime/site/fuseki/fuseki-config.ttl:/configuration/fuseki-config.ttl\n", + " - ./nmdc_runtime/site/fuseki/shiro.ini:/fuseki/run/shiro.ini\n", + " - nmdc_runtime_fuseki_data:/fuseki-base\n", + "```" + ] + }, { "cell_type": "markdown", "id": "c5d8c0a2-6f75-4dac-9bb9-ac48838ad2b8", "metadata": {}, "source": [ - "## Load data into a dockerized fuseki server\n", - "Spin up a `fuseki` container and wipe any existing persisted data." + "\n", + "\n", + "2. Add the following to `/nmdc-runtime/nmdc-runtime/fuseki.Dockerfile`\n", + "\n", + "```Dockerfile\n", + "# Use an appropriate base image that includes Java and wget\n", + "FROM openjdk:11-jre-slim\n", + "\n", + "# Set environment variables\n", + "ENV FUSEKI_VERSION 4.9.0\n", + "ENV FUSEKI_HOME /fuseki\n", + "\n", + "# Install wget\n", + "RUN apt-get update && apt-get install -y wget && rm -rf /var/lib/apt/lists/*\n", + "\n", + "# Download and extract Fuseki\n", + "RUN wget -qO- https://archive.apache.org/dist/jena/binaries/apache-jena-fuseki-$FUSEKI_VERSION.tar.gz | tar xvz -C / && \\\n", + " mv /apache-jena-fuseki-$FUSEKI_VERSION $FUSEKI_HOME\n", + "\n", + "# Expose the default port\n", + "EXPOSE 3030\n", + "\n", + "# Download and extract Jena Commands\n", + "RUN wget -qO- https://archive.apache.org/dist/jena/binaries/apache-jena-$FUSEKI_VERSION.tar.gz | tar xvz -C / && \\\n", + " mv /apache-jena-$FUSEKI_VERSION $FUSEKI_HOME\n", + "\n", + "# Copy the Fuseki configuration file to the container\n", + "COPY ./nmdc_runtime/site/fuseki/fuseki-config.ttl $FUSEKI_HOME/configuration/\n", + "COPY ./nmdc_runtime/site/fuseki/shiro.ini $FUSEKI_HOME/run/\n", + "\n", + "# Set working directory\n", + "WORKDIR $FUSEKI_HOME\n", + "\n", + "# Command to 
start Fuseki server with preloaded data\n", + "CMD [\"./fuseki-server\", \"--config\", \"configuration/fuseki-config.ttl\"]\n", + "```\n", + "\n", + "3. Add the following to `/nmdc-runtime/nmdc-runtime/site/fuseki/shiro.ini`\n", + "```ini\n", + "[main]\n", + "localhost=org.apache.jena.fuseki.authz.LocalhostFilter\n", + "\n", + "[urls]\n", + "## Control functions open to anyone\n", + "/$/server = anon\n", + "/$/ping = anon\n", + "/$/stats = anon\n", + "/$/stats/* = anon\n", + "## and the rest are restricted to localhost\n", + "/$/** = anon\n", + "/**=anon\n", + "```\n", + "\n", + "5. Add the following to `/nmdc-runtime/nmdc-runtime/site/fuseki/fuseki-config.ttl`\n", + "```ttl\n", + "@prefix afn: .\n", + "@prefix fuseki: .\n", + "@prefix ja: .\n", + "@prefix nmdc: .\n", + "@prefix owl: .\n", + "@prefix rdf: .\n", + "@prefix rdfs: .\n", + "@prefix tdb: .\n", + "@prefix xs: .\n", + "\n", + "\n", + "\ta tdb:GraphTDB ;\n", + "\ttdb:dataset ;\n", + "\t.\n", + "\n", + "\n", + "\ta ja:RDFDataset ;\n", + "\tja:defaultGraph ;\n", + "\t.\n", + "\n", + "\n", + "\ta ja:InfModel ;\n", + "\tja:baseModel ;\n", + "\tja:reasoner [\n", + "\t\tja:reasonerURL ;\n", + "\t] ;\n", + "\t.\n", + "\n", + "\n", + "\ta fuseki:Service ;\n", + "\tfuseki:dataset ;\n", + "\tfuseki:name \"nmdc\" ;\n", + "\tfuseki:serviceQuery\n", + "\t\t\"query\" ,\n", + "\t\t\"sparql\"\n", + "\t\t;\n", + "\tfuseki:serviceReadWriteGraphStore \"data\" ;\n", + "\tfuseki:serviceUpdate \"update\" ;\n", + "\tfuseki:serviceUpload \"upload\" ;\n", + "\t.\n", + "\n", + "\n", + "\ta tdb:DatasetTDB ;\n", + "\tja:context [\n", + "\t\trdfs:comment \"Query timeout on this dataset: 10s.\" ;\n", + "\t\tja:cxtName \"arq:queryTimeout\" ;\n", + "\t\tja:cxtValue \"10000\" ;\n", + "\t] ;\n", + "\ttdb:location \"/fuseki-base/nmdc-db.tdb\" ;\n", + "\t.\n", + "\n", + "[]\n", + "\ta fuseki:Server ;\n", + "\tfuseki:services (\n", + "\t\t\n", + "\t) ;\n", + "\t.\n", + "```\n", + "\n", + ". Spin up a `fuseki` container. 
" ] }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 43, "id": "ea0bdeee-6b3a-4074-bd73-cc9424569346", "metadata": {}, "outputs": [ @@ -712,14 +851,14 @@ "name": "stdout", "output_type": "stream", "text": [ - "no such service: fuseki\n", - "Error response from daemon: No such container: fuseki\n" + "\u001b[1A\u001b[1B\u001b[0G\u001b[?25l[+] Running 1/0\n", + " \u001b[32m✔\u001b[0m Container fuseki \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n", + "\u001b[?25h" ] } ], "source": [ - "!docker compose up fuseki -d\n", - "!docker exec fuseki rm -rf /fuseki-base/nmdc-db.tdb" + "!docker compose up fuseki -d" ] }, { @@ -727,16 +866,26 @@ "id": "79284de7-ef52-47c6-aeb1-1453bd4b5f59", "metadata": {}, "source": [ - "Copy data into the `fuseki` container." + "Wipe any existing persisted data, and copy new RDF data into the `fuseki` container.\n" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 42, "id": "9037026c-2653-43e3-bb92-2a0eea85b213", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Error response from daemon: No such container: fuseki\n", + "no such directory\n" + ] + } + ], "source": [ + "!docker exec fuseki rm -rf /fuseki-base/nmdc-db.tdb\n", "!docker cp data/nmdc-db.nt.gz fuseki:/fuseki-base/" ] }, From ab15cc6be2d68232912ace895e075285ad09f9c6 Mon Sep 17 00:00:00 2001 From: Jing Date: Tue, 14 May 2024 15:52:38 -0400 Subject: [PATCH 08/14] add line to docker-compose.yml instructions --- .../notebooks/ghissue_401_sparql.ipynb | 55 ++++++++++++------- 1 file changed, 36 insertions(+), 19 deletions(-) diff --git a/metadata-translation/notebooks/ghissue_401_sparql.ipynb b/metadata-translation/notebooks/ghissue_401_sparql.ipynb index f2d87d3d..db64cd84 100644 --- a/metadata-translation/notebooks/ghissue_401_sparql.ipynb +++ b/metadata-translation/notebooks/ghissue_401_sparql.ipynb @@ -694,11 +694,9 @@ ] }, { - "cell_type": "code", - 
"execution_count": null, - "id": "9777151b-ddcb-472f-a71b-48af0224de53", + "cell_type": "markdown", + "id": "48e6d45e-0262-4b3c-982c-478377184c2b", "metadata": {}, - "outputs": [], "source": [ "## Load data into a dockerized fuseki server\n", "\n", @@ -711,21 +709,23 @@ " dockerfile: nmdc_runtime/fuseki.Dockerfile\n", " context: .\n", " ports:\n", - " - \"3030:3030\"\n", + " - \"3030:3030\" # modify port if you already have a service running on localhost:3030\n", " volumes:\n", " - ./nmdc_runtime/site/fuseki/fuseki-config.ttl:/configuration/fuseki-config.ttl\n", " - ./nmdc_runtime/site/fuseki/shiro.ini:/fuseki/run/shiro.ini\n", " - nmdc_runtime_fuseki_data:/fuseki-base\n", + "\n", + "volumes:\n", + " nmdc_runtime_fuseki_data:\n", + " driver: local\n", "```" ] }, { "cell_type": "markdown", - "id": "c5d8c0a2-6f75-4dac-9bb9-ac48838ad2b8", + "id": "3bbf838c-70ec-48e6-ad31-55c61f196195", "metadata": {}, "source": [ - "\n", - "\n", "2. Add the following to `/nmdc-runtime/nmdc-runtime/fuseki.Dockerfile`\n", "\n", "```Dockerfile\n", @@ -759,8 +759,14 @@ "\n", "# Command to start Fuseki server with preloaded data\n", "CMD [\"./fuseki-server\", \"--config\", \"configuration/fuseki-config.ttl\"]\n", - "```\n", - "\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "b96560c3-a531-4f8f-be35-6e1a911a90ac", + "metadata": {}, + "source": [ "3. Add the following to `/nmdc-runtime/nmdc-runtime/site/fuseki/shiro.ini`\n", "```ini\n", "[main]\n", @@ -775,9 +781,15 @@ "## and the rest are restricted to localhost\n", "/$/** = anon\n", "/**=anon\n", - "```\n", - "\n", - "5. Add the following to `/nmdc-runtime/nmdc-runtime/site/fuseki/fuseki-config.ttl`\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "c5d8c0a2-6f75-4dac-9bb9-ac48838ad2b8", + "metadata": {}, + "source": [ + "4. 
Add the following to `/nmdc-runtime/nmdc-runtime/site/fuseki/fuseki-config.ttl`\n", "```ttl\n", "@prefix afn: .\n", "@prefix fuseki: .\n", @@ -836,9 +848,15 @@ "\t\t\n", "\t) ;\n", "\t.\n", - "```\n", - "\n", - ". Spin up a `fuseki` container. " + "```" + ] + }, + { + "cell_type": "markdown", + "id": "e1062b76-b7dc-4693-b5ad-91aa9aed490b", + "metadata": {}, + "source": [ + "5. Spin up a `fuseki` container. " ] }, { @@ -871,7 +889,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 44, "id": "9037026c-2653-43e3-bb92-2a0eea85b213", "metadata": {}, "outputs": [ @@ -879,8 +897,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Error response from daemon: No such container: fuseki\n", - "no such directory\n" + "\u001b[sPreparing to copy...\u001b[?25l\u001b[u\u001b[2KCopying to container - 0B\u001b[24G\u001b[0K14.5MB\u001b[24G\u001b[0K31.1MB\u001b[24G\u001b[0K46.9MB\u001b[24G\u001b[0K64.2MB\u001b[24G\u001b[0K81.1MB\u001b[24G\u001b[0K96.7MB\u001b[24G\u001b[0K109MB\u001b[24G\u001b[0K123MB\u001b[24G\u001b[0K139MB\u001b[24G\u001b[0K147MB\u001b[24G\u001b[0K156MB\u001b[24G\u001b[0K173MB\u001b[24G\u001b[0K190MB\u001b[24G\u001b[0K206MB\u001b[24G\u001b[0K217MB\u001b[24G\u001b[0K232MB\u001b[24G\u001b[0K247MB\u001b[24G\u001b[0K265MB\u001b[24G\u001b[0K280MB\u001b[24G\u001b[0K298MB\u001b[24G\u001b[0K312MB\u001b[24G\u001b[0K317MB\u001b[24G\u001b[0K337MB\u001b[24G\u001b[0K354MB\u001b[24G\u001b[0K373MB\u001b[24G\u001b[0K393MB\u001b[24G\u001b[0K407MB\u001b[24G\u001b[0K426MB\u001b[24G\u001b[0K442MB\u001b[24G\u001b[0K457MB\u001b[24G\u001b[0K475MB\u001b[24G\u001b[0K492MB\u001b[?25h\u001b[u\u001b[2KSuccessfully copied 502MB to fuseki:/fuseki-base/\n" ] } ], From d04a46e887fa72147ddc8b1b898b56e97b91230f Mon Sep 17 00:00:00 2001 From: Jing Date: Thu, 16 May 2024 16:03:10 -0400 Subject: [PATCH 09/14] add comments --- ...ion_referential_integrity-1715162638.ipynb | 43 +++++++++++-------- 1 file changed, 26 insertions(+), 17 deletions(-) diff --git 
a/metadata-translation/notebooks/repl_validation_referential_integrity-1715162638.ipynb b/metadata-translation/notebooks/repl_validation_referential_integrity-1715162638.ipynb index b1376c81..d03f1366 100644 --- a/metadata-translation/notebooks/repl_validation_referential_integrity-1715162638.ipynb +++ b/metadata-translation/notebooks/repl_validation_referential_integrity-1715162638.ipynb @@ -3,7 +3,9 @@ { "cell_type": "markdown", "id": "2a66b2dc", - "metadata": {}, + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, "source": [ "# imports" ] @@ -59,7 +61,7 @@ "id": "bcb5802b-8205-49b7-8784-dc137baff1a0", "metadata": {}, "source": [ - "# \"pre-cleaning\"" + "# Pre-cleaning" ] }, { @@ -91,33 +93,32 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 1, "id": "b71ba7d2-ebd2-487d-a5cc-2a85ee14cb95", "metadata": {}, "outputs": [ { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "2b29e7fd07ac46a1965108fe9b1f4531", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/18 [00:00 4\u001b[0m pbar \u001b[38;5;241m=\u001b[39m \u001b[43mtqdm\u001b[49m(total\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mlen\u001b[39m(collection_names))\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m p \u001b[38;5;129;01min\u001b[39;00m props:\n\u001b[1;32m 6\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m coll_name \u001b[38;5;129;01min\u001b[39;00m collection_names:\n", + "\u001b[0;31mNameError\u001b[0m: name 'tqdm' is not defined" + ] } ], "source": [ "# check these slots for null values for all docs in collection_names\n", - "props = [\"used\", \"git_url\", \"was_associated_with\", \"was_generated_by\", \"compression_type\",]\n", + "props = [\"used\", \"git_url\", \"was_associated_with\", \"was_generated_by\", \"compression_type\", \n", + " \"metagenome_annotation_id\", \"metaproteomic_analysis_id\"] \n", "\n", "pbar = tqdm(total=len(collection_names))\n", "for p in props:\n", " for coll_name in 
collection_names:\n", " pbar.set_description(f\"checking {coll_name}...\")\n", + " # The {$type: 10} query matches for BSON Type Null, not just value `null`\n", " docs_broken = list(mdb[coll_name].find({p: {\"$type\": 10}}, [\"id\"]))\n", " if docs_broken:\n", " print(f\"removing {len(docs_broken)} null-valued {p} values for {coll_name}...\")\n", @@ -244,14 +245,22 @@ " # calculate class_hierarchy_as_list once per collection \n", " exemplar = getattr(nmdcdb, coll_name)[0]\n", " newdoc_type = class_hierarchy_as_list(exemplar)\n", + " \n", " # for each doc in collection\n", " # replace string value for 'type' with a class_hierarchy_as_list\n", " # and insert modified doc into materialized alldocs collection\n", + " \n", + " # NOTE: `type` is currently a string, does not exist for all classes, and can have typos. \n", + " # Both of these are fixed in berkeley schema but is risky to use at this time\n", + " \n", " mdb.alldocs.insert_many([assoc(dissoc(doc, 'type', '_id'), 'type', newdoc_type) for doc in mdb[coll_name].find()])\n", " pbar.update(mdb[coll_name].estimated_document_count())\n", "\n", "pbar.close()\n", - "mdb.alldocs.create_index(\"id\") # WTF... 
nmdc:0078a0f981ad3f92693c2bc3b6470791 prevents mdb.alldocs.create_index(\"id\", unique=True)\n", + "\n", + "# Prior to re-ID-ing, some IDs are not unique across Mongo collections (eg nmdc:0078a0f981ad3f92693c2bc3b6470791)\n", + "# Re-idx for `alldocs` collection\n", + "mdb.alldocs.create_index(\"id\")\n", "print(\"refreshed `alldocs` collection\")" ] }, From 4c298ab666f9fb800a1e50bc7f638f928f1ad334 Mon Sep 17 00:00:00 2001 From: Jing Cao Date: Thu, 16 May 2024 19:44:12 -0400 Subject: [PATCH 10/14] Update comments --- .../repl_validation_referential_integrity-1715162638.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metadata-translation/notebooks/repl_validation_referential_integrity-1715162638.ipynb b/metadata-translation/notebooks/repl_validation_referential_integrity-1715162638.ipynb index d03f1366..9b53d093 100644 --- a/metadata-translation/notebooks/repl_validation_referential_integrity-1715162638.ipynb +++ b/metadata-translation/notebooks/repl_validation_referential_integrity-1715162638.ipynb @@ -18,7 +18,7 @@ "Before running this notebook, make sure you have done the following:\n", "- `make up-dev` has been run and mongo is mapped to `localhost:27018`\n", "- a recent dump of the production mongo database has been loaded to `localhost:27018` (see `make mongorestore-nmdc-dev` for an example)\n", - "- .env has updated `MONGO_HOST` to `mongodb://localhost:27018`\n", + "- `.env` has updated `MONGO_HOST` to `mongodb://localhost:27018`\n", "- `export $(grep -v '^#' .env | xargs)` has been run in the shell before running `jupyter notebook`\n" ] }, From e94c0ed0f4a32d2228c210f61f62076ac064bcb1 Mon Sep 17 00:00:00 2001 From: eecavanna Date: Mon, 27 May 2024 14:03:24 -0700 Subject: [PATCH 11/14] Clarify prose and add comments, type hints, and `TODO`s --- ...ion_referential_integrity-1715162638.ipynb | 130 ++++++++++++++---- 1 file changed, 100 insertions(+), 30 deletions(-) diff --git 
a/metadata-translation/notebooks/repl_validation_referential_integrity-1715162638.ipynb b/metadata-translation/notebooks/repl_validation_referential_integrity-1715162638.ipynb index 9b53d093..78e0c900 100644 --- a/metadata-translation/notebooks/repl_validation_referential_integrity-1715162638.ipynb +++ b/metadata-translation/notebooks/repl_validation_referential_integrity-1715162638.ipynb @@ -7,19 +7,35 @@ "jp-MarkdownHeadingCollapsed": true }, "source": [ - "# imports" + "# Referential integrity checker (prototype)" ] }, { "cell_type": "markdown", - "id": "f52d1cd4-ca97-4f43-8923-a10847e86d4b", + "id": "c892eac06fb1a86a", "metadata": {}, "source": [ + "## Prerequisites\n", + "\n", "Before running this notebook, make sure you have done the following:\n", - "- `make up-dev` has been run and mongo is mapped to `localhost:27018`\n", - "- a recent dump of the production mongo database has been loaded to `localhost:27018` (see `make mongorestore-nmdc-dev` for an example)\n", - "- `.env` has updated `MONGO_HOST` to `mongodb://localhost:27018`\n", - "- `export $(grep -v '^#' .env | xargs)` has been run in the shell before running `jupyter notebook`\n" + "\n", + "1. Run `$ make up-dev`\n", + "2. Map `localhost:27018` to the Mongo server you want to use\n", + "3. Load a recent dump of the production Mongo database into that Mongo server (see `$ make mongorestore-nmdc-dev` for an example)\n", + "4. In the `.env` file, set `MONGO_HOST` to `mongodb://localhost:27018`\n", + "5. Run `$ export $(grep -v '^#' .env | xargs)` to load the environment variables defined in `.env` into your shell environment\n", + "\n", + "Once you've done all of those things, you can run this notebook (e.g. 
via `$ jupyter notebook`) \n" + ] + }, + { + "cell_type": "markdown", + "id": "8f03ce22", + "metadata": {}, + "source": [ + "## Enable automatic reloading of modules\n", + "\n", + "Reference: https://ipython.readthedocs.io/en/stable/config/extensions/autoreload.html#autoreload" ] }, { @@ -35,6 +51,14 @@ "%autoreload 2" ] }, + { + "cell_type": "markdown", + "id": "5121e612", + "metadata": {}, + "source": [ + "## Import Python modules" + ] + }, { "cell_type": "code", "execution_count": 2, @@ -61,7 +85,7 @@ "id": "bcb5802b-8205-49b7-8784-dc137baff1a0", "metadata": {}, "source": [ - "# Pre-cleaning" + "## \"Pre-clean\" the data" ] }, { @@ -69,7 +93,11 @@ "id": "8ecb1950-eaec-469c-b7ac-949650825093", "metadata": {}, "source": [ - "Only consider populated collections with `id` field." + "Determine the name of each Mongo collection in which at least one document has a field named `id`.\n", + "\n", + "> **TODO:** Documents in the [`functional_annotation_agg` collection](https://microbiomedata.github.io/nmdc-schema/FunctionalAnnotationAggMember/) do not have a field named `id`, and so will not be included here. Document the author's rationale for omitting it.\n", + "\n", + "> **TODO:** The `nmdc_schema_collection_names` function combines the collection names in Mongo with the Database slots in the schema, and then omits some collection names. Document why the author took that approach." ] }, { @@ -88,7 +116,11 @@ "id": "cddaaa54-262d-4549-a9a9-4c280a6a6341", "metadata": {}, "source": [ - "Remove null-valued optional properties" + "### Remove fields that contain null\n", + "\n", + "Remove specific fields from specific documents in the above collections, if the field's name appears in our hard-coded list (see the cell below for the list) and — in that document — the field consists of a null value.\n", + "\n", + "> **TODO:** Document how the author obtained this list and whether the list would require maintenance over time." 
] }, { @@ -134,7 +166,7 @@ "id": "21c2f771-b8da-466a-90e8-2c17ac5e6388", "metadata": {}, "source": [ - "# materialize single-collection db view" + "## Materialize single-collection view of database" ] }, { @@ -142,7 +174,9 @@ "id": "56d6c224-ec80-4ac9-9dcf-bf04b33a61f9", "metadata": {}, "source": [ - "Check assumption that every populated collection currently has documents of one type only." + "Check assumption that every populated collection currently has documents of one type only.\n", + "\n", + "> **TODO:** The \"class_names\" part of the `collection_name_to_class_names` dictionary does not list _descendant_ classes, even though the schema will allow instances of descendant classes to reside in those collections. Document why disregarding descendant classes here is OK." ] }, { @@ -161,7 +195,7 @@ "id": "5ed95ee0-03b7-4dff-80e7-92a2b24bccf4", "metadata": {}, "source": [ - "Define a helper function that takes a document and returns its class and all parent classes as a list" + "Define a helper function that takes a class instance and returns a list of the names of its own class and its ancestor classes." ] }, { @@ -171,20 +205,29 @@ "metadata": {}, "outputs": [], "source": [ - "def class_hierarchy_as_list(obj):\n", + "def class_hierarchy_as_list(obj) -> list[str]:\n", + " r\"\"\"\n", + " Returns a list consisting of the name of the class of the instance pass in,\n", + " and the names of all of its ancestor classes.\n", + "\n", + " TODO: Consider renaming function to be a verb; e.g. 
`get_class_hierarchy_as_list`.\n", + "\n", + " TODO: Document the purpose of the `rv` list (does not seem to be used anywhere).\n", + " \"\"\"\n", + "\n", " rv = []\n", " current_class = obj.__class__\n", " \n", " def recurse_through_bases(cls):\n", " name = cls.__name__\n", - " if name == \"YAMLRoot\":\n", + " if name == \"YAMLRoot\": # base case\n", " return rv\n", " rv.append(name)\n", " for base in cls.__bases__:\n", - " recurse_through_bases(base)\n", + " recurse_through_bases(base) # recursive invocation\n", " return rv\n", " \n", - " return recurse_through_bases(current_class)" + " return recurse_through_bases(current_class) # initial invocation" ] }, { @@ -192,12 +235,14 @@ "id": "b962e3c8-a346-49c5-8470-915f3cf9eb07", "metadata": {}, "source": [ - "Materialize `alldocs` collection, associating all inherited classes with document via `type` field." + "Materialize `alldocs` collection, associating all inherited classes with document via `type` field.\n", + "\n", + "> **TODO:** Clarify the above sentence." ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "id": "b2e618f3-78b9-42b6-8ea9-63d080b1b0f6", "metadata": { "scrolled": true @@ -226,29 +271,37 @@ } ], "source": [ - "# drop any previously generated alldocs collection\n", + "# Drop any existing `alldocs` collection (e.g. 
from previous use of this notebook).\n", "mdb.alldocs.drop()\n", "\n", - "# progress bar set-up\n", + "# Set up progress bar\n", "n_docs_total = sum(mdb[name].estimated_document_count() for name in collection_names)\n", "pbar = tqdm(total=n_docs_total)\n", "\n", "# for each collection name\n", "for coll_name in collection_names:\n", " pbar.set_description(f\"processing {coll_name}...\")\n", - " # for each doc in collection dissociate mongo-generated '_id' field\n", + " # for each doc in collection, remove the mongo-generated '_id' field\n", " try:\n", " nmdcdb = NMDCDatabase(**{coll_name: [dissoc(mdb[coll_name].find_one(), '_id')]})\n", " except ValueError as e:\n", " print(f\"no {coll_name}!\")\n", " raise e\n", - " # calculate class_hierarchy_as_list once per collection \n", - " exemplar = getattr(nmdcdb, coll_name)[0]\n", - " newdoc_type = class_hierarchy_as_list(exemplar)\n", + "\n", + " # Calculate class_hierarchy_as_list once per collection.\n", + " #\n", + " # Note: This seems to assume that the class hierarchy is identical for each document\n", + " # in a given collection, which may not be the case since a collection whose\n", + " # range is a \"parent\" class can store instances of descendant classes (and the\n", + " # class hierarchy of the latter would differ from that of the former).\n", + " #\n", + " exemplar = getattr(nmdcdb, coll_name)[0] # get first instance (i.e. document) in list\n", + " newdoc_type: list[str] = class_hierarchy_as_list(exemplar)\n", " \n", - " # for each doc in collection\n", - " # replace string value for 'type' with a class_hierarchy_as_list\n", - " # and insert modified doc into materialized alldocs collection\n", + " # For each document in this collection, replace the value of the `type` field with\n", + " # a _list_ of the document's own class and ancestor classes, remove the `_id` field,\n", + " # and insert the resulting document into the `alldocs` collection. 
Note that we are not\n", + " # relying on the original value of the `type` field, since it's unreliable (see below).\n", " \n", " # NOTE: `type` is currently a string, does not exist for all classes, and can have typos. \n", " # Both of these are fixed in berkeley schema but is risky to use at this time\n", @@ -264,12 +317,20 @@ "print(\"refreshed `alldocs` collection\")" ] }, + { + "cell_type": "markdown", + "id": "f0569fde", + "metadata": {}, + "source": [ + "The resulting `alldocs` collection contains a copy of every document from every Mongo collection identified earlier. The copy is the same as the original document, except that its `type` field contains a list of the names of its own class and all of its ancestor classes (whereas, the original document's `type` field contains an unreliable string)." + ] + }, { "cell_type": "markdown", "id": "ca194c0f-7417-41d2-bea8-a5a54392fee6", "metadata": {}, "source": [ - "# Validation" + "## Validate" ] }, { @@ -277,7 +338,9 @@ "id": "ab859bb2-808c-48e2-8412-d8a3a79ca4e8", "metadata": {}, "source": [ - "Collect \"top level\" (nmdc:Database slot range) classes." 
+ "Collect \"top level\" (`nmdc:Database` slot range) classes.\n", + "\n", + "Reference: https://linkml.io/linkml/developers/schemaview.html#linkml_runtime.utils.schemaview.SchemaView.class_ancestors" ] }, { @@ -330,6 +393,12 @@ "nmdc_view = nmdc_schema_view()\n", "toplevel_classes = set()\n", "for name in nmdc_database_collection_instance_class_names():\n", + " # TODO: Document why class _ancestors_ are being included here.\n", + " # A (hypothetical) collection whose range is \"Chihuahua\" wouldn't\n", + " # be allowed to store non-\"Chihuahua\" instances of \"Dog\" or \"Animal\".\n", + " #\n", + " # Note: `a |= b` is same as `a = a | b` (union two sets and store the result).\n", + " #\n", " toplevel_classes |= set(nmdc_view.class_ancestors(name))\n", "\n", "toplevel_classes" @@ -340,7 +409,8 @@ "id": "8645690e-7a9d-4f1e-8e62-0cbdde825890", "metadata": {}, "source": [ - "## Referential integrity checking:\n", + "### Check referential integrity\n", + "\n", "- \"naive\" errors collected in `not_found` list\n", "- (hierarchy-aware) type errors (doc found, but of invalid type) collected in `invalid_type` list" ] From b7e4455a5ed6ebec86890a1e00642afff11b064e Mon Sep 17 00:00:00 2001 From: eecavanna Date: Mon, 27 May 2024 16:15:10 -0700 Subject: [PATCH 12/14] Add `TODO` about omitting irrelevant fields from `alldocs` collection --- .../repl_validation_referential_integrity-1715162638.ipynb | 3 +++ 1 file changed, 3 insertions(+) diff --git a/metadata-translation/notebooks/repl_validation_referential_integrity-1715162638.ipynb b/metadata-translation/notebooks/repl_validation_referential_integrity-1715162638.ipynb index 78e0c900..c25d8d7e 100644 --- a/metadata-translation/notebooks/repl_validation_referential_integrity-1715162638.ipynb +++ b/metadata-translation/notebooks/repl_validation_referential_integrity-1715162638.ipynb @@ -305,6 +305,9 @@ " \n", " # NOTE: `type` is currently a string, does not exist for all classes, and can have typos. 
\n", " # Both of these are fixed in berkeley schema but is risky to use at this time\n", + "\n", + " # TODO: Consider omitting fields that neither (a) are the `id` field, nor (b) have the potential\n", + " # to reference a document. Those fields aren't related to referential integrity.\n", " \n", " mdb.alldocs.insert_many([assoc(dissoc(doc, 'type', '_id'), 'type', newdoc_type) for doc in mdb[coll_name].find()])\n", " pbar.update(mdb[coll_name].estimated_document_count())\n", From f607d9e865c8ee93c899d0fd5b38227bee124ac1 Mon Sep 17 00:00:00 2001 From: eecavanna Date: Mon, 27 May 2024 22:15:43 -0700 Subject: [PATCH 13/14] Add comments and prose to final two sections of notebook --- ...ion_referential_integrity-1715162638.ipynb | 20 +++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/metadata-translation/notebooks/repl_validation_referential_integrity-1715162638.ipynb b/metadata-translation/notebooks/repl_validation_referential_integrity-1715162638.ipynb index c25d8d7e..e7349ce3 100644 --- a/metadata-translation/notebooks/repl_validation_referential_integrity-1715162638.ipynb +++ b/metadata-translation/notebooks/repl_validation_referential_integrity-1715162638.ipynb @@ -414,8 +414,12 @@ "source": [ "### Check referential integrity\n", "\n", - "- \"naive\" errors collected in `not_found` list\n", - "- (hierarchy-aware) type errors (doc found, but of invalid type) collected in `invalid_type` list" + "In this cell, we populate two lists:\n", + "\n", + "- `errors.not_found`: a list of \"naive\" errors\n", + "- `errors.invalid_type`: a list of (hierarchy-aware) type errors (document was found, but is of an invalid type)\n", + "\n", + "Reference: https://linkml.io/linkml/developers/schemaview.html#linkml_runtime.utils.schemaview.SchemaView.class_induced_slots" ] }, { @@ -442,13 +446,25 @@ } ], "source": [ + "# Initialize error lists.\n", "errors = {\"not_found\": [], \"invalid_type\": []}\n", "\n", + "# Initialize progress bar.\n", + "#\n", + 
"# TODO: Explain why the author has opted to count the documents in the original collections,\n", + "# even though the `alldocs` collection exists now.\n", + "#\n", "n_docs_total = sum(mdb[name].estimated_document_count() for name in collection_names)\n", "pbar = tqdm(total=n_docs_total)\n", "\n", + "# Iterate over each collection.\n", "for name in sorted(collection_names):\n", + " # Note: We already confirmed (in a different cell of this notebook)\n", + " # that each `class_names` list has exactly one item.\n", " cls_name = collection_name_to_class_names[name][0]\n", + " # Make a dictionary of slot names to slot definitions. The set of slots here is (to quote the\n", + " # LinkML SchemaView documentation) \"all slots that are asserted or inferred for [the] class,\n", + " # with their inferred semantics.\"\n", " slot_map = {\n", " slot.name: slot\n", " for slot in nmdc_view.class_induced_slots(cls_name)\n", From 096b88e364ed79d9c84c10e174d56cccff2b3eae Mon Sep 17 00:00:00 2001 From: eecavanna Date: Mon, 27 May 2024 22:20:36 -0700 Subject: [PATCH 14/14] Add comments and prose to final two sections of notebook (for reals) --- ...ion_referential_integrity-1715162638.ipynb | 47 +++++++++++++++++-- 1 file changed, 43 insertions(+), 4 deletions(-) diff --git a/metadata-translation/notebooks/repl_validation_referential_integrity-1715162638.ipynb b/metadata-translation/notebooks/repl_validation_referential_integrity-1715162638.ipynb index e7349ce3..6d46e46b 100644 --- a/metadata-translation/notebooks/repl_validation_referential_integrity-1715162638.ipynb +++ b/metadata-translation/notebooks/repl_validation_referential_integrity-1715162638.ipynb @@ -451,17 +451,18 @@ "\n", "# Initialize progress bar.\n", "#\n", - "# TODO: Explain why the author has opted to count the documents in the original collections,\n", - "# even though the `alldocs` collection exists now.\n", + "# TODO: Explain why the author has opted to count (and then—later—iterate over) the documents\n", + "# in 
the original collections, even though the `alldocs` collection exists already.\n", "#\n", "n_docs_total = sum(mdb[name].estimated_document_count() for name in collection_names)\n", "pbar = tqdm(total=n_docs_total)\n", "\n", - "# Iterate over each collection.\n", + "# Iterate over each collection name.\n", "for name in sorted(collection_names):\n", " # Note: We already confirmed (in a different cell of this notebook)\n", " # that each `class_names` list has exactly one item.\n", " cls_name = collection_name_to_class_names[name][0]\n", + " \n", " # Make a dictionary of slot names to slot definitions. The set of slots here is (to quote the\n", " # LinkML SchemaView documentation) \"all slots that are asserted or inferred for [the] class,\n", " # with their inferred semantics.\"\n", @@ -470,8 +471,12 @@ " for slot in nmdc_view.class_induced_slots(cls_name)\n", " }\n", " pbar.set_description(f\"processing {name}...\")\n", + " \n", + " # Iterate over each document (as a dictionary) in this collection.\n", " for doc in mdb[name].find():\n", " doc = dissoc(doc, \"_id\")\n", + "\n", + " # Iterate over each key/value pair in the dictionary (document).\n", " for field, value in doc.items():\n", " assert field in slot_map, f\"{name} doc {doc['id']}: field {field} not a valid slot\"\n", " slot_range = str(slot_map[field].range)\n", @@ -494,7 +499,9 @@ "id": "9d2ce4a3-fb33-4b47-9c7f-a7919405ab65", "metadata": {}, "source": [ - "## Results" + "## Results\n", + "\n", + "Display the number errors in each list." ] }, { @@ -518,6 +525,14 @@ "len(errors[\"not_found\"]), len(errors[\"invalid_type\"])" ] }, + { + "cell_type": "markdown", + "id": "54a560df", + "metadata": {}, + "source": [ + "Display a few errors from one of the lists, as an example." + ] + }, { "cell_type": "code", "execution_count": 16, @@ -543,6 +558,14 @@ "errors[\"not_found\"][:5]" ] }, + { + "cell_type": "markdown", + "id": "c55c7524", + "metadata": {}, + "source": [ + "Spot check one of those errors." 
+ ] + }, { "cell_type": "code", "execution_count": 17, @@ -564,6 +587,14 @@ "mdb.alldocs.find_one({\"id\": \"nmdc:mga0vx38\"}) is None" ] }, + { + "cell_type": "markdown", + "id": "2bd191cd", + "metadata": {}, + "source": [ + "Display a few errors from the other one of the lists, as an example." + ] + }, { "cell_type": "code", "execution_count": 18, @@ -589,6 +620,14 @@ "errors[\"invalid_type\"][:5]" ] }, + { + "cell_type": "markdown", + "id": "d4abec53", + "metadata": {}, + "source": [ + "Spot check one of those errors." + ] + }, { "cell_type": "code", "execution_count": 19,