diff --git a/.gitignore b/.gitignore index ed102ea7..a15a9316 100644 --- a/.gitignore +++ b/.gitignore @@ -37,6 +37,10 @@ MANIFEST pip-log.txt pip-delete-this-directory.txt +# mongo-restore +*.tar +*.agz + # Unit test / coverage reports htmlcov/ .tox/ @@ -55,6 +59,8 @@ coverage.xml *.mo *.pot + + # Django stuff: *.log local_settings.py @@ -103,6 +109,7 @@ celerybeat.pid # Environments .env +.env.localhost .venv env/ venv/ diff --git a/metadata-translation/notebooks/ghissue_401_sparql.ipynb b/metadata-translation/notebooks/ghissue_401_sparql.ipynb new file mode 100644 index 00000000..db64cd84 --- /dev/null +++ b/metadata-translation/notebooks/ghissue_401_sparql.ipynb @@ -0,0 +1,1005 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "51ea05af-7579-43ad-aa9c-3bf8b6da8fdb", + "metadata": {}, + "source": [ + "# Pipeline to transform the set of nmdc-schema-compliant mongodb collections to an RDF dataset amenable to SPARQL queries." + ] + }, + { + "cell_type": "markdown", + "id": "0675b9ba-c8be-478a-8c72-6edf10f56d8b", + "metadata": {}, + "source": [ + "## Setup\n", + "\n", + "Before running this notebook, make sure you have done the following:\n", + "- `make up-dev` has been run and mongo is mapped to `localhost:27018`\n", + "- a recent dump of the production mongo database has been loaded to `localhost:27018` (see `make mongorestore-nmdc-dev` for an example)\n", + "- .env has updated `MONGO_HOST` to `mongodb://localhost:27018`\n", + "- `export $(grep -v '^#' .env | xargs)` has been run in the shell before running `jupyter notebook`" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "a362b42f-7ae0-40cf-91d4-8f19ca1087cf", + "metadata": {}, + "outputs": [], + "source": [ + "# Ensure code changes in this notebook will be import-able without needing to restart the kernel and lose state\n", + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "markdown", + "id": "3a456470-920d-4fd4-8040-e0bd3dcabff0", + "metadata": {}, + 
"source": [ + "Connect to local dockerized dev environment." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "55932d03-802f-4efe-bceb-e1036cd35567", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MONGO_HOST=mongodb://localhost:27018\n" + ] + } + ], + "source": [ + "!env | grep MONGO_HOST" + ] + }, + { + "cell_type": "markdown", + "id": "3a146763-f03a-4d65-baa0-81ca15cba689", + "metadata": {}, + "source": [ + "Initialize a db connection." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "edb1bb42-005c-49ca-ba59-18c24833f93f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "success\n" + ] + } + ], + "source": [ + "from nmdc_runtime.api.db.mongo import get_mongo_db\n", + "mdb = get_mongo_db()\n", + "print(\"success\")" + ] + }, + { + "cell_type": "markdown", + "id": "37dbc9a8-8cac-4798-8d4f-ccbd9c3560e9", + "metadata": {}, + "source": [ + "Get all populated nmdc-schema collections with entity `id`s." 
# --- cell: schema collections that exist in Mongo and are non-empty ---
from nmdc_runtime.util import schema_collection_names_with_id_field

# Keep only names that are both returned by the schema helper and present in
# this Mongo database, and whose estimated document count is non-zero.
populated_collections = sorted(
    name
    for name in set(schema_collection_names_with_id_field()).intersection(
        mdb.list_collection_names()
    )
    if mdb[name].estimated_document_count() > 0
)

# --- cell: JSON-LD context used to serialize documents to RDF ---
import json
from pprint import pprint

from linkml.generators.jsonldcontextgen import ContextGenerator
from nmdc_schema.nmdc_data import get_nmdc_schema_definition

# Serialize the generated context to JSON text, parse it back, and keep only
# the "@context" member.
context = json.loads(
    ContextGenerator(get_nmdc_schema_definition()).serialize()
)["@context"]

# Drop the explicit "@id" from every dict-valued term definition
# (use nmdc uri, not e.g. MIXS uri). Only the nested dicts are mutated, so
# iterating the live values view is safe.
for term_definition in context.values():
    if isinstance(term_definition, dict):
        term_definition.pop("@id", None)
def split_chunk(seq, n: int):
    """
    Split sequence into chunks of length n. Do not pad last chunk.

    Args:
        seq: Any object supporting ``len()`` and slicing (list, tuple, str, ...).
        n: Chunk length; must be a positive integer.

    Yields:
        Consecutive slices ``seq[i : i + n]``; the final slice may be shorter.

    Raises:
        ValueError: If ``n`` is less than 1. Because this is a generator, the
            error surfaces when iteration starts, not when the function is called.

    >>> list(split_chunk(list(range(10)), 3))
    [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]
    """
    # A negative step would make range() empty and silently drop every item;
    # a zero step makes range() raise an opaque error. Reject both up front.
    if n < 1:
        raise ValueError(f"chunk length n must be a positive integer, got {n!r}")
    for start in range(0, len(seq), n):
        yield seq[start : start + n]
def ensure_type(doc, collection_name):
    """
    Return a document that is guaranteed to carry a ``type`` field.

    Args:
        doc: A mapping representing one document.
        collection_name: Name of the collection ``doc`` came from; used as a key
            into ``collection_name_to_class_names``.

    Returns:
        ``doc`` itself (same object) when it already has a ``type`` key;
        otherwise a new dict with all of ``doc``'s items plus ``type`` set to
        the single class name registered for ``collection_name``.

    Raises:
        KeyError: If ``collection_name`` is not in ``collection_name_to_class_names``.
        ValueError: If the collection maps to zero or to more than one class
            name, so the type cannot be inferred unambiguously.
    """
    if "type" in doc:
        return doc

    class_names = collection_name_to_class_names[collection_name]

    # Exactly one candidate is required. An empty list is just as unusable as
    # an ambiguous one and would otherwise fail below with a bare IndexError.
    if len(class_names) != 1:
        raise ValueError(
            "cannot unambiguously infer class of document: "
            f"collection {collection_name!r} maps to {len(class_names)} class names"
        )

    # Build a new dict so the caller's document is left untouched.
    return {**doc, "type": class_names[0]}
# Print a `classDiagram` text block: one association edge per connector slot,
# followed by one inheritance edge per (ancestor, class) pair.
print("classDiagram\n")

for slot_name in toplevel_entity_connectors:
    slot = slots[slot_name]
    # Slots with no explicit domain are drawn as originating from NamedThing.
    domain = slot.domain or "NamedThing"
    # Deliberately NOT named `range`: a top-level assignment in a notebook cell
    # would rebind the builtin for every later cell in the kernel.
    slot_range = slot.range
    print(f"{domain} --> {slot_range} : {slot_name}")

print()

# A set removes duplicate edges; each class is linked to every ancestor that
# `class_ancestors` reports, except itself.
inheritance_links = {
    f"{ancestor} <|-- {cls}"
    for cls in toplevel_classes
    for ancestor in schema_view.class_ancestors(cls)
    if ancestor != cls
}

# Sort so that re-running the cell produces the same output order.
for link in sorted(inheritance_links):
    print(link)
\"arq:queryTimeout\" ;\n", + "\t\tja:cxtValue \"10000\" ;\n", + "\t] ;\n", + "\ttdb:location \"/fuseki-base/nmdc-db.tdb\" ;\n", + "\t.\n", + "\n", + "[]\n", + "\ta fuseki:Server ;\n", + "\tfuseki:services (\n", + "\t\t\n", + "\t) ;\n", + "\t.\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "e1062b76-b7dc-4693-b5ad-91aa9aed490b", + "metadata": {}, + "source": [ + "5. Spin up a `fuseki` container. " + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "ea0bdeee-6b3a-4074-bd73-cc9424569346", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[1A\u001b[1B\u001b[0G\u001b[?25l[+] Running 1/0\n", + " \u001b[32m✔\u001b[0m Container fuseki \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n", + "\u001b[?25h" + ] + } + ], + "source": [ + "!docker compose up fuseki -d" + ] + }, + { + "cell_type": "markdown", + "id": "79284de7-ef52-47c6-aeb1-1453bd4b5f59", + "metadata": {}, + "source": [ + "Wipe any existing persisted data, and copy new RDF data into the `fuseki` container.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "9037026c-2653-43e3-bb92-2a0eea85b213", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[sPreparing to copy...\u001b[?25l\u001b[u\u001b[2KCopying to container - 
0B\u001b[24G\u001b[0K14.5MB\u001b[24G\u001b[0K31.1MB\u001b[24G\u001b[0K46.9MB\u001b[24G\u001b[0K64.2MB\u001b[24G\u001b[0K81.1MB\u001b[24G\u001b[0K96.7MB\u001b[24G\u001b[0K109MB\u001b[24G\u001b[0K123MB\u001b[24G\u001b[0K139MB\u001b[24G\u001b[0K147MB\u001b[24G\u001b[0K156MB\u001b[24G\u001b[0K173MB\u001b[24G\u001b[0K190MB\u001b[24G\u001b[0K206MB\u001b[24G\u001b[0K217MB\u001b[24G\u001b[0K232MB\u001b[24G\u001b[0K247MB\u001b[24G\u001b[0K265MB\u001b[24G\u001b[0K280MB\u001b[24G\u001b[0K298MB\u001b[24G\u001b[0K312MB\u001b[24G\u001b[0K317MB\u001b[24G\u001b[0K337MB\u001b[24G\u001b[0K354MB\u001b[24G\u001b[0K373MB\u001b[24G\u001b[0K393MB\u001b[24G\u001b[0K407MB\u001b[24G\u001b[0K426MB\u001b[24G\u001b[0K442MB\u001b[24G\u001b[0K457MB\u001b[24G\u001b[0K475MB\u001b[24G\u001b[0K492MB\u001b[?25h\u001b[u\u001b[2KSuccessfully copied 502MB to fuseki:/fuseki-base/\n" + ] + } + ], + "source": [ + "!docker exec fuseki rm -rf /fuseki-base/nmdc-db.tdb\n", + "!docker cp data/nmdc-db.nt.gz fuseki:/fuseki-base/" + ] + }, + { + "cell_type": "markdown", + "id": "4dca86f8-6752-4aba-8d3c-656810f3af3f", + "metadata": {}, + "source": [ + "Take server down in order to bulk-load data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "16f0621c-cf98-4a27-9165-7a0a8711db77", + "metadata": {}, + "outputs": [], + "source": [ + "!docker compose down fuseki" + ] + }, + { + "cell_type": "markdown", + "id": "fa4f9843-d5c0-4f8d-bcaf-ad2cf50c0264", + "metadata": {}, + "source": [ + "Bulk-load data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a490caff-af8a-4537-8c0b-e4a4752645bc", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "!docker compose run fuseki ./apache-jena-4.9.0/bin/tdbloader --loc /fuseki-base/nmdc-db.tdb /fuseki-base/nmdc-db.nt.gz" + ] + }, + { + "cell_type": "markdown", + "id": "69d0e50c-102a-4a8e-9bcd-ef23600afd66", + "metadata": {}, + "source": [ + "Start up server." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1a0bfb4b-e694-40b1-88af-4446e3fcc888", + "metadata": {}, + "outputs": [], + "source": [ + "!docker compose up fuseki -d" + ] + }, + { + "cell_type": "markdown", + "id": "8e528d6a-76b1-4629-82a1-58793ad6a481", + "metadata": {}, + "source": [ + "Now go to and SPARQL it up." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8695001d-9722-48a0-98e8-9ac5000551ea", + "metadata": {}, + "outputs": [], + "source": [ + "# 2024-03-14T09:40 : took <4min to run all the above." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/metadata-translation/notebooks/repl_validation_referential_integrity-1715162638.ipynb b/metadata-translation/notebooks/repl_validation_referential_integrity-1715162638.ipynb new file mode 100644 index 00000000..6d46e46b --- /dev/null +++ b/metadata-translation/notebooks/repl_validation_referential_integrity-1715162638.ipynb @@ -0,0 +1,686 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "2a66b2dc", + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, + "source": [ + "# Referential integrity checker (prototype)" + ] + }, + { + "cell_type": "markdown", + "id": "c892eac06fb1a86a", + "metadata": {}, + "source": [ + "## Prerequisites\n", + "\n", + "Before running this notebook, make sure you have done the following:\n", + "\n", + "1. Run `$ make up-dev`\n", + "2. Map `localhost:27018` to the Mongo server you want to use\n", + "3. 
Load a recent dump of the production Mongo database into that Mongo server (see `$ make mongorestore-nmdc-dev` for an example)\n", + "4. In the `.env` file, set `MONGO_HOST` to `mongodb://localhost:27018`\n", + "5. Run `$ export $(grep -v '^#' .env | xargs)` to load the environment variables defined in `.env` into your shell environment\n", + "\n", + "Once you've done all of those things, you can run this notebook (e.g. via `$ jupyter notebook`) \n" + ] + }, + { + "cell_type": "markdown", + "id": "8f03ce22", + "metadata": {}, + "source": [ + "## Enable automatic reloading of modules\n", + "\n", + "Reference: https://ipython.readthedocs.io/en/stable/config/extensions/autoreload.html#autoreload" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "f1c8bdb5", + "metadata": {}, + "outputs": [], + "source": [ + "# Ensure code changes in this notebook will be import-able \n", + "# without needing to restart the kernel and lose state\n", + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "markdown", + "id": "5121e612", + "metadata": {}, + "source": [ + "## Import Python modules" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "f7ff0664-1881-4eca-b018-4c5856dc2489", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "from linkml_runtime.utils.schemaview import SchemaView\n", + "from toolz import dissoc, assoc\n", + "from tqdm.notebook import tqdm\n", + "\n", + "from nmdc_runtime.api.db.mongo import get_mongo_db, nmdc_schema_collection_names\n", + "from nmdc_runtime.util import collection_name_to_class_names, nmdc_schema_view, nmdc_database_collection_instance_class_names\n", + "from nmdc_schema.nmdc_schema_accepting_legacy_ids import Database as NMDCDatabase\n", + "from nmdc_schema.get_nmdc_view import ViewGetter\n", + "\n", + "mdb = get_mongo_db()" + ] + }, + { + "cell_type": "markdown", + "id": "bcb5802b-8205-49b7-8784-dc137baff1a0", + "metadata": {}, + "source": [ + "## \"Pre-clean\" the 
# Sorted names from `nmdc_schema_collection_names(mdb)`, restricted to the
# collections in which at least one document has an `id` field.
collection_names = [
    name
    for name in sorted(nmdc_schema_collection_names(mdb))
    if mdb[name].find_one({"id": {"$exists": True}}) is not None
]
# check these slots for null values for all docs in collection_names
props = ["used", "git_url", "was_associated_with", "was_generated_by", "compression_type",
         "metagenome_annotation_id", "metaproteomic_analysis_id"]

# The bar is ticked once per (slot, collection) pair, so the total must be the
# product of the two lengths for the bar to finish at exactly 100%.
pbar = tqdm(total=len(props) * len(collection_names))
for p in props:
    # The {$type: 10} query matches for BSON Type Null, not just value `null`
    null_valued = {p: {"$type": 10}}
    for coll_name in collection_names:
        pbar.set_description(f"checking {coll_name}...")
        n_broken = mdb[coll_name].count_documents(null_valued)
        if n_broken:
            print(f"removing {n_broken} null-valued {p} values for {coll_name}...")
            # Update with the same filter that found the documents. Going via a
            # list of `id` values would fail on a matched document without `id`
            # and could also touch other documents that share an `id`.
            mdb[coll_name].update_many(null_valued, {"$unset": {p: ""}})
        pbar.update(1)
pbar.close()
def class_hierarchy_as_list(obj) -> list[str]:
    r"""
    Collect the class name of `obj` together with the names of its ancestor classes.

    The walk is depth-first over `__bases__`, visiting bases left to right and
    recording each class before its own bases. A class named "YAMLRoot" ends a
    branch: it is not recorded and its bases are not visited. A class reachable
    along more than one path is recorded once per path.

    TODO: Consider renaming function to be a verb; e.g. `get_class_hierarchy_as_list`.

    Args:
        obj: Any instance; only `obj.__class__` is inspected.

    Returns:
        The accumulated list of class names, starting with the class of `obj`
        (empty when that class is itself named "YAMLRoot").
    """
    names = []
    # Explicit stack instead of recursion. Bases are pushed in reverse so that
    # the left-most base is popped, and therefore visited, first.
    pending = [obj.__class__]
    while pending:
        cls = pending.pop()
        if cls.__name__ == "YAMLRoot":  # branch terminator
            continue
        names.append(cls.__name__)
        pending.extend(reversed(cls.__bases__))
    return names