From a12d84391893faccce628b5ae7d47d79b854a52d Mon Sep 17 00:00:00 2001 From: Jing Date: Sat, 11 May 2024 14:19:26 -0400 Subject: [PATCH 01/14] add notesbooks for mongo validation and RDF gen --- .../notebooks/ghissue_401_sparql.ipynb | 623 ++++++++++++++++++ ...ion_referential_integrity-1715162638.ipynb | 491 ++++++++++++++ 2 files changed, 1114 insertions(+) create mode 100644 metadata-translation/notebooks/ghissue_401_sparql.ipynb create mode 100644 metadata-translation/notebooks/repl_validation_referential_integrity-1715162638.ipynb diff --git a/metadata-translation/notebooks/ghissue_401_sparql.ipynb b/metadata-translation/notebooks/ghissue_401_sparql.ipynb new file mode 100644 index 00000000..bb7d6b60 --- /dev/null +++ b/metadata-translation/notebooks/ghissue_401_sparql.ipynb @@ -0,0 +1,623 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "51ea05af-7579-43ad-aa9c-3bf8b6da8fdb", + "metadata": {}, + "source": [ + "# Pipeline to transform the set of nmdc-schema-compliant mongodb collections to an RDF dataset amenable to SPARQL queries." + ] + }, + { + "cell_type": "markdown", + "id": "ae2673a5-560b-47b0-9608-656aa3854466", + "metadata": {}, + "source": [ + "Ensure that changes to the code will be import-able in this notebook without needing restart the kernel and thus lose state." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "a362b42f-7ae0-40cf-91d4-8f19ca1087cf", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "markdown", + "id": "0b8b1fb7-2357-46ef-8d86-69cd1dce228d", + "metadata": {}, + "source": [ + "Connect to local dockerized dev environment." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "55932d03-802f-4efe-bceb-e1036cd35567", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MONGO_HOST=mongodb://localhost:27018\n" + ] + } + ], + "source": [ + "!env | grep MONGO_HOST" + ] + }, + { + "cell_type": "markdown", + "id": "3a146763-f03a-4d65-baa0-81ca15cba689", + "metadata": {}, + "source": [ + "Initialize a db connection." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "bc72113f-5044-4646-a273-0692d2e650ea", + "metadata": {}, + "outputs": [], + "source": [ + "import os" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "edb1bb42-005c-49ca-ba59-18c24833f93f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mongodb://localhost:27018\n", + "success\n" + ] + } + ], + "source": [ + "from nmdc_runtime.api.db.mongo import get_mongo_db\n", + "print(os.getenv(\"MONGO_HOST\"))\n", + "# start 12:23\n", + "mdb = get_mongo_db()\n", + "print(\"success\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "114d9ffa-a22a-48de-9001-d04cbab175eb", + "metadata": {}, + "outputs": [], + "source": [ + "from unittest.mock import patch\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "37dbc9a8-8cac-4798-8d4f-ccbd9c3560e9", + "metadata": {}, + "source": [ + "Get all populated nmdc-schema collections with entity `id`s." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3a0dd489-74cc-47c4-b3e0-c97dd88f5b5f", + "metadata": {}, + "outputs": [], + "source": [ + "from nmdc_runtime.util import schema_collection_names_with_id_field\n", + "\n", + "populated_collections = sorted([\n", + " name for name in set(schema_collection_names_with_id_field()) & set(mdb.list_collection_names())\n", + " if mdb[name].estimated_document_count() > 0\n", + "])" + ] + }, + { + "cell_type": "markdown", + "id": "f9a45de7-ba27-4b18-8ff4-9ba44eeb1091", + "metadata": {}, + "source": [ + "Get a JSON-LD context for the NMDC Schema, to serialize documents to RDF." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9ed72826-b552-4429-8ab5-9f7126821822", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "import json\n", + "from pprint import pprint\n", + "\n", + "from linkml.generators.jsonldcontextgen import ContextGenerator\n", + "from nmdc_schema.nmdc_data import get_nmdc_schema_definition\n", + "\n", + "context = ContextGenerator(get_nmdc_schema_definition())\n", + "context = json.loads(context.serialize())[\"@context\"]\n", + "\n", + "for k, v in list(context.items()):\n", + " if isinstance(v, dict): #and v.get(\"@type\") == \"@id\":\n", + " v.pop(\"@id\", None) # use nmdc uri, not e.g. MIXS uri" + ] + }, + { + "cell_type": "markdown", + "id": "0800c5b9-d09e-4be1-899d-62fcf40a2c0e", + "metadata": {}, + "source": [ + "Ensure `nmdc:type` has a `URIRef` range, i.e. `nmdc:type a owl:ObjectProperty`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "62a68c07-0706-4300-a48d-0ab628af87b1", + "metadata": {}, + "outputs": [], + "source": [ + "context['type'] = {'@type': '@id'}" + ] + }, + { + "cell_type": "markdown", + "id": "63fe4d54-0a41-4170-9310-45e5f47a6cb5", + "metadata": {}, + "source": [ + "Initialize an in-memory graph to store triples, prior to serializing to disk." 
def split_chunk(seq, n: int):
    """
    Yield successive chunks of ``seq``, each of length ``n``.

    The final chunk is NOT padded, so it may be shorter than ``n``.

    >>> list(split_chunk(list(range(10)), 3))
    [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]
    """
    start = 0
    while start < len(seq):
        yield seq[start : start + n]
        start += n
from nmdc_runtime.util import collection_name_to_class_names

def ensure_type(doc, collection_name):
    """Return `doc` guaranteed to carry a "type" field.

    If "type" is absent, infer it from the single schema class associated
    with `collection_name`. Raises if the collection maps to more than one
    class, since the inference would be ambiguous.
    """
    if "type" in doc:
        return doc

    class_names = collection_name_to_class_names[collection_name]
    if len(class_names) > 1:
        raise Exception("cannot unambiguously infer class of document")
    return assoc(doc, "type", class_names[0])


from toolz import assoc, dissoc
from tqdm.notebook import tqdm

# Ingest mongo docs into the in-memory graph via rdflib's JSON-LD parser,
# one chunk at a time. One progress-bar tick per chunk.
chunk_size = 2_000
# Use `chunk_size` for the estimate rather than repeating the literal 2_000.
total = sum((1 + mdb[name].estimated_document_count() // chunk_size) for name in populated_collections)

pbar = tqdm(total=total)

for collection_name in populated_collections:
    print(collection_name)
    docs = [dissoc(doc, "_id") for doc in mdb[collection_name].find()]
    for chunk in split_chunk(docs, chunk_size):
        typed_chunk = [ensure_type(doc, collection_name) for doc in chunk]
        # BUG FIX: the original serialized the untyped `chunk`, so the
        # `typed_chunk` (with inferred "type" fields) never reached the graph.
        doc_jsonld = {"@context": context, "@graph": typed_chunk}
        g.parse(data=json.dumps(doc_jsonld), format="json-ld")
        pbar.update(1)
print(f"{len(g):,} triples loaded")
from rdflib import Namespace, RDF, Literal, URIRef

NMDC = Namespace("https://w3id.org/nmdc/")

# Repair URIs that end with newline characters, which break graph
# serialization.
#
# Fixes over the original:
#   * use rstrip("\n") instead of slicing with [:-2] — "\n" is ONE character,
#     so [:-2] also chopped off the last legitimate character of the URI;
#   * collect the changes during the scan and apply them afterwards, since
#     mutating the graph while iterating over it is unsafe;
#   * fix subject and object in a single pass per triple, so a triple broken
#     in both positions is not re-added with the still-broken subject.
to_remove = []
to_add = []
for s, p, o in tqdm(g, total=len(g)):
    s_fixed = URIRef(str(s).rstrip("\n")) if str(s).endswith("\n") else s
    o_fixed = o
    if isinstance(o, URIRef) and str(o).endswith("\n"):
        o_fixed = URIRef(str(o).rstrip("\n"))
    if s_fixed is not s or o_fixed is not o:
        to_remove.append((s, p, o))
        to_add.append((s_fixed, p, o_fixed))

for triple in to_remove:
    g.remove(triple)
for triple in to_add:
    g.add(triple)


# Gather all schema classes that type a schema-collection entity, together
# with those classes' ancestors ("top-level" classes), so we can find every
# slot that connects one top-level entity to another.
from linkml_runtime.utils.schemaview import SchemaView

from nmdc_runtime.util import nmdc_schema_view, nmdc_database_collection_instance_class_names

schema_view = nmdc_schema_view()
toplevel_classes = set()
for name in nmdc_database_collection_instance_class_names():
    toplevel_classes |= set(schema_view.class_ancestors(name))
# Determine which slots have a "top-level" class as their range, i.e. the
# slots that connect one schema-collection entity to another. Only @id-typed
# JSON-LD context entries denote object references; nmdc:Database-domain
# slots are excluded because they are containment, not connection.
slots = schema_view.all_slots()

toplevel_entity_connectors = set()
for slot_name, ctx_entry in context.items():
    if not (isinstance(ctx_entry, dict) and ctx_entry.get("@type") == "@id"):
        continue
    slot = slots[slot_name]
    if slot.range in toplevel_classes and slot.domain != "Database":
        toplevel_entity_connectors.add(slot_name)
print(toplevel_entity_connectors)


# (Kept for reference) Mermaid classDiagram of entity relationships:
# print("classDiagram\n")
# for slot_name in toplevel_entity_connectors:
#     slot = slots[slot_name]
#     domain = slot.domain or "NamedThing"
#     range = slot.range
#     print(f"{domain} --> {range} : {slot_name}")
#
# print()
#
# inheritance_links = set()
# for cls in toplevel_classes:
#     ancestors = schema_view.class_ancestors(cls)
#     for a in ancestors:
#         if a != cls:
#             inheritance_links.add(f"{a} <|-- {cls}")
#
# for link in inheritance_links:
#     print(link)
from rdflib import PROV  # NOTE(review): imported by the original cell but currently unused

# Assert a generic nmdc:depends_on edge for every connector slot, so the graph
# of top-level entities can be traversed without naming specific slots.
# `has_output` is inverted so dependency always points consumer -> producer.
#
# BUG FIX: the original added triples to `g` while iterating over it, which
# mutates the underlying store mid-iteration; iterate over a snapshot instead.
for s, p, o in tqdm(list(g)):
    connector = p.removeprefix(str(NMDC))
    if connector in toplevel_entity_connectors:
        if connector == "has_output":
            g.add((o, NMDC.depends_on, s))
        else:
            g.add((s, NMDC.depends_on, o))

print(f"{len(g):,} triples in total")


# Materialize superclass relations: for every nmdc:type assertion whose object
# is a top-level class, also assert each of that class's ancestors.
schema_view = nmdc_schema_view()
toplevel_classes = set()
for name in nmdc_database_collection_instance_class_names():
    toplevel_classes |= {getattr(NMDC, a) for a in schema_view.class_ancestors(name)}

nmdc_prefix = str(NMDC)
# Same mid-iteration-mutation fix as above: snapshot before adding.
for s, p, o in tqdm(list(g)):
    if p.removeprefix(nmdc_prefix) != "type":
        continue
    if o not in toplevel_classes:
        continue
    for ancestor in schema_view.class_ancestors(o.removeprefix(nmdc_prefix)):
        g.add((s, NMDC.type, getattr(NMDC, ancestor)))


# Sanity check: count of subjects now typed as nmdc:Activity.
len(list(g.subjects(NMDC.type, NMDC.Activity)))
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "125d2ad4-8433-45d8-86c4-d6a619ea5280", + "metadata": {}, + "outputs": [], + "source": [ + "import gzip\n", + "\n", + "with gzip.open('data/nmdc-db.nt.gz', 'wb') as f:\n", + " f.write(g.serialize(format='nt').encode())" + ] + }, + { + "cell_type": "markdown", + "id": "c5d8c0a2-6f75-4dac-9bb9-ac48838ad2b8", + "metadata": {}, + "source": [ + "Wipe any existing persisted data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ea0bdeee-6b3a-4074-bd73-cc9424569346", + "metadata": {}, + "outputs": [], + "source": [ + "!docker compose up fuseki -d\n", + "!docker exec fuseki rm -rf /fuseki-base/nmdc-db.tdb" + ] + }, + { + "cell_type": "markdown", + "id": "79284de7-ef52-47c6-aeb1-1453bd4b5f59", + "metadata": {}, + "source": [ + "Ensure data is present to load." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9037026c-2653-43e3-bb92-2a0eea85b213", + "metadata": {}, + "outputs": [], + "source": [ + "!docker cp data/nmdc-db.nt.gz fuseki:/fuseki-base/" + ] + }, + { + "cell_type": "markdown", + "id": "4dca86f8-6752-4aba-8d3c-656810f3af3f", + "metadata": {}, + "source": [ + "Take server down in order to bulk-load data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "16f0621c-cf98-4a27-9165-7a0a8711db77", + "metadata": {}, + "outputs": [], + "source": [ + "!docker compose down fuseki" + ] + }, + { + "cell_type": "markdown", + "id": "fa4f9843-d5c0-4f8d-bcaf-ad2cf50c0264", + "metadata": {}, + "source": [ + "Bulk-load data." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a490caff-af8a-4537-8c0b-e4a4752645bc", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "!docker compose run fuseki ./apache-jena-4.9.0/bin/tdbloader --loc /fuseki-base/nmdc-db.tdb /fuseki-base/nmdc-db.nt.gz" + ] + }, + { + "cell_type": "markdown", + "id": "69d0e50c-102a-4a8e-9bcd-ef23600afd66", + "metadata": {}, + "source": [ + "Start up server." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1a0bfb4b-e694-40b1-88af-4446e3fcc888", + "metadata": {}, + "outputs": [], + "source": [ + "!docker compose up fuseki -d" + ] + }, + { + "cell_type": "markdown", + "id": "8e528d6a-76b1-4629-82a1-58793ad6a481", + "metadata": {}, + "source": [ + "Now go to and SPARQL it up." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8695001d-9722-48a0-98e8-9ac5000551ea", + "metadata": {}, + "outputs": [], + "source": [ + "# 2024-03-14T09:40 : took <4min to run all the above." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/metadata-translation/notebooks/repl_validation_referential_integrity-1715162638.ipynb b/metadata-translation/notebooks/repl_validation_referential_integrity-1715162638.ipynb new file mode 100644 index 00000000..a4dca006 --- /dev/null +++ b/metadata-translation/notebooks/repl_validation_referential_integrity-1715162638.ipynb @@ -0,0 +1,491 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "2a66b2dc", + "metadata": {}, + "source": [ + "# imports" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "f1c8bdb5", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "f7ff0664-1881-4eca-b018-4c5856dc2489", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "from linkml_runtime.utils.schemaview import SchemaView\n", + "from toolz import dissoc, assoc\n", + "from tqdm.notebook import tqdm\n", + "\n", + "from nmdc_runtime.api.db.mongo import get_mongo_db, nmdc_schema_collection_names\n", + "from nmdc_runtime.util import collection_name_to_class_names, nmdc_schema_view, nmdc_database_collection_instance_class_names\n", + "from nmdc_schema.nmdc_schema_accepting_legacy_ids import Database as NMDCDatabase\n", + "from nmdc_schema.get_nmdc_view import ViewGetter\n", + "\n", + "mdb = get_mongo_db()" + ] + }, + { + "cell_type": "markdown", + "id": "bcb5802b-8205-49b7-8784-dc137baff1a0", + "metadata": {}, + "source": [ + "# \"pre-cleaning\"" + ] + }, + { + "cell_type": "markdown", + 
"id": "8ecb1950-eaec-469c-b7ac-949650825093", + "metadata": {}, + "source": [ + "Only consider populated collections with `id` field." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "dde4c77e-5e06-4751-930a-95906cdf89c5", + "metadata": {}, + "outputs": [], + "source": [ + "collection_names = sorted(nmdc_schema_collection_names(mdb))\n", + "collection_names = [n for n in collection_names if mdb[n].find_one({\"id\": {\"$exists\": True}})]" + ] + }, + { + "cell_type": "markdown", + "id": "cddaaa54-262d-4549-a9a9-4c280a6a6341", + "metadata": {}, + "source": [ + "Remove null-valued optional properties" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "b71ba7d2-ebd2-487d-a5cc-2a85ee14cb95", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "7c9c772648214f1faec08df226b7b44b", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/18 [00:00 10\u001b[0m nmdcdb \u001b[38;5;241m=\u001b[39m \u001b[43mNMDCDatabase\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43m{\u001b[49m\u001b[43mcoll_name\u001b[49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43m[\u001b[49m\u001b[43mdissoc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmdb\u001b[49m\u001b[43m[\u001b[49m\u001b[43mcoll_name\u001b[49m\u001b[43m]\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfind_one\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43m_id\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m]\u001b[49m\u001b[43m}\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 11\u001b[0m \u001b[38;5;66;03m# except ValueError as e:\u001b[39;00m\n\u001b[1;32m 12\u001b[0m \u001b[38;5;66;03m# print(f\"no {coll_name}!\")\u001b[39;00m\n\u001b[1;32m 13\u001b[0m \u001b[38;5;66;03m# raise e\u001b[39;00m\n\u001b[1;32m 14\u001b[0m exemplar 
\u001b[38;5;241m=\u001b[39m \u001b[38;5;28mgetattr\u001b[39m(nmdcdb, coll_name)[\u001b[38;5;241m0\u001b[39m]\n", + "File \u001b[0;32m:28\u001b[0m, in \u001b[0;36m__init__\u001b[0;34m(self, planned_process_set, functional_annotation_agg, activity_set, biosample_set, collecting_biosamples_from_site_set, data_object_set, extraction_set, field_research_site_set, functional_annotation_set, genome_feature_set, library_preparation_set, mags_activity_set, metabolomics_analysis_activity_set, metagenome_annotation_activity_set, metagenome_assembly_set, metagenome_sequencing_activity_set, metaproteomics_analysis_activity_set, metatranscriptome_activity_set, nom_analysis_activity_set, omics_processing_set, pooling_set, processed_sample_set, read_based_taxonomy_analysis_activity_set, read_qc_analysis_activity_set, study_set, **_kwargs)\u001b[0m\n", + "File \u001b[0;32m~/nmdc/nmdc-runtime/venv/lib/python3.10/site-packages/nmdc_schema/nmdc_schema_accepting_legacy_ids.py:595\u001b[0m, in \u001b[0;36mDatabase.__post_init__\u001b[0;34m(self, *_, **kwargs)\u001b[0m\n\u001b[1;32m 591\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_normalize_inlined_as_list(slot_name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcollecting_biosamples_from_site_set\u001b[39m\u001b[38;5;124m\"\u001b[39m, slot_type\u001b[38;5;241m=\u001b[39mCollectingBiosamplesFromSite, key_name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mid\u001b[39m\u001b[38;5;124m\"\u001b[39m, keyed\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[1;32m 593\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_normalize_inlined_as_list(slot_name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdata_object_set\u001b[39m\u001b[38;5;124m\"\u001b[39m, slot_type\u001b[38;5;241m=\u001b[39mDataObject, key_name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mid\u001b[39m\u001b[38;5;124m\"\u001b[39m, 
keyed\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[0;32m--> 595\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_normalize_inlined_as_list\u001b[49m\u001b[43m(\u001b[49m\u001b[43mslot_name\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mextraction_set\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mslot_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mExtraction\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkey_name\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mid\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkeyed\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[1;32m 597\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_normalize_inlined_as_list(slot_name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfield_research_site_set\u001b[39m\u001b[38;5;124m\"\u001b[39m, slot_type\u001b[38;5;241m=\u001b[39mFieldResearchSite, key_name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mid\u001b[39m\u001b[38;5;124m\"\u001b[39m, keyed\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[1;32m 599\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfunctional_annotation_agg, \u001b[38;5;28mlist\u001b[39m):\n", + "File \u001b[0;32m~/nmdc/nmdc-runtime/venv/lib/python3.10/site-packages/linkml_runtime/utils/yamlutils.py:97\u001b[0m, in \u001b[0;36mYAMLRoot._normalize_inlined_as_list\u001b[0;34m(self, slot_name, slot_type, key_name, keyed)\u001b[0m\n\u001b[1;32m 96\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m 
\u001b[38;5;21m_normalize_inlined_as_list\u001b[39m(\u001b[38;5;28mself\u001b[39m, slot_name: \u001b[38;5;28mstr\u001b[39m, slot_type: Type, key_name: \u001b[38;5;28mstr\u001b[39m, keyed: \u001b[38;5;28mbool\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m---> 97\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_normalize_inlined\u001b[49m\u001b[43m(\u001b[49m\u001b[43mslot_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mslot_type\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkey_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkeyed\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/nmdc/nmdc-runtime/venv/lib/python3.10/site-packages/linkml_runtime/utils/yamlutils.py:182\u001b[0m, in \u001b[0;36mYAMLRoot._normalize_inlined\u001b[0;34m(self, slot_name, slot_type, key_name, keyed, is_list)\u001b[0m\n\u001b[1;32m 179\u001b[0m form_1(list_entry)\n\u001b[1;32m 180\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 181\u001b[0m \u001b[38;5;66;03m# **kwargs\u001b[39;00m\n\u001b[0;32m--> 182\u001b[0m cooked_obj \u001b[38;5;241m=\u001b[39m \u001b[43mslot_type\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mas_dict\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlist_entry\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 183\u001b[0m order_up(cooked_obj[key_name], cooked_obj)\n\u001b[1;32m 184\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(list_entry, \u001b[38;5;28mlist\u001b[39m):\n\u001b[1;32m 185\u001b[0m \u001b[38;5;66;03m# *args\u001b[39;00m\n", + "File \u001b[0;32m:23\u001b[0m, in \u001b[0;36m__init__\u001b[0;34m(self, id, name, description, alternative_identifiers, designated_class, end_date, has_input, has_output, 
processing_institution, protocol_link, start_date, instrument_name, qc_status, qc_comment, has_failure_categorization, extractant, extraction_method, extraction_target, input_mass, volume, **_kwargs)\u001b[0m\n", + "File \u001b[0;32m~/nmdc/nmdc-runtime/venv/lib/python3.10/site-packages/nmdc_schema/nmdc_schema_accepting_legacy_ids.py:3932\u001b[0m, in \u001b[0;36mExtraction.__post_init__\u001b[0;34m(self, *_, **kwargs)\u001b[0m\n\u001b[1;32m 3929\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mvolume \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mvolume, QuantityValue):\n\u001b[1;32m 3930\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mvolume \u001b[38;5;241m=\u001b[39m QuantityValue(\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mas_dict(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mvolume))\n\u001b[0;32m-> 3932\u001b[0m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m__post_init__\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3933\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdesignated_class \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mstr\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mclass_class_curie)\n", + "File \u001b[0;32m~/nmdc/nmdc-runtime/venv/lib/python3.10/site-packages/nmdc_schema/nmdc_schema_accepting_legacy_ids.py:3849\u001b[0m, in \u001b[0;36mPlannedProcess.__post_init__\u001b[0;34m(self, *_, **kwargs)\u001b[0m\n\u001b[1;32m 3846\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhas_failure_categorization 
\u001b[38;5;241m=\u001b[39m [\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhas_failure_categorization] \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhas_failure_categorization \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m []\n\u001b[1;32m 3847\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhas_failure_categorization \u001b[38;5;241m=\u001b[39m [v \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(v, FailureCategorization) \u001b[38;5;28;01melse\u001b[39;00m FailureCategorization(\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mas_dict(v)) \u001b[38;5;28;01mfor\u001b[39;00m v \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhas_failure_categorization]\n\u001b[0;32m-> 3849\u001b[0m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m__post_init__\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3850\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdesignated_class \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mstr\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mclass_class_curie)\n", + "File \u001b[0;32m~/nmdc/nmdc-runtime/venv/lib/python3.10/site-packages/nmdc_schema/nmdc_schema_accepting_legacy_ids.py:828\u001b[0m, in \u001b[0;36mNamedThing.__post_init__\u001b[0;34m(self, *_, **kwargs)\u001b[0m\n\u001b[1;32m 825\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39malternative_identifiers \u001b[38;5;241m=\u001b[39m [\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39malternative_identifiers] \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39malternative_identifiers 
\u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m []\n\u001b[1;32m 826\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39malternative_identifiers \u001b[38;5;241m=\u001b[39m [v \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(v, URIorCURIE) \u001b[38;5;28;01melse\u001b[39;00m URIorCURIE(v) \u001b[38;5;28;01mfor\u001b[39;00m v \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39malternative_identifiers]\n\u001b[0;32m--> 828\u001b[0m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m__post_init__\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/nmdc/nmdc-runtime/venv/lib/python3.10/site-packages/linkml_runtime/utils/yamlutils.py:48\u001b[0m, in \u001b[0;36mYAMLRoot.__post_init__\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 46\u001b[0m v \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mrepr\u001b[39m(kwargs[k])[:\u001b[38;5;241m40\u001b[39m]\u001b[38;5;241m.\u001b[39mreplace(\u001b[38;5;124m'\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;130;01m\\\\\u001b[39;00m\u001b[38;5;124mn\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m 47\u001b[0m messages\u001b[38;5;241m.\u001b[39mappend(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mTypedNode\u001b[38;5;241m.\u001b[39myaml_loc(k)\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m Unknown argument: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mk\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m = \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mv\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m---> 48\u001b[0m 
# Rebuild the `alldocs` collection: one document per entity, with "type"
# replaced by the entity's full class-hierarchy list so type queries can
# match any ancestor class.
mdb.alldocs.drop()

n_docs_total = sum(mdb[name].estimated_document_count() for name in collection_names)
pbar = tqdm(total=n_docs_total)

for coll_name in collection_names:
    pbar.set_description(f"processing {coll_name}...")
    # Round-trip one exemplar doc through the (legacy-id-tolerant) schema
    # Database to resolve the collection's class.
    exemplar_doc = mdb[coll_name].find_one()
    if exemplar_doc is None:
        # Collection emptied since `collection_names` was computed — skip it
        # rather than crash on dissoc(None, ...).
        continue
    nmdcdb = NMDCDatabase(**{coll_name: [dissoc(exemplar_doc, "_id")]})
    exemplar = getattr(nmdcdb, coll_name)[0]
    # assumes class_hierarchy_as_list is defined/imported elsewhere in the
    # notebook session — TODO confirm
    newdoc_type = class_hierarchy_as_list(exemplar)
    mdb.alldocs.insert_many(
        [assoc(dissoc(doc, "type", "_id"), "type", newdoc_type) for doc in mdb[coll_name].find()]
    )
    pbar.update(mdb[coll_name].estimated_document_count())

pbar.close()
# NOTE: duplicate id nmdc:0078a0f981ad3f92693c2bc3b6470791 prevents
# create_index("id", unique=True) here.
mdb.alldocs.create_index("id")
print("refreshed `alldocs` collection")
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "a2dbaf22-46e9-4de7-8288-05bc8cd2e5f8", + "metadata": {}, + "outputs": [], + "source": [ + "nmdc_view = nmdc_schema_view()\n", + "toplevel_classes = set()\n", + "for name in nmdc_database_collection_instance_class_names():\n", + " toplevel_classes |= set(nmdc_view.class_ancestors(name))" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "969e13e0-25c0-4623-bcab-93097132924b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'Activity',\n", + " 'Biosample',\n", + " 'BiosampleProcessing',\n", + " 'CollectingBiosamplesFromSite',\n", + " 'DataObject',\n", + " 'Extraction',\n", + " 'FieldResearchSite',\n", + " 'FunctionalAnnotation',\n", + " 'FunctionalAnnotationAggMember',\n", + " 'GenomeFeature',\n", + " 'LibraryPreparation',\n", + " 'MagsAnalysisActivity',\n", + " 'MaterialEntity',\n", + " 'MetabolomicsAnalysisActivity',\n", + " 'MetagenomeAnnotationActivity',\n", + " 'MetagenomeAssembly',\n", + " 'MetagenomeSequencingActivity',\n", + " 'MetaproteomicsAnalysisActivity',\n", + " 'MetatranscriptomeActivity',\n", + " 'NamedThing',\n", + " 'NomAnalysisActivity',\n", + " 'OmicsProcessing',\n", + " 'PlannedProcess',\n", + " 'Pooling',\n", + " 'ProcessedSample',\n", + " 'ReadBasedTaxonomyAnalysisActivity',\n", + " 'ReadQcAnalysisActivity',\n", + " 'Site',\n", + " 'Study',\n", + " 'WorkflowExecutionActivity'}" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "toplevel_classes" + ] + }, + { + "cell_type": "markdown", + "id": "8645690e-7a9d-4f1e-8e62-0cbdde825890", + "metadata": {}, + "source": [ + "Referential integrity checking:\n", + "- \"naive\" errors collected in `not_found` list\n", + "- (hierarchy-aware) type errors (doc found, but of invalid type) collected in `invalid_type` list" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "103d70b6-24ab-41bd-8b7f-d2faaa028bdf", + 
"metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "errors = {\"not_found\": [], \"invalid_type\": []}\n", + "\n", + "n_docs_total = sum(mdb[name].estimated_document_count() for name in collection_names)\n", + "pbar = tqdm(total=n_docs_total)\n", + "\n", + "for name in sorted(collection_names):\n", + " cls_name = collection_name_to_class_names[name][0]\n", + " slot_map = {\n", + " slot.name: slot\n", + " for slot in nmdc_view.class_induced_slots(cls_name)\n", + " }\n", + " pbar.set_description(f\"processing {name}...\")\n", + " for doc in mdb[name].find():\n", + " doc = dissoc(doc, \"_id\")\n", + " for field, value in doc.items():\n", + " assert field in slot_map, f\"{name} doc {doc['id']}: field {field} not a valid slot\"\n", + " slot_range = str(slot_map[field].range)\n", + " assert slot_range, type(slot_range)\n", + " if not slot_range in toplevel_classes:\n", + " continue\n", + " if not isinstance(value, list):\n", + " value = [value]\n", + " for v in value:\n", + " if mdb.alldocs.find_one({\"id\": v}, [\"_id\"]) is None:\n", + " errors[\"not_found\"].append(f\"{name} doc {doc['id']}: field {field} referenced doc {v} not found\")\n", + " elif mdb.alldocs.find_one({\"id\": v, \"type\": slot_range}, [\"_id\"]) is None:\n", + " errors[\"invalid_type\"].append(f\"{name} doc {doc['id']}: field {field} referenced doc {v} not of type {slot_range}\")\n", + " pbar.update(1)\n", + "pbar.close() " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e01450d1-3369-4fc5-80be-9787e00a6597", + "metadata": {}, + "outputs": [], + "source": [ + "len(errors[\"not_found\"]), len(errors[\"invalid_type\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a25857f4-e26e-4896-9e5f-607e7b4bb07c", + "metadata": {}, + "outputs": [], + "source": [ + "errors[\"not_found\"][:5]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "855e232d-0e94-428e-96eb-0535c5135bee", + "metadata": {}, + "outputs": [], + 
"source": [ + "mdb.alldocs.find_one({\"id\": \"nmdc:mga0vx38\"}) is None" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "33516e3c-f10d-4c30-942b-0d01d06082f9", + "metadata": {}, + "outputs": [], + "source": [ + "errors[\"invalid_type\"][:5]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "29ec7e82-d079-4525-bd7b-d770fd69d788", + "metadata": {}, + "outputs": [], + "source": [ + "# OmicsProcessing is not subclass of Activity (!)\n", + "mdb.alldocs.find_one({\"id\": \"emsl:570856\"})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "690ea8f8-05be-4d0a-aaa4-5c04aa4c640c", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 71eec27a9baf424e23c3e4758aacc33524a22e38 Mon Sep 17 00:00:00 2001 From: Jing Date: Sat, 11 May 2024 14:26:54 -0400 Subject: [PATCH 02/14] add .tar .agz to gitignore --- .gitignore | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.gitignore b/.gitignore index ed102ea7..a15a9316 100644 --- a/.gitignore +++ b/.gitignore @@ -37,6 +37,10 @@ MANIFEST pip-log.txt pip-delete-this-directory.txt +# mongo-restore +*.tar +*.agz + # Unit test / coverage reports htmlcov/ .tox/ @@ -55,6 +59,8 @@ coverage.xml *.mo *.pot + + # Django stuff: *.log local_settings.py @@ -103,6 +109,7 @@ celerybeat.pid # Environments .env +.env.localhost .venv env/ venv/ From 3e039733341db0a6b65497646b697fe11d64cec3 Mon Sep 17 00:00:00 2001 From: Jing Date: Sat, 11 May 2024 14:51:13 -0400 Subject: [PATCH 03/14] add setup instructions --- 
...ion_referential_integrity-1715162638.ipynb | 96 ++++++++----------- 1 file changed, 40 insertions(+), 56 deletions(-) diff --git a/metadata-translation/notebooks/repl_validation_referential_integrity-1715162638.ipynb b/metadata-translation/notebooks/repl_validation_referential_integrity-1715162638.ipynb index a4dca006..e422ca42 100644 --- a/metadata-translation/notebooks/repl_validation_referential_integrity-1715162638.ipynb +++ b/metadata-translation/notebooks/repl_validation_referential_integrity-1715162638.ipynb @@ -8,6 +8,18 @@ "# imports" ] }, + { + "cell_type": "markdown", + "id": "f52d1cd4-ca97-4f43-8923-a10847e86d4b", + "metadata": {}, + "source": [ + "Before running this notebook, make sure you have done the following:\n", + "- `make up-dev` has been run and mongo is mapped to `localhost:27018`\n", + "- a recent dump of the production mongo database has been loaded to `localhost:27018` (see `make mongorestore-nmdc-dev` for an example)\n", + "- .env has updated `MONGO_HOST` to `mongodb://localhost:27018`\n", + "- `export $(grep -v '^#' .env | xargs)` has been run in the shell before running `jupyter notebook`\n" + ] + }, { "cell_type": "code", "execution_count": 1, @@ -15,6 +27,7 @@ "metadata": {}, "outputs": [], "source": [ + "# enable automatic reloading of modules before executing code\n", "%load_ext autoreload\n", "%autoreload 2" ] @@ -84,7 +97,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "7c9c772648214f1faec08df226b7b44b", + "model_id": "2b29e7fd07ac46a1965108fe9b1f4531", "version_major": 2, "version_minor": 0 }, @@ -181,38 +194,7 @@ }, { "cell_type": "code", - "execution_count": 10, - "id": "ae8a6da2-6194-4aa5-aa36-8b21f5942b40", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'_id': ObjectId('64b59413fe178b5f0339ca41'),\n", - " 'end_date': '2018-05-08',\n", - " 'has_input': ['nmdc:procsm-11-dha8mw20'],\n", - " 'has_output': ['nmdc:procsm-11-xb11xa62'],\n", - " 'id': 
'nmdc:extrp-11-k5fecy41',\n", - " 'processing_institution': 'Battelle',\n", - " 'quality_control_report': {'status': 'pass'},\n", - " 'start_date': '2017-06-07T20:26Z',\n", - " 'extraction_target': 'DNA',\n", - " 'input_mass': {'has_numeric_value': 0.25, 'has_unit': 'g'}}" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "mdb.extraction_set.estimated_document_count()\n", - "mdb.extraction_set.find_one()" - ] - }, - { - "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "id": "b2e618f3-78b9-42b6-8ea9-63d080b1b0f6", "metadata": { "scrolled": true @@ -221,35 +203,22 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "a55f9c6b3933449897439966b9e5b1b7", + "model_id": "dcc57739dcdf47058fb4246f3e929aef", "version_major": 2, "version_minor": 0 }, "text/plain": [ - " 0%| | 0/171332 [00:00 10\u001b[0m nmdcdb \u001b[38;5;241m=\u001b[39m \u001b[43mNMDCDatabase\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43m{\u001b[49m\u001b[43mcoll_name\u001b[49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43m[\u001b[49m\u001b[43mdissoc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmdb\u001b[49m\u001b[43m[\u001b[49m\u001b[43mcoll_name\u001b[49m\u001b[43m]\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfind_one\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43m_id\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m]\u001b[49m\u001b[43m}\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 11\u001b[0m \u001b[38;5;66;03m# except ValueError as e:\u001b[39;00m\n\u001b[1;32m 12\u001b[0m \u001b[38;5;66;03m# print(f\"no {coll_name}!\")\u001b[39;00m\n\u001b[1;32m 13\u001b[0m \u001b[38;5;66;03m# raise e\u001b[39;00m\n\u001b[1;32m 14\u001b[0m exemplar \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mgetattr\u001b[39m(nmdcdb, 
coll_name)[\u001b[38;5;241m0\u001b[39m]\n", - "File \u001b[0;32m:28\u001b[0m, in \u001b[0;36m__init__\u001b[0;34m(self, planned_process_set, functional_annotation_agg, activity_set, biosample_set, collecting_biosamples_from_site_set, data_object_set, extraction_set, field_research_site_set, functional_annotation_set, genome_feature_set, library_preparation_set, mags_activity_set, metabolomics_analysis_activity_set, metagenome_annotation_activity_set, metagenome_assembly_set, metagenome_sequencing_activity_set, metaproteomics_analysis_activity_set, metatranscriptome_activity_set, nom_analysis_activity_set, omics_processing_set, pooling_set, processed_sample_set, read_based_taxonomy_analysis_activity_set, read_qc_analysis_activity_set, study_set, **_kwargs)\u001b[0m\n", - "File \u001b[0;32m~/nmdc/nmdc-runtime/venv/lib/python3.10/site-packages/nmdc_schema/nmdc_schema_accepting_legacy_ids.py:595\u001b[0m, in \u001b[0;36mDatabase.__post_init__\u001b[0;34m(self, *_, **kwargs)\u001b[0m\n\u001b[1;32m 591\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_normalize_inlined_as_list(slot_name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcollecting_biosamples_from_site_set\u001b[39m\u001b[38;5;124m\"\u001b[39m, slot_type\u001b[38;5;241m=\u001b[39mCollectingBiosamplesFromSite, key_name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mid\u001b[39m\u001b[38;5;124m\"\u001b[39m, keyed\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[1;32m 593\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_normalize_inlined_as_list(slot_name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdata_object_set\u001b[39m\u001b[38;5;124m\"\u001b[39m, slot_type\u001b[38;5;241m=\u001b[39mDataObject, key_name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mid\u001b[39m\u001b[38;5;124m\"\u001b[39m, 
keyed\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[0;32m--> 595\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_normalize_inlined_as_list\u001b[49m\u001b[43m(\u001b[49m\u001b[43mslot_name\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mextraction_set\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mslot_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mExtraction\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkey_name\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mid\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkeyed\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[1;32m 597\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_normalize_inlined_as_list(slot_name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfield_research_site_set\u001b[39m\u001b[38;5;124m\"\u001b[39m, slot_type\u001b[38;5;241m=\u001b[39mFieldResearchSite, key_name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mid\u001b[39m\u001b[38;5;124m\"\u001b[39m, keyed\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[1;32m 599\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfunctional_annotation_agg, \u001b[38;5;28mlist\u001b[39m):\n", - "File \u001b[0;32m~/nmdc/nmdc-runtime/venv/lib/python3.10/site-packages/linkml_runtime/utils/yamlutils.py:97\u001b[0m, in \u001b[0;36mYAMLRoot._normalize_inlined_as_list\u001b[0;34m(self, slot_name, slot_type, key_name, keyed)\u001b[0m\n\u001b[1;32m 96\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m 
\u001b[38;5;21m_normalize_inlined_as_list\u001b[39m(\u001b[38;5;28mself\u001b[39m, slot_name: \u001b[38;5;28mstr\u001b[39m, slot_type: Type, key_name: \u001b[38;5;28mstr\u001b[39m, keyed: \u001b[38;5;28mbool\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m---> 97\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_normalize_inlined\u001b[49m\u001b[43m(\u001b[49m\u001b[43mslot_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mslot_type\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkey_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkeyed\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/nmdc/nmdc-runtime/venv/lib/python3.10/site-packages/linkml_runtime/utils/yamlutils.py:182\u001b[0m, in \u001b[0;36mYAMLRoot._normalize_inlined\u001b[0;34m(self, slot_name, slot_type, key_name, keyed, is_list)\u001b[0m\n\u001b[1;32m 179\u001b[0m form_1(list_entry)\n\u001b[1;32m 180\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 181\u001b[0m \u001b[38;5;66;03m# **kwargs\u001b[39;00m\n\u001b[0;32m--> 182\u001b[0m cooked_obj \u001b[38;5;241m=\u001b[39m \u001b[43mslot_type\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mas_dict\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlist_entry\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 183\u001b[0m order_up(cooked_obj[key_name], cooked_obj)\n\u001b[1;32m 184\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(list_entry, \u001b[38;5;28mlist\u001b[39m):\n\u001b[1;32m 185\u001b[0m \u001b[38;5;66;03m# *args\u001b[39;00m\n", - "File \u001b[0;32m:23\u001b[0m, in \u001b[0;36m__init__\u001b[0;34m(self, id, name, description, alternative_identifiers, designated_class, end_date, has_input, has_output, 
processing_institution, protocol_link, start_date, instrument_name, qc_status, qc_comment, has_failure_categorization, extractant, extraction_method, extraction_target, input_mass, volume, **_kwargs)\u001b[0m\n", - "File \u001b[0;32m~/nmdc/nmdc-runtime/venv/lib/python3.10/site-packages/nmdc_schema/nmdc_schema_accepting_legacy_ids.py:3932\u001b[0m, in \u001b[0;36mExtraction.__post_init__\u001b[0;34m(self, *_, **kwargs)\u001b[0m\n\u001b[1;32m 3929\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mvolume \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mvolume, QuantityValue):\n\u001b[1;32m 3930\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mvolume \u001b[38;5;241m=\u001b[39m QuantityValue(\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mas_dict(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mvolume))\n\u001b[0;32m-> 3932\u001b[0m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m__post_init__\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3933\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdesignated_class \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mstr\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mclass_class_curie)\n", - "File \u001b[0;32m~/nmdc/nmdc-runtime/venv/lib/python3.10/site-packages/nmdc_schema/nmdc_schema_accepting_legacy_ids.py:3849\u001b[0m, in \u001b[0;36mPlannedProcess.__post_init__\u001b[0;34m(self, *_, **kwargs)\u001b[0m\n\u001b[1;32m 3846\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhas_failure_categorization 
\u001b[38;5;241m=\u001b[39m [\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhas_failure_categorization] \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhas_failure_categorization \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m []\n\u001b[1;32m 3847\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhas_failure_categorization \u001b[38;5;241m=\u001b[39m [v \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(v, FailureCategorization) \u001b[38;5;28;01melse\u001b[39;00m FailureCategorization(\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mas_dict(v)) \u001b[38;5;28;01mfor\u001b[39;00m v \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhas_failure_categorization]\n\u001b[0;32m-> 3849\u001b[0m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m__post_init__\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3850\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdesignated_class \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mstr\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mclass_class_curie)\n", - "File \u001b[0;32m~/nmdc/nmdc-runtime/venv/lib/python3.10/site-packages/nmdc_schema/nmdc_schema_accepting_legacy_ids.py:828\u001b[0m, in \u001b[0;36mNamedThing.__post_init__\u001b[0;34m(self, *_, **kwargs)\u001b[0m\n\u001b[1;32m 825\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39malternative_identifiers \u001b[38;5;241m=\u001b[39m [\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39malternative_identifiers] \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39malternative_identifiers 
\u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m []\n\u001b[1;32m 826\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39malternative_identifiers \u001b[38;5;241m=\u001b[39m [v \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(v, URIorCURIE) \u001b[38;5;28;01melse\u001b[39;00m URIorCURIE(v) \u001b[38;5;28;01mfor\u001b[39;00m v \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39malternative_identifiers]\n\u001b[0;32m--> 828\u001b[0m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m__post_init__\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/nmdc/nmdc-runtime/venv/lib/python3.10/site-packages/linkml_runtime/utils/yamlutils.py:48\u001b[0m, in \u001b[0;36mYAMLRoot.__post_init__\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 46\u001b[0m v \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mrepr\u001b[39m(kwargs[k])[:\u001b[38;5;241m40\u001b[39m]\u001b[38;5;241m.\u001b[39mreplace(\u001b[38;5;124m'\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;130;01m\\\\\u001b[39;00m\u001b[38;5;124mn\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m 47\u001b[0m messages\u001b[38;5;241m.\u001b[39mappend(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mTypedNode\u001b[38;5;241m.\u001b[39myaml_loc(k)\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m Unknown argument: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mk\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m = \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mv\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m---> 48\u001b[0m 
\u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124m'\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;241m.\u001b[39mjoin(messages))\n", - "\u001b[0;31mValueError\u001b[0m: Unknown argument: quality_control_report = {'status': 'pass'}" + "name": "stdout", + "output_type": "stream", + "text": [ + "refreshed `alldocs` collection\n" ] } ], @@ -296,7 +265,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "id": "a2dbaf22-46e9-4de7-8288-05bc8cd2e5f8", "metadata": {}, "outputs": [], @@ -309,7 +278,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "id": "969e13e0-25c0-4623-bcab-93097132924b", "metadata": {}, "outputs": [ @@ -348,7 +317,7 @@ " 'WorkflowExecutionActivity'}" ] }, - "execution_count": 9, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -374,7 +343,22 @@ "metadata": { "scrolled": true }, - "outputs": [], + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "5b6ac6cb87b44c28aa65e77f28e5900f", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/224995 [00:00 Date: Sat, 11 May 2024 15:19:41 -0400 Subject: [PATCH 04/14] add comments and formatting --- ...ion_referential_integrity-1715162638.ipynb | 159 +++++++++++++----- 1 file changed, 116 insertions(+), 43 deletions(-) diff --git a/metadata-translation/notebooks/repl_validation_referential_integrity-1715162638.ipynb b/metadata-translation/notebooks/repl_validation_referential_integrity-1715162638.ipynb index e422ca42..44e2f661 100644 --- a/metadata-translation/notebooks/repl_validation_referential_integrity-1715162638.ipynb +++ b/metadata-translation/notebooks/repl_validation_referential_integrity-1715162638.ipynb @@ -110,6 +110,7 @@ } ], "source": [ + "# check these slots for null values for all docs in collection_names\n", "props = [\"used\", \"git_url\", \"was_associated_with\", 
\"was_generated_by\", \"compression_type\",]\n", "\n", "pbar = tqdm(total=len(collection_names))\n", @@ -158,7 +159,7 @@ "id": "5ed95ee0-03b7-4dff-80e7-92a2b24bccf4", "metadata": {}, "source": [ - "Define helper function." + "Define a helper function that takes a document and returns its class and all parent classes as a list" ] }, { @@ -194,7 +195,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 12, "id": "b2e618f3-78b9-42b6-8ea9-63d080b1b0f6", "metadata": { "scrolled": true @@ -203,7 +204,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "dcc57739dcdf47058fb4246f3e929aef", + "model_id": "e69c4fd820114e33b11ebae47f9f3e4d", "version_major": 2, "version_minor": 0 }, @@ -223,22 +224,28 @@ } ], "source": [ + "# drop any previously generated alldocs collection\n", "mdb.alldocs.drop()\n", "\n", + "# progress bar set-up\n", "n_docs_total = sum(mdb[name].estimated_document_count() for name in collection_names)\n", "pbar = tqdm(total=n_docs_total)\n", "\n", - "#- for each collection name\n", + "# for each collection name\n", "for coll_name in collection_names:\n", " pbar.set_description(f\"processing {coll_name}...\")\n", - " # try:\n", - " nmdcdb = NMDCDatabase(**{coll_name: [dissoc(mdb[coll_name].find_one(), '_id')]})\n", - " # except ValueError as e:\n", - " # print(f\"no {coll_name}!\")\n", - " # raise e\n", + " # for each doc in collection dissociate mongo-generated '_id' field\n", + " try:\n", + " nmdcdb = NMDCDatabase(**{coll_name: [dissoc(mdb[coll_name].find_one(), '_id')]})\n", + " except ValueError as e:\n", + " print(f\"no {coll_name}!\")\n", + " raise e\n", + " # calculate class_hierarchy_as_list once per collection \n", " exemplar = getattr(nmdcdb, coll_name)[0]\n", " newdoc_type = class_hierarchy_as_list(exemplar)\n", " # for each doc in collection\n", + " # replace string value for 'type' with a class_hierarchy_as_list\n", + " # and insert modified doc into materialized alldocs collection\n", " 
mdb.alldocs.insert_many([assoc(dissoc(doc, 'type', '_id'), 'type', newdoc_type) for doc in mdb[coll_name].find()])\n", " pbar.update(mdb[coll_name].estimated_document_count())\n", "\n", @@ -265,22 +272,9 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 14, "id": "a2dbaf22-46e9-4de7-8288-05bc8cd2e5f8", "metadata": {}, - "outputs": [], - "source": [ - "nmdc_view = nmdc_schema_view()\n", - "toplevel_classes = set()\n", - "for name in nmdc_database_collection_instance_class_names():\n", - " toplevel_classes |= set(nmdc_view.class_ancestors(name))" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "969e13e0-25c0-4623-bcab-93097132924b", - "metadata": {}, "outputs": [ { "data": { @@ -317,12 +311,17 @@ " 'WorkflowExecutionActivity'}" ] }, - "execution_count": 10, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "nmdc_view = nmdc_schema_view()\n", + "toplevel_classes = set()\n", + "for name in nmdc_database_collection_instance_class_names():\n", + " toplevel_classes |= set(nmdc_view.class_ancestors(name))\n", + "\n", "toplevel_classes" ] }, @@ -331,7 +330,7 @@ "id": "8645690e-7a9d-4f1e-8e62-0cbdde825890", "metadata": {}, "source": [ - "Referential integrity checking:\n", + "## Referential integrity checking:\n", "- \"naive\" errors collected in `not_found` list\n", "- (hierarchy-aware) type errors (doc found, but of invalid type) collected in `invalid_type` list" ] @@ -391,64 +390,138 @@ "pbar.close() " ] }, + { + "cell_type": "markdown", + "id": "9d2ce4a3-fb33-4b47-9c7f-a7919405ab65", + "metadata": {}, + "source": [ + "## Results" + ] + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "id": "e01450d1-3369-4fc5-80be-9787e00a6597", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "(4857, 23503)" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ 
"len(errors[\"not_found\"]), len(errors[\"invalid_type\"])" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "id": "a25857f4-e26e-4896-9e5f-607e7b4bb07c", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "['mags_activity_set doc nmdc:fdefb3fa15098906cf788f5cadf17bb3: field part_of referenced doc nmdc:mga0vx38 not found',\n", + " 'mags_activity_set doc nmdc:78f8bf24916f01d053378b1bd464cd8a: field has_input referenced doc nmdc:9003278a200d1e7921e978d4c59233c3 not found',\n", + " 'mags_activity_set doc nmdc:a57ecfc4dee4e6938a5517ad0961dcd8: field part_of referenced doc nmdc:mga08x19 not found',\n", + " 'mags_activity_set doc nmdc:3e0d8aae3b16d5bba2b3faec04391929: field part_of referenced doc nmdc:mga06z11 not found',\n", + " 'mags_activity_set doc nmdc:4417090e8ce0e96ff2867b85823d4b26: field part_of referenced doc nmdc:mga07m45 not found']" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "errors[\"not_found\"][:5]" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "id": "855e232d-0e94-428e-96eb-0535c5135bee", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "mdb.alldocs.find_one({\"id\": \"nmdc:mga0vx38\"}) is None" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 18, "id": "33516e3c-f10d-4c30-942b-0d01d06082f9", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "['data_object_set doc emsl:output_570856: field was_generated_by referenced doc emsl:570856 not of type Activity',\n", + " 'data_object_set doc emsl:output_570991: field was_generated_by referenced doc emsl:570991 not of type Activity',\n", + " 'data_object_set doc emsl:output_570998: field was_generated_by referenced doc emsl:570998 not of type 
Activity',\n", + " 'data_object_set doc emsl:output_570855: field was_generated_by referenced doc emsl:570855 not of type Activity',\n", + " 'data_object_set doc emsl:output_570823: field was_generated_by referenced doc emsl:570823 not of type Activity']" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "errors[\"invalid_type\"][:5]" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "id": "29ec7e82-d079-4525-bd7b-d770fd69d788", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "{'_id': ObjectId('663fbef9ba64633177320f59'),\n", + " 'id': 'emsl:570856',\n", + " 'name': 'Rachael_21T_04-15A_M_14Mar17_leopard_Infuse',\n", + " 'instrument_name': '21T Agilent',\n", + " 'has_input': ['emsl:2f71038a-5dd1-11ec-bf63-0242ac130002'],\n", + " 'has_output': ['emsl:output_570856'],\n", + " 'omics_type': {'has_raw_value': 'Organic Matter Characterization'},\n", + " 'part_of': ['gold:Gs0110138'],\n", + " 'description': 'High resolution MS spectra only',\n", + " 'processing_institution': 'EMSL',\n", + " 'gold_sequencing_project_identifiers': [],\n", + " 'type': ['OmicsProcessing', 'PlannedProcess', 'NamedThing']}" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# OmicsProcessing is not subclass of Activity (!)\n", "mdb.alldocs.find_one({\"id\": \"emsl:570856\"})" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "690ea8f8-05be-4d0a-aaa4-5c04aa4c640c", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { From f81dde6742b7ccfd2226b33c561aec3268101499 Mon Sep 17 00:00:00 2001 From: Jing Date: Tue, 14 May 2024 14:48:56 -0400 Subject: [PATCH 05/14] more comments --- .../notebooks/ghissue_401_sparql.ipynb | 340 ++++++++++++++---- ...ion_referential_integrity-1715162638.ipynb | 3 +- 2 files changed, 270 insertions(+), 73 deletions(-) diff --git 
a/metadata-translation/notebooks/ghissue_401_sparql.ipynb b/metadata-translation/notebooks/ghissue_401_sparql.ipynb index bb7d6b60..2a5cac60 100644 --- a/metadata-translation/notebooks/ghissue_401_sparql.ipynb +++ b/metadata-translation/notebooks/ghissue_401_sparql.ipynb @@ -10,10 +10,16 @@ }, { "cell_type": "markdown", - "id": "ae2673a5-560b-47b0-9608-656aa3854466", + "id": "0675b9ba-c8be-478a-8c72-6edf10f56d8b", "metadata": {}, "source": [ - "Ensure that changes to the code will be import-able in this notebook without needing restart the kernel and thus lose state." + "## Setup\n", + "\n", + "Before running this notebook, make sure you have done the following:\n", + "- `make up-dev` has been run and mongo is mapped to `localhost:27018`\n", + "- a recent dump of the production mongo database has been loaded to `localhost:27018` (see `make mongorestore-nmdc-dev` for an example)\n", + "- .env has updated `MONGO_HOST` to `mongodb://localhost:27018`\n", + "- `export $(grep -v '^#' .env | xargs)` has been run in the shell before running `jupyter notebook`" ] }, { @@ -23,13 +29,14 @@ "metadata": {}, "outputs": [], "source": [ + "# Ensure code changes in this notebook will be import-able without needing to restart the kernel and lose state\n", "%load_ext autoreload\n", "%autoreload 2" ] }, { "cell_type": "markdown", - "id": "0b8b1fb7-2357-46ef-8d86-69cd1dce228d", + "id": "3a456470-920d-4fd4-8040-e0bd3dcabff0", "metadata": {}, "source": [ "Connect to local dockerized dev environment." 
@@ -63,17 +70,7 @@ }, { "cell_type": "code", - "execution_count": 4, - "id": "bc72113f-5044-4646-a273-0692d2e650ea", - "metadata": {}, - "outputs": [], - "source": [ - "import os" - ] - }, - { - "cell_type": "code", - "execution_count": 5, + "execution_count": 7, "id": "edb1bb42-005c-49ca-ba59-18c24833f93f", "metadata": {}, "outputs": [ @@ -81,30 +78,16 @@ "name": "stdout", "output_type": "stream", "text": [ - "mongodb://localhost:27018\n", "success\n" ] } ], "source": [ "from nmdc_runtime.api.db.mongo import get_mongo_db\n", - "print(os.getenv(\"MONGO_HOST\"))\n", - "# start 12:23\n", "mdb = get_mongo_db()\n", "print(\"success\")" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "114d9ffa-a22a-48de-9001-d04cbab175eb", - "metadata": {}, - "outputs": [], - "source": [ - "from unittest.mock import patch\n", - "\n" - ] - }, { "cell_type": "markdown", "id": "37dbc9a8-8cac-4798-8d4f-ccbd9c3560e9", @@ -115,7 +98,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "id": "3a0dd489-74cc-47c4-b3e0-c97dd88f5b5f", "metadata": {}, "outputs": [], @@ -133,12 +116,12 @@ "id": "f9a45de7-ba27-4b18-8ff4-9ba44eeb1091", "metadata": {}, "source": [ - "Get a JSON-LD context for the NMDC Schema, to serialize documents to RDF." + "## Get a JSON-LD context for the NMDC Schema, to serialize documents to RDF" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "id": "9ed72826-b552-4429-8ab5-9f7126821822", "metadata": { "scrolled": true @@ -169,7 +152,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "id": "62a68c07-0706-4300-a48d-0ab628af87b1", "metadata": {}, "outputs": [], @@ -182,12 +165,12 @@ "id": "63fe4d54-0a41-4170-9310-45e5f47a6cb5", "metadata": {}, "source": [ - "Initialize an in-memory graph to store triples, prior to serializing to disk." 
+ "## Initialize an in-memory graph to store triples, prior to serializing to disk" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "id": "648b4f70-34d6-4c70-8d0a-ef76e7e5d96d", "metadata": {}, "outputs": [], @@ -207,7 +190,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "id": "4d802017-2a7e-4614-b662-6a0cc027b8bc", "metadata": {}, "outputs": [], @@ -228,12 +211,12 @@ "id": "dfd91d37-b1c7-46ab-b30d-de80132ec091", "metadata": {}, "source": [ - "Use `rdflib` JSON-LD parsing to ingest mongo docs to in-memory graph." + "Define a helper function to ensure each doc has exactly one type." ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "id": "86ff7261-e255-415d-a589-67637292dbdd", "metadata": {}, "outputs": [], @@ -245,33 +228,83 @@ " return doc\n", "\n", " class_names = collection_name_to_class_names[collection_name]\n", + " \n", " if len(class_names) > 1:\n", " raise Exception(\"cannot unambiguously infer class of document\")\n", + " \n", " return assoc(doc, \"type\", class_names[0])" ] }, + { + "cell_type": "markdown", + "id": "7eedd442-0f26-4829-a878-cf066b3a3912", + "metadata": {}, + "source": [ + "## Ingest mongo docs to in-memory graph \n", + "Uses `rdflib` JSON-LD parsing" + ] + }, { "cell_type": "code", "execution_count": null, "id": "4251e0b1-35dc-4f40-91e7-b9bc0d9d79e1", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "d99c33f951874aea9a4f325086bde0d0", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/124 [00:00 Activity : was_generated_by\n", + "Activity --> Activity : was_informed_by\n", + "FunctionalAnnotationAggMember --> WorkflowExecutionActivity : metagenome_annotation_id\n", + "NamedThing --> NamedThing : has_output\n", + "NamedThing --> NamedThing : part_of\n", + "Biosample --> FieldResearchSite : collected_from\n", + "NamedThing --> NamedThing : 
has_input\n", + "\n", + "MaterialEntity <|-- FieldResearchSite\n", + "Activity <|-- MetaproteomicsAnalysisActivity\n", + "NamedThing <|-- Site\n", + "NamedThing <|-- DataObject\n", + "NamedThing <|-- FieldResearchSite\n", + "MaterialEntity <|-- Site\n", + "Activity <|-- MetatranscriptomeActivity\n", + "NamedThing <|-- LibraryPreparation\n", + "WorkflowExecutionActivity <|-- MagsAnalysisActivity\n", + "NamedThing <|-- PlannedProcess\n", + "WorkflowExecutionActivity <|-- ReadBasedTaxonomyAnalysisActivity\n", + "Activity <|-- MetagenomeAssembly\n", + "WorkflowExecutionActivity <|-- NomAnalysisActivity\n", + "PlannedProcess <|-- Extraction\n", + "PlannedProcess <|-- LibraryPreparation\n", + "PlannedProcess <|-- Pooling\n", + "MaterialEntity <|-- ProcessedSample\n", + "BiosampleProcessing <|-- LibraryPreparation\n", + "NamedThing <|-- Biosample\n", + "NamedThing <|-- Pooling\n", + "NamedThing <|-- Extraction\n", + "Activity <|-- MagsAnalysisActivity\n", + "NamedThing <|-- MaterialEntity\n", + "MaterialEntity <|-- Biosample\n", + "WorkflowExecutionActivity <|-- ReadQcAnalysisActivity\n", + "NamedThing <|-- ProcessedSample\n", + "WorkflowExecutionActivity <|-- MetagenomeAnnotationActivity\n", + "NamedThing <|-- CollectingBiosamplesFromSite\n", + "NamedThing <|-- BiosampleProcessing\n", + "Activity <|-- NomAnalysisActivity\n", + "WorkflowExecutionActivity <|-- MetagenomeSequencingActivity\n", + "WorkflowExecutionActivity <|-- MetagenomeAssembly\n", + "WorkflowExecutionActivity <|-- MetatranscriptomeActivity\n", + "Activity <|-- ReadBasedTaxonomyAnalysisActivity\n", + "Activity <|-- MetagenomeAnnotationActivity\n", + "Activity <|-- WorkflowExecutionActivity\n", + "Site <|-- FieldResearchSite\n", + "BiosampleProcessing <|-- Pooling\n", + "PlannedProcess <|-- CollectingBiosamplesFromSite\n", + "Activity <|-- MetagenomeSequencingActivity\n", + "PlannedProcess <|-- BiosampleProcessing\n", + "WorkflowExecutionActivity <|-- MetabolomicsAnalysisActivity\n", + 
"WorkflowExecutionActivity <|-- MetaproteomicsAnalysisActivity\n", + "NamedThing <|-- OmicsProcessing\n", + "Activity <|-- MetabolomicsAnalysisActivity\n", + "NamedThing <|-- Study\n", + "Activity <|-- ReadQcAnalysisActivity\n", + "PlannedProcess <|-- OmicsProcessing\n" + ] + } + ], "source": [ - "# print(\"classDiagram\\n\")\n", - "# for slot_name in toplevel_entity_connectors:\n", - "# slot = slots[slot_name]\n", - "# domain = slot.domain or \"NamedThing\"\n", - "# range = slot.range\n", - "# print(f\"{domain} --> {range} : {slot_name}\")\n", + "print(\"classDiagram\\n\")\n", + "for slot_name in toplevel_entity_connectors:\n", + " slot = slots[slot_name]\n", + " domain = slot.domain or \"NamedThing\"\n", + " range = slot.range\n", + " print(f\"{domain} --> {range} : {slot_name}\")\n", "\n", - "# print()\n", + "print()\n", "\n", - "# inheritance_links = set()\n", - "# for cls in toplevel_classes:\n", - "# ancestors = schema_view.class_ancestors(cls)\n", - "# for a in ancestors:\n", - "# if a != cls:\n", - "# inheritance_links.add(f\"{a} <|-- {cls}\")\n", + "inheritance_links = set()\n", + "for cls in toplevel_classes:\n", + " ancestors = schema_view.class_ancestors(cls)\n", + " for a in ancestors:\n", + " if a != cls:\n", + " inheritance_links.add(f\"{a} <|-- {cls}\")\n", "\n", - "# for link in inheritance_links:\n", - "# print(link)" + "for link in inheritance_links:\n", + " print(link)" ] }, { @@ -404,15 +526,38 @@ "id": "63cb2cc8-ef99-4d5f-9ddf-9eb2949e9c06", "metadata": {}, "source": [ - "Now, let's assert a common `depends_on` relation for all entities connected by these slots so that we can traverse the graph of top-level entities without needing to specify any specific slot names." + "### Assert a common `depends_on` relation for all entities connected by `toplevel_entity_connectors`\n", + "This allows us to traverse the graph of top-level entities without needing to specify any specific slot names." 
] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 24, "id": "cc830d77-5ac2-482e-a4f9-dc2eed3f2ef9", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "d7bb9d2404eb41159d8d03d895fa66ed", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/15851994 [00:00 Date: Tue, 14 May 2024 15:08:36 -0400 Subject: [PATCH 06/14] add comments --- .../notebooks/ghissue_401_sparql.ipynb | 40 ++++++++++++++----- 1 file changed, 30 insertions(+), 10 deletions(-) diff --git a/metadata-translation/notebooks/ghissue_401_sparql.ipynb b/metadata-translation/notebooks/ghissue_401_sparql.ipynb index 2a5cac60..26f8194b 100644 --- a/metadata-translation/notebooks/ghissue_401_sparql.ipynb +++ b/metadata-translation/notebooks/ghissue_401_sparql.ipynb @@ -665,20 +665,32 @@ "id": "91171cf6-f435-4815-970f-a67f51254997", "metadata": {}, "source": [ - "## Serialize and store as gzipped N-Triples file." + "## Serialize and store as gzipped N-Triples file.\n", + "This can take a few minutes..." ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 35, "id": "125d2ad4-8433-45d8-86c4-d6a619ea5280", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "serializing Graph and writing to file...\n", + "success!\n" + ] + } + ], "source": [ "import gzip\n", "\n", "with gzip.open('data/nmdc-db.nt.gz', 'wb') as f:\n", - " f.write(g.serialize(format='nt').encode())" + " print(\"Serializing graph and writing to file...\") \n", + " f.write(g.serialize(format='nt').encode())\n", + " print(\"Success!\")" ] }, { @@ -686,17 +698,25 @@ "id": "c5d8c0a2-6f75-4dac-9bb9-ac48838ad2b8", "metadata": {}, "source": [ - "## Load data into a fuseki server\n", - "\n", - "Wipe any existing persisted data." + "## Load data into a dockerized fuseki server\n", + "Spin up a `fuseki` container and wipe any existing persisted data." 
] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 38, "id": "ea0bdeee-6b3a-4074-bd73-cc9424569346", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "no such service: fuseki\n", + "Error response from daemon: No such container: fuseki\n" + ] + } + ], "source": [ "!docker compose up fuseki -d\n", "!docker exec fuseki rm -rf /fuseki-base/nmdc-db.tdb" @@ -707,7 +727,7 @@ "id": "79284de7-ef52-47c6-aeb1-1453bd4b5f59", "metadata": {}, "source": [ - "Ensure data is present to load." + "Copy data into the `fuseki` container." ] }, { From 267777e4eff16c8e614cb4436270fdfe840bc710 Mon Sep 17 00:00:00 2001 From: Jing Date: Tue, 14 May 2024 15:49:35 -0400 Subject: [PATCH 07/14] add instructions for fuseki container --- .../notebooks/ghissue_401_sparql.ipynb | 169 ++++++++++++++++-- 1 file changed, 159 insertions(+), 10 deletions(-) diff --git a/metadata-translation/notebooks/ghissue_401_sparql.ipynb b/metadata-translation/notebooks/ghissue_401_sparql.ipynb index 26f8194b..f2d87d3d 100644 --- a/metadata-translation/notebooks/ghissue_401_sparql.ipynb +++ b/metadata-translation/notebooks/ghissue_401_sparql.ipynb @@ -693,18 +693,157 @@ " print(\"Success!\")" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "9777151b-ddcb-472f-a71b-48af0224de53", + "metadata": {}, + "outputs": [], + "source": [ + "## Load data into a dockerized fuseki server\n", + "\n", + "1. 
Add the following to `/nmdc-runtime/docker-compose.yaml`.\n", + "\n", + "```yml\n", + " fuseki:\n", + " container_name: fuseki\n", + " build:\n", + " dockerfile: nmdc_runtime/fuseki.Dockerfile\n", + " context: .\n", + " ports:\n", + " - \"3030:3030\"\n", + " volumes:\n", + " - ./nmdc_runtime/site/fuseki/fuseki-config.ttl:/configuration/fuseki-config.ttl\n", + " - ./nmdc_runtime/site/fuseki/shiro.ini:/fuseki/run/shiro.ini\n", + " - nmdc_runtime_fuseki_data:/fuseki-base\n", + "```" + ] + }, { "cell_type": "markdown", "id": "c5d8c0a2-6f75-4dac-9bb9-ac48838ad2b8", "metadata": {}, "source": [ - "## Load data into a dockerized fuseki server\n", - "Spin up a `fuseki` container and wipe any existing persisted data." + "\n", + "\n", + "2. Add the following to `/nmdc-runtime/nmdc-runtime/fuseki.Dockerfile`\n", + "\n", + "```Dockerfile\n", + "# Use an appropriate base image that includes Java and wget\n", + "FROM openjdk:11-jre-slim\n", + "\n", + "# Set environment variables\n", + "ENV FUSEKI_VERSION 4.9.0\n", + "ENV FUSEKI_HOME /fuseki\n", + "\n", + "# Install wget\n", + "RUN apt-get update && apt-get install -y wget && rm -rf /var/lib/apt/lists/*\n", + "\n", + "# Download and extract Fuseki\n", + "RUN wget -qO- https://archive.apache.org/dist/jena/binaries/apache-jena-fuseki-$FUSEKI_VERSION.tar.gz | tar xvz -C / && \\\n", + " mv /apache-jena-fuseki-$FUSEKI_VERSION $FUSEKI_HOME\n", + "\n", + "# Expose the default port\n", + "EXPOSE 3030\n", + "\n", + "# Download and extract Jena Commands\n", + "RUN wget -qO- https://archive.apache.org/dist/jena/binaries/apache-jena-$FUSEKI_VERSION.tar.gz | tar xvz -C / && \\\n", + " mv /apache-jena-$FUSEKI_VERSION $FUSEKI_HOME\n", + "\n", + "# Copy the Fuseki configuration file to the container\n", + "COPY ./nmdc_runtime/site/fuseki/fuseki-config.ttl $FUSEKI_HOME/configuration/\n", + "COPY ./nmdc_runtime/site/fuseki/shiro.ini $FUSEKI_HOME/run/\n", + "\n", + "# Set working directory\n", + "WORKDIR $FUSEKI_HOME\n", + "\n", + "# Command to 
start Fuseki server with preloaded data\n", + "CMD [\"./fuseki-server\", \"--config\", \"configuration/fuseki-config.ttl\"]\n", + "```\n", + "\n", + "3. Add the following to `/nmdc-runtime/nmdc-runtime/site/fuseki/shiro.ini`\n", + "```ini\n", + "[main]\n", + "localhost=org.apache.jena.fuseki.authz.LocalhostFilter\n", + "\n", + "[urls]\n", + "## Control functions open to anyone\n", + "/$/server = anon\n", + "/$/ping = anon\n", + "/$/stats = anon\n", + "/$/stats/* = anon\n", + "## and the rest are restricted to localhost\n", + "/$/** = anon\n", + "/**=anon\n", + "```\n", + "\n", + "5. Add the following to `/nmdc-runtime/nmdc-runtime/site/fuseki/fuseki-config.ttl`\n", + "```ttl\n", + "@prefix afn: .\n", + "@prefix fuseki: .\n", + "@prefix ja: .\n", + "@prefix nmdc: .\n", + "@prefix owl: .\n", + "@prefix rdf: .\n", + "@prefix rdfs: .\n", + "@prefix tdb: .\n", + "@prefix xs: .\n", + "\n", + "\n", + "\ta tdb:GraphTDB ;\n", + "\ttdb:dataset ;\n", + "\t.\n", + "\n", + "\n", + "\ta ja:RDFDataset ;\n", + "\tja:defaultGraph ;\n", + "\t.\n", + "\n", + "\n", + "\ta ja:InfModel ;\n", + "\tja:baseModel ;\n", + "\tja:reasoner [\n", + "\t\tja:reasonerURL ;\n", + "\t] ;\n", + "\t.\n", + "\n", + "\n", + "\ta fuseki:Service ;\n", + "\tfuseki:dataset ;\n", + "\tfuseki:name \"nmdc\" ;\n", + "\tfuseki:serviceQuery\n", + "\t\t\"query\" ,\n", + "\t\t\"sparql\"\n", + "\t\t;\n", + "\tfuseki:serviceReadWriteGraphStore \"data\" ;\n", + "\tfuseki:serviceUpdate \"update\" ;\n", + "\tfuseki:serviceUpload \"upload\" ;\n", + "\t.\n", + "\n", + "\n", + "\ta tdb:DatasetTDB ;\n", + "\tja:context [\n", + "\t\trdfs:comment \"Query timeout on this dataset: 10s.\" ;\n", + "\t\tja:cxtName \"arq:queryTimeout\" ;\n", + "\t\tja:cxtValue \"10000\" ;\n", + "\t] ;\n", + "\ttdb:location \"/fuseki-base/nmdc-db.tdb\" ;\n", + "\t.\n", + "\n", + "[]\n", + "\ta fuseki:Server ;\n", + "\tfuseki:services (\n", + "\t\t\n", + "\t) ;\n", + "\t.\n", + "```\n", + "\n", + ". Spin up a `fuseki` container. 
" ] }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 43, "id": "ea0bdeee-6b3a-4074-bd73-cc9424569346", "metadata": {}, "outputs": [ @@ -712,14 +851,14 @@ "name": "stdout", "output_type": "stream", "text": [ - "no such service: fuseki\n", - "Error response from daemon: No such container: fuseki\n" + "\u001b[1A\u001b[1B\u001b[0G\u001b[?25l[+] Running 1/0\n", + " \u001b[32m✔\u001b[0m Container fuseki \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n", + "\u001b[?25h" ] } ], "source": [ - "!docker compose up fuseki -d\n", - "!docker exec fuseki rm -rf /fuseki-base/nmdc-db.tdb" + "!docker compose up fuseki -d" ] }, { @@ -727,16 +866,26 @@ "id": "79284de7-ef52-47c6-aeb1-1453bd4b5f59", "metadata": {}, "source": [ - "Copy data into the `fuseki` container." + "Wipe any existing persisted data, and copy new RDF data into the `fuseki` container.\n" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 42, "id": "9037026c-2653-43e3-bb92-2a0eea85b213", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Error response from daemon: No such container: fuseki\n", + "no such directory\n" + ] + } + ], "source": [ + "!docker exec fuseki rm -rf /fuseki-base/nmdc-db.tdb\n", "!docker cp data/nmdc-db.nt.gz fuseki:/fuseki-base/" ] }, From ab15cc6be2d68232912ace895e075285ad09f9c6 Mon Sep 17 00:00:00 2001 From: Jing Date: Tue, 14 May 2024 15:52:38 -0400 Subject: [PATCH 08/14] add line to docker-compose.yml instructions --- .../notebooks/ghissue_401_sparql.ipynb | 55 ++++++++++++------- 1 file changed, 36 insertions(+), 19 deletions(-) diff --git a/metadata-translation/notebooks/ghissue_401_sparql.ipynb b/metadata-translation/notebooks/ghissue_401_sparql.ipynb index f2d87d3d..db64cd84 100644 --- a/metadata-translation/notebooks/ghissue_401_sparql.ipynb +++ b/metadata-translation/notebooks/ghissue_401_sparql.ipynb @@ -694,11 +694,9 @@ ] }, { - "cell_type": "code", - 
"execution_count": null, - "id": "9777151b-ddcb-472f-a71b-48af0224de53", + "cell_type": "markdown", + "id": "48e6d45e-0262-4b3c-982c-478377184c2b", "metadata": {}, - "outputs": [], "source": [ "## Load data into a dockerized fuseki server\n", "\n", @@ -711,21 +709,23 @@ " dockerfile: nmdc_runtime/fuseki.Dockerfile\n", " context: .\n", " ports:\n", - " - \"3030:3030\"\n", + " - \"3030:3030\" # modify port if you already have a service running on localhost:3030\n", " volumes:\n", " - ./nmdc_runtime/site/fuseki/fuseki-config.ttl:/configuration/fuseki-config.ttl\n", " - ./nmdc_runtime/site/fuseki/shiro.ini:/fuseki/run/shiro.ini\n", " - nmdc_runtime_fuseki_data:/fuseki-base\n", + "\n", + "volumes:\n", + " nmdc_runtime_fuseki_data:\n", + " driver: local\n", "```" ] }, { "cell_type": "markdown", - "id": "c5d8c0a2-6f75-4dac-9bb9-ac48838ad2b8", + "id": "3bbf838c-70ec-48e6-ad31-55c61f196195", "metadata": {}, "source": [ - "\n", - "\n", "2. Add the following to `/nmdc-runtime/nmdc-runtime/fuseki.Dockerfile`\n", "\n", "```Dockerfile\n", @@ -759,8 +759,14 @@ "\n", "# Command to start Fuseki server with preloaded data\n", "CMD [\"./fuseki-server\", \"--config\", \"configuration/fuseki-config.ttl\"]\n", - "```\n", - "\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "b96560c3-a531-4f8f-be35-6e1a911a90ac", + "metadata": {}, + "source": [ "3. Add the following to `/nmdc-runtime/nmdc-runtime/site/fuseki/shiro.ini`\n", "```ini\n", "[main]\n", @@ -775,9 +781,15 @@ "## and the rest are restricted to localhost\n", "/$/** = anon\n", "/**=anon\n", - "```\n", - "\n", - "5. Add the following to `/nmdc-runtime/nmdc-runtime/site/fuseki/fuseki-config.ttl`\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "c5d8c0a2-6f75-4dac-9bb9-ac48838ad2b8", + "metadata": {}, + "source": [ + "4. 
Add the following to `/nmdc-runtime/nmdc-runtime/site/fuseki/fuseki-config.ttl`\n", "```ttl\n", "@prefix afn: .\n", "@prefix fuseki: .\n", @@ -836,9 +848,15 @@ "\t\t\n", "\t) ;\n", "\t.\n", - "```\n", - "\n", - ". Spin up a `fuseki` container. " + "```" + ] + }, + { + "cell_type": "markdown", + "id": "e1062b76-b7dc-4693-b5ad-91aa9aed490b", + "metadata": {}, + "source": [ + "5. Spin up a `fuseki` container. " ] }, { @@ -871,7 +889,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 44, "id": "9037026c-2653-43e3-bb92-2a0eea85b213", "metadata": {}, "outputs": [ @@ -879,8 +897,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Error response from daemon: No such container: fuseki\n", - "no such directory\n" + "\u001b[sPreparing to copy...\u001b[?25l\u001b[u\u001b[2KCopying to container - 0B\u001b[24G\u001b[0K14.5MB\u001b[24G\u001b[0K31.1MB\u001b[24G\u001b[0K46.9MB\u001b[24G\u001b[0K64.2MB\u001b[24G\u001b[0K81.1MB\u001b[24G\u001b[0K96.7MB\u001b[24G\u001b[0K109MB\u001b[24G\u001b[0K123MB\u001b[24G\u001b[0K139MB\u001b[24G\u001b[0K147MB\u001b[24G\u001b[0K156MB\u001b[24G\u001b[0K173MB\u001b[24G\u001b[0K190MB\u001b[24G\u001b[0K206MB\u001b[24G\u001b[0K217MB\u001b[24G\u001b[0K232MB\u001b[24G\u001b[0K247MB\u001b[24G\u001b[0K265MB\u001b[24G\u001b[0K280MB\u001b[24G\u001b[0K298MB\u001b[24G\u001b[0K312MB\u001b[24G\u001b[0K317MB\u001b[24G\u001b[0K337MB\u001b[24G\u001b[0K354MB\u001b[24G\u001b[0K373MB\u001b[24G\u001b[0K393MB\u001b[24G\u001b[0K407MB\u001b[24G\u001b[0K426MB\u001b[24G\u001b[0K442MB\u001b[24G\u001b[0K457MB\u001b[24G\u001b[0K475MB\u001b[24G\u001b[0K492MB\u001b[?25h\u001b[u\u001b[2KSuccessfully copied 502MB to fuseki:/fuseki-base/\n" ] } ], From d04a46e887fa72147ddc8b1b898b56e97b91230f Mon Sep 17 00:00:00 2001 From: Jing Date: Thu, 16 May 2024 16:03:10 -0400 Subject: [PATCH 09/14] add comments --- ...ion_referential_integrity-1715162638.ipynb | 43 +++++++++++-------- 1 file changed, 26 insertions(+), 17 deletions(-) diff --git 
a/metadata-translation/notebooks/repl_validation_referential_integrity-1715162638.ipynb b/metadata-translation/notebooks/repl_validation_referential_integrity-1715162638.ipynb index b1376c81..d03f1366 100644 --- a/metadata-translation/notebooks/repl_validation_referential_integrity-1715162638.ipynb +++ b/metadata-translation/notebooks/repl_validation_referential_integrity-1715162638.ipynb @@ -3,7 +3,9 @@ { "cell_type": "markdown", "id": "2a66b2dc", - "metadata": {}, + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, "source": [ "# imports" ] @@ -59,7 +61,7 @@ "id": "bcb5802b-8205-49b7-8784-dc137baff1a0", "metadata": {}, "source": [ - "# \"pre-cleaning\"" + "# Pre-cleaning" ] }, { @@ -91,33 +93,32 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 1, "id": "b71ba7d2-ebd2-487d-a5cc-2a85ee14cb95", "metadata": {}, "outputs": [ { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "2b29e7fd07ac46a1965108fe9b1f4531", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/18 [00:00 4\u001b[0m pbar \u001b[38;5;241m=\u001b[39m \u001b[43mtqdm\u001b[49m(total\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mlen\u001b[39m(collection_names))\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m p \u001b[38;5;129;01min\u001b[39;00m props:\n\u001b[1;32m 6\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m coll_name \u001b[38;5;129;01min\u001b[39;00m collection_names:\n", + "\u001b[0;31mNameError\u001b[0m: name 'tqdm' is not defined" + ] } ], "source": [ "# check these slots for null values for all docs in collection_names\n", - "props = [\"used\", \"git_url\", \"was_associated_with\", \"was_generated_by\", \"compression_type\",]\n", + "props = [\"used\", \"git_url\", \"was_associated_with\", \"was_generated_by\", \"compression_type\", \n", + " \"metagenome_annotation_id\", \"metaproteomic_analysis_id\"] \n", "\n", "pbar = tqdm(total=len(collection_names))\n", "for p in props:\n", " for coll_name in 
collection_names:\n", " pbar.set_description(f\"checking {coll_name}...\")\n", + " # The {$type: 10} query matches for BSON Type Null, not just value `null`\n", " docs_broken = list(mdb[coll_name].find({p: {\"$type\": 10}}, [\"id\"]))\n", " if docs_broken:\n", " print(f\"removing {len(docs_broken)} null-valued {p} values for {coll_name}...\")\n", @@ -244,14 +245,22 @@ " # calculate class_hierarchy_as_list once per collection \n", " exemplar = getattr(nmdcdb, coll_name)[0]\n", " newdoc_type = class_hierarchy_as_list(exemplar)\n", + " \n", " # for each doc in collection\n", " # replace string value for 'type' with a class_hierarchy_as_list\n", " # and insert modified doc into materialized alldocs collection\n", + " \n", + " # NOTE: `type` is currently a string, does not exist for all classes, and can have typos. \n", + " # Both of these are fixed in berkeley schema but is risky to use at this time\n", + " \n", " mdb.alldocs.insert_many([assoc(dissoc(doc, 'type', '_id'), 'type', newdoc_type) for doc in mdb[coll_name].find()])\n", " pbar.update(mdb[coll_name].estimated_document_count())\n", "\n", "pbar.close()\n", - "mdb.alldocs.create_index(\"id\") # WTF... 
nmdc:0078a0f981ad3f92693c2bc3b6470791 prevents mdb.alldocs.create_index(\"id\", unique=True)\n", + "\n", + "# Prior to re-ID-ing, some IDs are not unique across Mongo collections (eg nmdc:0078a0f981ad3f92693c2bc3b6470791)\n", + "# Re-idx for `alldocs` collection\n", + "mdb.alldocs.create_index(\"id\")\n", "print(\"refreshed `alldocs` collection\")" ] }, From 4c298ab666f9fb800a1e50bc7f638f928f1ad334 Mon Sep 17 00:00:00 2001 From: Jing Cao Date: Thu, 16 May 2024 19:44:12 -0400 Subject: [PATCH 10/14] Update comments --- .../repl_validation_referential_integrity-1715162638.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metadata-translation/notebooks/repl_validation_referential_integrity-1715162638.ipynb b/metadata-translation/notebooks/repl_validation_referential_integrity-1715162638.ipynb index d03f1366..9b53d093 100644 --- a/metadata-translation/notebooks/repl_validation_referential_integrity-1715162638.ipynb +++ b/metadata-translation/notebooks/repl_validation_referential_integrity-1715162638.ipynb @@ -18,7 +18,7 @@ "Before running this notebook, make sure you have done the following:\n", "- `make up-dev` has been run and mongo is mapped to `localhost:27018`\n", "- a recent dump of the production mongo database has been loaded to `localhost:27018` (see `make mongorestore-nmdc-dev` for an example)\n", - "- .env has updated `MONGO_HOST` to `mongodb://localhost:27018`\n", + "- `.env` has updated `MONGO_HOST` to `mongodb://localhost:27018`\n", "- `export $(grep -v '^#' .env | xargs)` has been run in the shell before running `jupyter notebook`\n" ] }, From e94c0ed0f4a32d2228c210f61f62076ac064bcb1 Mon Sep 17 00:00:00 2001 From: eecavanna Date: Mon, 27 May 2024 14:03:24 -0700 Subject: [PATCH 11/14] Clarify prose and add comments, type hints, and `TODO`s --- ...ion_referential_integrity-1715162638.ipynb | 130 ++++++++++++++---- 1 file changed, 100 insertions(+), 30 deletions(-) diff --git 
a/metadata-translation/notebooks/repl_validation_referential_integrity-1715162638.ipynb b/metadata-translation/notebooks/repl_validation_referential_integrity-1715162638.ipynb index 9b53d093..78e0c900 100644 --- a/metadata-translation/notebooks/repl_validation_referential_integrity-1715162638.ipynb +++ b/metadata-translation/notebooks/repl_validation_referential_integrity-1715162638.ipynb @@ -7,19 +7,35 @@ "jp-MarkdownHeadingCollapsed": true }, "source": [ - "# imports" + "# Referential integrity checker (prototype)" ] }, { "cell_type": "markdown", - "id": "f52d1cd4-ca97-4f43-8923-a10847e86d4b", + "id": "c892eac06fb1a86a", "metadata": {}, "source": [ + "## Prerequisites\n", + "\n", "Before running this notebook, make sure you have done the following:\n", - "- `make up-dev` has been run and mongo is mapped to `localhost:27018`\n", - "- a recent dump of the production mongo database has been loaded to `localhost:27018` (see `make mongorestore-nmdc-dev` for an example)\n", - "- `.env` has updated `MONGO_HOST` to `mongodb://localhost:27018`\n", - "- `export $(grep -v '^#' .env | xargs)` has been run in the shell before running `jupyter notebook`\n" + "\n", + "1. Run `$ make up-dev`\n", + "2. Map `localhost:27018` to the Mongo server you want to use\n", + "3. Load a recent dump of the production Mongo database into that Mongo server (see `$ make mongorestore-nmdc-dev` for an example)\n", + "4. In the `.env` file, set `MONGO_HOST` to `mongodb://localhost:27018`\n", + "5. Run `$ export $(grep -v '^#' .env | xargs)` to load the environment variables defined in `.env` into your shell environment\n", + "\n", + "Once you've done all of those things, you can run this notebook (e.g. 
via `$ jupyter notebook`) \n" + ] + }, + { + "cell_type": "markdown", + "id": "8f03ce22", + "metadata": {}, + "source": [ + "## Enable automatic reloading of modules\n", + "\n", + "Reference: https://ipython.readthedocs.io/en/stable/config/extensions/autoreload.html#autoreload" ] }, { @@ -35,6 +51,14 @@ "%autoreload 2" ] }, + { + "cell_type": "markdown", + "id": "5121e612", + "metadata": {}, + "source": [ + "## Import Python modules" + ] + }, { "cell_type": "code", "execution_count": 2, @@ -61,7 +85,7 @@ "id": "bcb5802b-8205-49b7-8784-dc137baff1a0", "metadata": {}, "source": [ - "# Pre-cleaning" + "## \"Pre-clean\" the data" ] }, { @@ -69,7 +93,11 @@ "id": "8ecb1950-eaec-469c-b7ac-949650825093", "metadata": {}, "source": [ - "Only consider populated collections with `id` field." + "Determine the name of each Mongo collection in which at least one document has a field named `id`.\n", + "\n", + "> **TODO:** Documents in the [`functional_annotation_agg` collection](https://microbiomedata.github.io/nmdc-schema/FunctionalAnnotationAggMember/) do not have a field named `id`, and so will not be included here. Document the author's rationale for omitting it.\n", + "\n", + "> **TODO:** The `nmdc_schema_collection_names` function combines the collection names in Mongo with the Database slots in the schema, and then omits some collection names. Document why the author took that approach." ] }, { @@ -88,7 +116,11 @@ "id": "cddaaa54-262d-4549-a9a9-4c280a6a6341", "metadata": {}, "source": [ - "Remove null-valued optional properties" + "### Remove fields that contain null\n", + "\n", + "Remove specific fields from specific documents in the above collections, if the field's name appears in our hard-coded list (see the cell below for the list) and — in that document — the field consists of a null value.\n", + "\n", + "> **TODO:** Document how the author obtained this list and whether the list would require maintenance over time." 
] }, { @@ -134,7 +166,7 @@ "id": "21c2f771-b8da-466a-90e8-2c17ac5e6388", "metadata": {}, "source": [ - "# materialize single-collection db view" + "## Materialize single-collection view of database" ] }, { @@ -142,7 +174,9 @@ "id": "56d6c224-ec80-4ac9-9dcf-bf04b33a61f9", "metadata": {}, "source": [ - "Check assumption that every populated collection currently has documents of one type only." + "Check assumption that every populated collection currently has documents of one type only.\n", + "\n", + "> **TODO:** The \"class_names\" part of the `collection_name_to_class_names` dictionary does not list _descendant_ classes, even though the schema will allow instances of descendant classes to reside in those collections. Document why disregarding descendant classes here is OK." ] }, { @@ -161,7 +195,7 @@ "id": "5ed95ee0-03b7-4dff-80e7-92a2b24bccf4", "metadata": {}, "source": [ - "Define a helper function that takes a document and returns its class and all parent classes as a list" + "Define a helper function that takes a class instance and returns a list of the names of its own class and its ancestor classes." ] }, { @@ -171,20 +205,29 @@ "metadata": {}, "outputs": [], "source": [ - "def class_hierarchy_as_list(obj):\n", + "def class_hierarchy_as_list(obj) -> list[str]:\n", + " r\"\"\"\n", + " Returns a list consisting of the name of the class of the instance pass in,\n", + " and the names of all of its ancestor classes.\n", + "\n", + " TODO: Consider renaming function to be a verb; e.g. 
`get_class_hierarchy_as_list`.\n", + "\n", + " TODO: Document the purpose of the `rv` list (does not seem to be used anywhere).\n", + " \"\"\"\n", + "\n", " rv = []\n", " current_class = obj.__class__\n", " \n", " def recurse_through_bases(cls):\n", " name = cls.__name__\n", - " if name == \"YAMLRoot\":\n", + " if name == \"YAMLRoot\": # base case\n", " return rv\n", " rv.append(name)\n", " for base in cls.__bases__:\n", - " recurse_through_bases(base)\n", + " recurse_through_bases(base) # recursive invocation\n", " return rv\n", " \n", - " return recurse_through_bases(current_class)" + " return recurse_through_bases(current_class) # initial invocation" ] }, { @@ -192,12 +235,14 @@ "id": "b962e3c8-a346-49c5-8470-915f3cf9eb07", "metadata": {}, "source": [ - "Materialize `alldocs` collection, associating all inherited classes with document via `type` field." + "Materialize `alldocs` collection, associating all inherited classes with document via `type` field.\n", + "\n", + "> **TODO:** Clarify the above sentence." ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "id": "b2e618f3-78b9-42b6-8ea9-63d080b1b0f6", "metadata": { "scrolled": true @@ -226,29 +271,37 @@ } ], "source": [ - "# drop any previously generated alldocs collection\n", + "# Drop any existing `alldocs` collection (e.g. 
from previous use of this notebook).\n", "mdb.alldocs.drop()\n", "\n", - "# progress bar set-up\n", + "# Set up progress bar\n", "n_docs_total = sum(mdb[name].estimated_document_count() for name in collection_names)\n", "pbar = tqdm(total=n_docs_total)\n", "\n", "# for each collection name\n", "for coll_name in collection_names:\n", " pbar.set_description(f\"processing {coll_name}...\")\n", - " # for each doc in collection dissociate mongo-generated '_id' field\n", + " # for each doc in collection, remove the mongo-generated '_id' field\n", " try:\n", " nmdcdb = NMDCDatabase(**{coll_name: [dissoc(mdb[coll_name].find_one(), '_id')]})\n", " except ValueError as e:\n", " print(f\"no {coll_name}!\")\n", " raise e\n", - " # calculate class_hierarchy_as_list once per collection \n", - " exemplar = getattr(nmdcdb, coll_name)[0]\n", - " newdoc_type = class_hierarchy_as_list(exemplar)\n", + "\n", + " # Calculate class_hierarchy_as_list once per collection.\n", + " #\n", + " # Note: This seems to assume that the class hierarchy is identical for each document\n", + " # in a given collection, which may not be the case since a collection whose\n", + " # range is a \"parent\" class can store instances of descendant classes (and the\n", + " # class hierarchy of the latter would differ from that of the former).\n", + " #\n", + " exemplar = getattr(nmdcdb, coll_name)[0] # get first instance (i.e. document) in list\n", + " newdoc_type: list[str] = class_hierarchy_as_list(exemplar)\n", " \n", - " # for each doc in collection\n", - " # replace string value for 'type' with a class_hierarchy_as_list\n", - " # and insert modified doc into materialized alldocs collection\n", + " # For each document in this collection, replace the value of the `type` field with\n", + " # a _list_ of the document's own class and ancestor classes, remove the `_id` field,\n", + " # and insert the resulting document into the `alldocs` collection. 
Note that we are not\n", + " # relying on the original value of the `type` field, since it's unreliable (see below).\n", " \n", " # NOTE: `type` is currently a string, does not exist for all classes, and can have typos. \n", " # Both of these are fixed in berkeley schema but is risky to use at this time\n", @@ -264,12 +317,20 @@ "print(\"refreshed `alldocs` collection\")" ] }, + { + "cell_type": "markdown", + "id": "f0569fde", + "metadata": {}, + "source": [ + "The resulting `alldocs` collection contains a copy of every document from every Mongo collection identified earlier. The copy is the same as the original document, except that its `type` field contains a list of the names of its own class and all of its ancestor classes (whereas, the original document's `type` field contains an unreliable string)." + ] + }, { "cell_type": "markdown", "id": "ca194c0f-7417-41d2-bea8-a5a54392fee6", "metadata": {}, "source": [ - "# Validation" + "## Validate" ] }, { @@ -277,7 +338,9 @@ "id": "ab859bb2-808c-48e2-8412-d8a3a79ca4e8", "metadata": {}, "source": [ - "Collect \"top level\" (nmdc:Database slot range) classes." 
+ "Collect \"top level\" (`nmdc:Database` slot range) classes.\n", + "\n", + "Reference: https://linkml.io/linkml/developers/schemaview.html#linkml_runtime.utils.schemaview.SchemaView.class_ancestors" ] }, { @@ -330,6 +393,12 @@ "nmdc_view = nmdc_schema_view()\n", "toplevel_classes = set()\n", "for name in nmdc_database_collection_instance_class_names():\n", + " # TODO: Document why class _ancestors_ are being included here.\n", + " # A (hypothetical) collection whose range is \"Chihuahua\" wouldn't\n", + " # be allowed to store non-\"Chihuahua\" instances of \"Dog\" or \"Animal\".\n", + " #\n", + " # Note: `a |= b` is same as `a = a | b` (union two sets and store the result).\n", + " #\n", " toplevel_classes |= set(nmdc_view.class_ancestors(name))\n", "\n", "toplevel_classes" @@ -340,7 +409,8 @@ "id": "8645690e-7a9d-4f1e-8e62-0cbdde825890", "metadata": {}, "source": [ - "## Referential integrity checking:\n", + "### Check referential integrity\n", + "\n", "- \"naive\" errors collected in `not_found` list\n", "- (hierarchy-aware) type errors (doc found, but of invalid type) collected in `invalid_type` list" ] From b7e4455a5ed6ebec86890a1e00642afff11b064e Mon Sep 17 00:00:00 2001 From: eecavanna Date: Mon, 27 May 2024 16:15:10 -0700 Subject: [PATCH 12/14] Add `TODO` about omitting irrelevant fields from `alldocs` collection --- .../repl_validation_referential_integrity-1715162638.ipynb | 3 +++ 1 file changed, 3 insertions(+) diff --git a/metadata-translation/notebooks/repl_validation_referential_integrity-1715162638.ipynb b/metadata-translation/notebooks/repl_validation_referential_integrity-1715162638.ipynb index 78e0c900..c25d8d7e 100644 --- a/metadata-translation/notebooks/repl_validation_referential_integrity-1715162638.ipynb +++ b/metadata-translation/notebooks/repl_validation_referential_integrity-1715162638.ipynb @@ -305,6 +305,9 @@ " \n", " # NOTE: `type` is currently a string, does not exist for all classes, and can have typos. 
\n", " # Both of these are fixed in berkeley schema but is risky to use at this time\n", + "\n", + " # TODO: Consider omitting fields that neither (a) are the `id` field, nor (b) have the potential\n", + " # to reference a document. Those fields aren't related to referential integrity.\n", " \n", " mdb.alldocs.insert_many([assoc(dissoc(doc, 'type', '_id'), 'type', newdoc_type) for doc in mdb[coll_name].find()])\n", " pbar.update(mdb[coll_name].estimated_document_count())\n", From f607d9e865c8ee93c899d0fd5b38227bee124ac1 Mon Sep 17 00:00:00 2001 From: eecavanna Date: Mon, 27 May 2024 22:15:43 -0700 Subject: [PATCH 13/14] Add comments and prose to final two sections of notebook --- ...ion_referential_integrity-1715162638.ipynb | 20 +++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/metadata-translation/notebooks/repl_validation_referential_integrity-1715162638.ipynb b/metadata-translation/notebooks/repl_validation_referential_integrity-1715162638.ipynb index c25d8d7e..e7349ce3 100644 --- a/metadata-translation/notebooks/repl_validation_referential_integrity-1715162638.ipynb +++ b/metadata-translation/notebooks/repl_validation_referential_integrity-1715162638.ipynb @@ -414,8 +414,12 @@ "source": [ "### Check referential integrity\n", "\n", - "- \"naive\" errors collected in `not_found` list\n", - "- (hierarchy-aware) type errors (doc found, but of invalid type) collected in `invalid_type` list" + "In this cell, we populate two lists:\n", + "\n", + "- `errors.not_found`: a list of \"naive\" errors\n", + "- `errors.invalid_type`: a list of (hierarchy-aware) type errors (document was found, but is of an invalid type)\n", + "\n", + "Reference: https://linkml.io/linkml/developers/schemaview.html#linkml_runtime.utils.schemaview.SchemaView.class_induced_slots" ] }, { @@ -442,13 +446,25 @@ } ], "source": [ + "# Initialize error lists.\n", "errors = {\"not_found\": [], \"invalid_type\": []}\n", "\n", + "# Initialize progress bar.\n", + "#\n", + 
"# TODO: Explain why the author has opted to count the documents in the original collections,\n", + "# even though the `alldocs` collection exists now.\n", + "#\n", "n_docs_total = sum(mdb[name].estimated_document_count() for name in collection_names)\n", "pbar = tqdm(total=n_docs_total)\n", "\n", + "# Iterate over each collection.\n", "for name in sorted(collection_names):\n", + " # Note: We already confirmed (in a different cell of this notebook)\n", + " # that each `class_names` list has exactly one item.\n", " cls_name = collection_name_to_class_names[name][0]\n", + " # Make a dictionary of slot names to slot definitions. The set of slots here is (to quote the\n", + " # LinkML SchemaView documentation) \"all slots that are asserted or inferred for [the] class,\n", + " # with their inferred semantics.\"\n", " slot_map = {\n", " slot.name: slot\n", " for slot in nmdc_view.class_induced_slots(cls_name)\n", From 096b88e364ed79d9c84c10e174d56cccff2b3eae Mon Sep 17 00:00:00 2001 From: eecavanna Date: Mon, 27 May 2024 22:20:36 -0700 Subject: [PATCH 14/14] Add comments and prose to final two sections of notebook (for reals) --- ...ion_referential_integrity-1715162638.ipynb | 47 +++++++++++++++++-- 1 file changed, 43 insertions(+), 4 deletions(-) diff --git a/metadata-translation/notebooks/repl_validation_referential_integrity-1715162638.ipynb b/metadata-translation/notebooks/repl_validation_referential_integrity-1715162638.ipynb index e7349ce3..6d46e46b 100644 --- a/metadata-translation/notebooks/repl_validation_referential_integrity-1715162638.ipynb +++ b/metadata-translation/notebooks/repl_validation_referential_integrity-1715162638.ipynb @@ -451,17 +451,18 @@ "\n", "# Initialize progress bar.\n", "#\n", - "# TODO: Explain why the author has opted to count the documents in the original collections,\n", - "# even though the `alldocs` collection exists now.\n", + "# TODO: Explain why the author has opted to count (and then—later—iterate over) the documents\n", + "# in 
the original collections, even though the `alldocs` collection exists already.\n", "#\n", "n_docs_total = sum(mdb[name].estimated_document_count() for name in collection_names)\n", "pbar = tqdm(total=n_docs_total)\n", "\n", - "# Iterate over each collection.\n", + "# Iterate over each collection name.\n", "for name in sorted(collection_names):\n", " # Note: We already confirmed (in a different cell of this notebook)\n", " # that each `class_names` list has exactly one item.\n", " cls_name = collection_name_to_class_names[name][0]\n", + " \n", " # Make a dictionary of slot names to slot definitions. The set of slots here is (to quote the\n", " # LinkML SchemaView documentation) \"all slots that are asserted or inferred for [the] class,\n", " # with their inferred semantics.\"\n", @@ -470,8 +471,12 @@ " for slot in nmdc_view.class_induced_slots(cls_name)\n", " }\n", " pbar.set_description(f\"processing {name}...\")\n", + " \n", + " # Iterate over each document (as a dictionary) in this collection.\n", " for doc in mdb[name].find():\n", " doc = dissoc(doc, \"_id\")\n", + "\n", + " # Iterate over each key/value pair in the dictionary (document).\n", " for field, value in doc.items():\n", " assert field in slot_map, f\"{name} doc {doc['id']}: field {field} not a valid slot\"\n", " slot_range = str(slot_map[field].range)\n", @@ -494,7 +499,9 @@ "id": "9d2ce4a3-fb33-4b47-9c7f-a7919405ab65", "metadata": {}, "source": [ - "## Results" + "## Results\n", + "\n", + "Display the number errors in each list." ] }, { @@ -518,6 +525,14 @@ "len(errors[\"not_found\"]), len(errors[\"invalid_type\"])" ] }, + { + "cell_type": "markdown", + "id": "54a560df", + "metadata": {}, + "source": [ + "Display a few errors from one of the lists, as an example." + ] + }, { "cell_type": "code", "execution_count": 16, @@ -543,6 +558,14 @@ "errors[\"not_found\"][:5]" ] }, + { + "cell_type": "markdown", + "id": "c55c7524", + "metadata": {}, + "source": [ + "Spot check one of those errors." 
+ ] + }, { "cell_type": "code", "execution_count": 17, @@ -564,6 +587,14 @@ "mdb.alldocs.find_one({\"id\": \"nmdc:mga0vx38\"}) is None" ] }, + { + "cell_type": "markdown", + "id": "2bd191cd", + "metadata": {}, + "source": [ + "Display a few errors from the other one of the lists, as an example." + ] + }, { "cell_type": "code", "execution_count": 18, @@ -589,6 +620,14 @@ "errors[\"invalid_type\"][:5]" ] }, + { + "cell_type": "markdown", + "id": "d4abec53", + "metadata": {}, + "source": [ + "Spot check one of those errors." + ] + }, { "cell_type": "code", "execution_count": 19,