diff --git a/.gitignore b/.gitignore
index ed102ea7..a15a9316 100644
--- a/.gitignore
+++ b/.gitignore
@@ -37,6 +37,10 @@ MANIFEST
 pip-log.txt
 pip-delete-this-directory.txt
 
+# mongo-restore
+*.tar
+*.agz
+
 # Unit test / coverage reports
 htmlcov/
 .tox/
@@ -55,6 +59,8 @@ coverage.xml
 *.mo
 *.pot
 
+
+
 # Django stuff:
 *.log
 local_settings.py
@@ -103,6 +109,7 @@ celerybeat.pid
 
 # Environments
 .env
+.env.localhost
 .venv
 env/
 venv/
diff --git a/metadata-translation/notebooks/ghissue_401_sparql.ipynb b/metadata-translation/notebooks/ghissue_401_sparql.ipynb
new file mode 100644
index 00000000..db64cd84
--- /dev/null
+++ b/metadata-translation/notebooks/ghissue_401_sparql.ipynb
@@ -0,0 +1,1005 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "51ea05af-7579-43ad-aa9c-3bf8b6da8fdb",
+   "metadata": {},
+   "source": [
+    "# Pipeline to transform the set of nmdc-schema-compliant mongodb collections to an RDF dataset amenable to SPARQL queries."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0675b9ba-c8be-478a-8c72-6edf10f56d8b",
+   "metadata": {},
+   "source": [
+    "## Setup\n",
+    "\n",
+    "Before running this notebook, make sure you have done the following:\n",
+    "- `make up-dev` has been run and mongo is mapped to `localhost:27018`\n",
+    "- a recent dump of the production mongo database has been loaded to `localhost:27018` (see `make mongorestore-nmdc-dev` for an example)\n",
+    "- .env has updated `MONGO_HOST` to `mongodb://localhost:27018`\n",
+    "- `export $(grep -v '^#' .env | xargs)` has been run in the shell before running `jupyter notebook`"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "a362b42f-7ae0-40cf-91d4-8f19ca1087cf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Ensure code changes in this notebook will be import-able without needing to restart the kernel and lose state\n",
+    "%load_ext autoreload\n",
+    "%autoreload 2"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3a456470-920d-4fd4-8040-e0bd3dcabff0",
+   "metadata": {},
+   "source": [
+    "Connect to local dockerized dev environment."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "55932d03-802f-4efe-bceb-e1036cd35567",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "MONGO_HOST=mongodb://localhost:27018\n"
+     ]
+    }
+   ],
+   "source": [
+    "!env | grep MONGO_HOST"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3a146763-f03a-4d65-baa0-81ca15cba689",
+   "metadata": {},
+   "source": [
+    "Initialize a db connection."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "edb1bb42-005c-49ca-ba59-18c24833f93f",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "success\n"
+     ]
+    }
+   ],
+   "source": [
+    "from nmdc_runtime.api.db.mongo import get_mongo_db\n",
+    "mdb = get_mongo_db()\n",
+    "print(\"success\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "37dbc9a8-8cac-4798-8d4f-ccbd9c3560e9",
+   "metadata": {},
+   "source": [
+    "Get all populated nmdc-schema collections with entity `id`s."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "3a0dd489-74cc-47c4-b3e0-c97dd88f5b5f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from nmdc_runtime.util import schema_collection_names_with_id_field\n",
+    "\n",
+    "populated_collections = sorted([\n",
+    "    name for name in set(schema_collection_names_with_id_field()) & set(mdb.list_collection_names())\n",
+    "    if mdb[name].estimated_document_count() > 0\n",
+    "])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f9a45de7-ba27-4b18-8ff4-9ba44eeb1091",
+   "metadata": {},
+   "source": [
+    "## Get a JSON-LD context for the NMDC Schema, to serialize documents to RDF"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "9ed72826-b552-4429-8ab5-9f7126821822",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "from pprint import pprint\n",
+    "\n",
+    "from linkml.generators.jsonldcontextgen import ContextGenerator\n",
+    "from nmdc_schema.nmdc_data import get_nmdc_schema_definition\n",
+    "\n",
+    "context = ContextGenerator(get_nmdc_schema_definition())\n",
+    "context = json.loads(context.serialize())[\"@context\"]\n",
+    "\n",
+    "for k, v in list(context.items()):\n",
+    "    if isinstance(v, dict): #and v.get(\"@type\") == \"@id\":\n",
+    "        v.pop(\"@id\", None) # use nmdc uri, not e.g. MIXS uri"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0800c5b9-d09e-4be1-899d-62fcf40a2c0e",
+   "metadata": {},
+   "source": [
+    "Ensure `nmdc:type` has a `URIRef` range, i.e. `nmdc:type a owl:ObjectProperty`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "62a68c07-0706-4300-a48d-0ab628af87b1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "context['type'] = {'@type': '@id'}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "63fe4d54-0a41-4170-9310-45e5f47a6cb5",
+   "metadata": {},
+   "source": [
+    "## Initialize an in-memory graph to store triples, prior to serializing to disk"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "648b4f70-34d6-4c70-8d0a-ef76e7e5d96d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from rdflib import Graph\n",
+    "\n",
+    "g = Graph()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "05cb8fd0-b847-49fc-a472-a8df2426168a",
+   "metadata": {},
+   "source": [
+    "Define a helper function to speed up triplification process."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "4d802017-2a7e-4614-b662-6a0cc027b8bc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def split_chunk(seq, n: int):\n",
+    "    \"\"\"\n",
+    "    Split sequence into chunks of length n. Do not pad last chunk.\n",
+    "    \n",
+    "    >>> list(split_chunk(list(range(10)), 3))\n",
+    "    [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]\n",
+    "    \"\"\"\n",
+    "    for i in range(0, len(seq), n):\n",
+    "        yield seq[i : i + n]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "dfd91d37-b1c7-46ab-b30d-de80132ec091",
+   "metadata": {},
+   "source": [
+    "Define a helper function to ensure each doc has exactly one type."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "86ff7261-e255-415d-a589-67637292dbdd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from nmdc_runtime.util import collection_name_to_class_names\n",
+    "from toolz import assoc  # imported here so this cell does not rely on a later cell's import\n",
+    "\n",
+    "def ensure_type(doc, collection_name):\n",
+    "    \"\"\"Return `doc` with a `type` field, inferring it from `collection_name` if it is absent.\"\"\"\n",
+    "    if \"type\" in doc:\n",
+    "        return doc\n",
+    "\n",
+    "    class_names = collection_name_to_class_names[collection_name]\n",
+    "    if len(class_names) != 1:  # zero or several candidate classes: nothing safe to infer\n",
+    "        raise Exception(\"cannot unambiguously infer class of document\")\n",
+    "    return assoc(doc, \"type\", class_names[0])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7eedd442-0f26-4829-a878-cf066b3a3912",
+   "metadata": {},
+   "source": [
+    "## Ingest mongo docs to in-memory graph \n",
+    "Uses `rdflib` JSON-LD parsing"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4251e0b1-35dc-4f40-91e7-b9bc0d9d79e1",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "d99c33f951874aea9a4f325086bde0d0",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "  0%|          | 0/124 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "loading biosample_set collection\n",
+      "loading data_object_set collection\n",
+      "loading extraction_set collection\n",
+      "loading field_research_site_set collection\n",
+      "loading library_preparation_set collection\n",
+      "loading mags_activity_set collection\n",
+      "loading metabolomics_analysis_activity_set collection\n",
+      "loading metagenome_annotation_activity_set collection\n",
+      "loading metagenome_assembly_set collection\n",
+      "loading metagenome_sequencing_activity_set collection\n",
+      "loading metaproteomics_analysis_activity_set collection\n"
+     ]
+    }
+   ],
+   "source": [
+    "from toolz import assoc, dissoc\n",
+    "from tqdm.notebook import tqdm\n",
+    "\n",
+    "chunk_size = 2_000\n",
+    "\n",
+    "# setup for progress bar: one tick per chunk, sized with the same `chunk_size` used below\n",
+    "total = sum((1 + mdb[name].estimated_document_count() // chunk_size) for name in populated_collections)\n",
+    "pbar = tqdm(total=total)\n",
+    "\n",
+    "for collection_name in populated_collections:\n",
+    "    print(f\"loading {collection_name} collection\")\n",
+    "    # dissociate mongo-generated `_id` field\n",
+    "    docs = [dissoc(doc, \"_id\") for doc in mdb[collection_name].find()]\n",
+    "    # split collection docs into chunks\n",
+    "    chunks = list(split_chunk(docs, chunk_size))\n",
+    "    \n",
+    "    for chunk in chunks:\n",
+    "        # ensure each doc in chunk is typed\n",
+    "        typed_chunk = [ensure_type(doc, collection_name) for doc in chunk]\n",
+    "        # wrap the typed docs (not the raw chunk) in a JSON-LD document\n",
+    "        doc_jsonld = {\"@context\": context, \"@graph\": typed_chunk}\n",
+    "        # parse the JSON-LD document into Graph `g`\n",
+    "        g.parse(data=json.dumps(doc_jsonld), format='json-ld')\n",
+    "        pbar.update(1)\n",
+    "print(f\"{len(g):,} triples loaded\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7140ef42-f94c-45c5-a0c1-31b05718aa4f",
+   "metadata": {},
+   "source": [
+    "Correct URIs that end with newlines, which would otherwise break graph serialization."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "ba832848-2cc9-4d1d-bf5f-966a73e26658",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "fe36373f43ab43fc85fa302d32fc40cb",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "  0%|          | 0/6348584 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "from rdflib import Namespace, RDF, Literal, URIRef\n",
+    "\n",
+    "NMDC = Namespace(\"https://w3id.org/nmdc/\")\n",
+    "\n",
+    "def strip_trailing_newlines(term):\n",
+    "    \"\"\"Return URIRef `term` without trailing CR/LF characters; return any other term unchanged.\"\"\"\n",
+    "    return URIRef(str(term).rstrip(\"\\r\\n\")) if isinstance(term, URIRef) else term\n",
+    "\n",
+    "fixes = []  # collected first so the graph is not modified while it is being iterated over\n",
+    "for s, p, o in tqdm(g, total=len(g)):\n",
+    "    fixed = (strip_trailing_newlines(s), p, strip_trailing_newlines(o))\n",
+    "    if fixed != (s, p, o):  # subject and/or object changed: replace the triple exactly once\n",
+    "        fixes.append(((s, p, o), fixed))\n",
+    "for old, new in fixes:\n",
+    "    g.remove(old)\n",
+    "    g.add(new)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "71893efc-8e19-465e-a33d-3fe6ee475e05",
+   "metadata": {},
+   "source": [
+    "## Connect Schema-Collection Entities\n",
+    "Given a schema-collection entity (i.e. one with an `id` and its own mongo document), we want to easily find all other schema-collection entities to which it connects, via any slot.\n",
+    "\n",
+    "To do this, we first gather all schema classes that are the type of a schema-collection entity, as well as these classes' ancestors."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "id": "831cbf19-8331-4f2d-814c-89d86d060029",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from linkml_runtime.utils.schemaview import SchemaView\n",
+    "\n",
+    "from nmdc_runtime.util import nmdc_schema_view, nmdc_database_collection_instance_class_names\n",
+    "\n",
+    "schema_view = nmdc_schema_view()\n",
+    "toplevel_classes = set()\n",
+    "for name in nmdc_database_collection_instance_class_names():\n",
+    "    toplevel_classes |= set(schema_view.class_ancestors(name))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "acdc7a8c-a104-4ac4-b105-0daeaba598a4",
+   "metadata": {},
+   "source": [
+    "Next, we determine which slots have such a \"top-level\" class as their range."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "id": "d402b739-4ab8-4d93-b00f-76f677313c66",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'was_generated_by', 'was_informed_by', 'metagenome_annotation_id', 'has_output', 'part_of', 'collected_from', 'has_input'}\n"
+     ]
+    }
+   ],
+   "source": [
+    "slots = schema_view.all_slots()\n",
+    "\n",
+    "toplevel_entity_connectors = set()\n",
+    "for k, v in context.items():\n",
+    "    if isinstance(v, dict) and \"@type\" in v and v[\"@type\"] == \"@id\":\n",
+    "        if slots[k].range in toplevel_classes and slots[k].domain != \"Database\":\n",
+    "            toplevel_entity_connectors.add(k)\n",
+    "print(toplevel_entity_connectors)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "40e58127-013e-40e2-a839-c9317e14c488",
+   "metadata": {},
+   "source": [
+    "Let's construct an entity-relationship diagram to visualize relationships."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "id": "c99cdd8d-5fd2-44eb-9090-af6f51770fbd",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "classDiagram\n",
+      "\n",
+      "NamedThing --> Activity : was_generated_by\n",
+      "Activity --> Activity : was_informed_by\n",
+      "FunctionalAnnotationAggMember --> WorkflowExecutionActivity : metagenome_annotation_id\n",
+      "NamedThing --> NamedThing : has_output\n",
+      "NamedThing --> NamedThing : part_of\n",
+      "Biosample --> FieldResearchSite : collected_from\n",
+      "NamedThing --> NamedThing : has_input\n",
+      "\n",
+      "MaterialEntity <|-- FieldResearchSite\n",
+      "Activity <|-- MetaproteomicsAnalysisActivity\n",
+      "NamedThing <|-- Site\n",
+      "NamedThing <|-- DataObject\n",
+      "NamedThing <|-- FieldResearchSite\n",
+      "MaterialEntity <|-- Site\n",
+      "Activity <|-- MetatranscriptomeActivity\n",
+      "NamedThing <|-- LibraryPreparation\n",
+      "WorkflowExecutionActivity <|-- MagsAnalysisActivity\n",
+      "NamedThing <|-- PlannedProcess\n",
+      "WorkflowExecutionActivity <|-- ReadBasedTaxonomyAnalysisActivity\n",
+      "Activity <|-- MetagenomeAssembly\n",
+      "WorkflowExecutionActivity <|-- NomAnalysisActivity\n",
+      "PlannedProcess <|-- Extraction\n",
+      "PlannedProcess <|-- LibraryPreparation\n",
+      "PlannedProcess <|-- Pooling\n",
+      "MaterialEntity <|-- ProcessedSample\n",
+      "BiosampleProcessing <|-- LibraryPreparation\n",
+      "NamedThing <|-- Biosample\n",
+      "NamedThing <|-- Pooling\n",
+      "NamedThing <|-- Extraction\n",
+      "Activity <|-- MagsAnalysisActivity\n",
+      "NamedThing <|-- MaterialEntity\n",
+      "MaterialEntity <|-- Biosample\n",
+      "WorkflowExecutionActivity <|-- ReadQcAnalysisActivity\n",
+      "NamedThing <|-- ProcessedSample\n",
+      "WorkflowExecutionActivity <|-- MetagenomeAnnotationActivity\n",
+      "NamedThing <|-- CollectingBiosamplesFromSite\n",
+      "NamedThing <|-- BiosampleProcessing\n",
+      "Activity <|-- NomAnalysisActivity\n",
+      "WorkflowExecutionActivity <|-- MetagenomeSequencingActivity\n",
+      "WorkflowExecutionActivity <|-- MetagenomeAssembly\n",
+      "WorkflowExecutionActivity <|-- MetatranscriptomeActivity\n",
+      "Activity <|-- ReadBasedTaxonomyAnalysisActivity\n",
+      "Activity <|-- MetagenomeAnnotationActivity\n",
+      "Activity <|-- WorkflowExecutionActivity\n",
+      "Site <|-- FieldResearchSite\n",
+      "BiosampleProcessing <|-- Pooling\n",
+      "PlannedProcess <|-- CollectingBiosamplesFromSite\n",
+      "Activity <|-- MetagenomeSequencingActivity\n",
+      "PlannedProcess <|-- BiosampleProcessing\n",
+      "WorkflowExecutionActivity <|-- MetabolomicsAnalysisActivity\n",
+      "WorkflowExecutionActivity <|-- MetaproteomicsAnalysisActivity\n",
+      "NamedThing <|-- OmicsProcessing\n",
+      "Activity <|-- MetabolomicsAnalysisActivity\n",
+      "NamedThing <|-- Study\n",
+      "Activity <|-- ReadQcAnalysisActivity\n",
+      "PlannedProcess <|-- OmicsProcessing\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(\"classDiagram\\n\")\n",
+    "for slot_name in sorted(toplevel_entity_connectors):\n",
+    "    slot = slots[slot_name]\n",
+    "    domain = slot.domain or \"NamedThing\"\n",
+    "    # do not bind a notebook-level `range`: it would shadow the builtin that `split_chunk` calls\n",
+    "    print(f\"{domain} --> {slot.range} : {slot_name}\")\n",
+    "\n",
+    "print()\n",
+    "\n",
+    "inheritance_links = set()\n",
+    "for cls in toplevel_classes:\n",
+    "    ancestors = schema_view.class_ancestors(cls)\n",
+    "    for a in ancestors:\n",
+    "        if a != cls:\n",
+    "            inheritance_links.add(f\"{a} <|-- {cls}\")\n",
+    "\n",
+    "for link in sorted(inheritance_links):  # sorted so the diagram is identical from run to run\n",
+    "    print(link)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "63cb2cc8-ef99-4d5f-9ddf-9eb2949e9c06",
+   "metadata": {},
+   "source": [
+    "### Assert a common `depends_on` relation for all entities connected by `toplevel_entity_connectors`\n",
+    "This allows us to traverse the graph of top-level entities without needing to specify any specific slot names."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "id": "cc830d77-5ac2-482e-a4f9-dc2eed3f2ef9",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "d7bb9d2404eb41159d8d03d895fa66ed",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "  0%|          | 0/15851994 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "16,125,596 triples in total\n"
+     ]
+    }
+   ],
+   "source": [
+    "from rdflib import PROV\n",
+    "\n",
+    "for s, p, o in tqdm(g, total=len(g)):\n",
+    "    if (connector := p.removeprefix(str(NMDC))) in toplevel_entity_connectors:\n",
+    "        if connector == \"has_output\":\n",
+    "            g.add((o, NMDC.depends_on, s))\n",
+    "        else:\n",
+    "            g.add((s, NMDC.depends_on, o))\n",
+    "\n",
+    "print(f\"{len(g):,} triples in total\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4b3dd01c-0f20-40c6-9066-793c9d33b901",
+   "metadata": {},
+   "source": [
+    "### Materialize superclass relations\n",
+    "We want each entity to be associated with its own class and all the classes that its class inherits from. For example an entity of type `Biosample` should also be of type `NamedThing`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "id": "75db4baf-369b-47af-974b-f5298470ad7f",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "7d41e5fe31fd423cb17f9f0cca75ab72",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "  0%|          | 0/16349744 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "schema_view = nmdc_schema_view()\n",
+    "toplevel_classes = set()\n",
+    "\n",
+    "# collect top-level classes and their ancestors as NMDC terms (rebinds `toplevel_classes`)\n",
+    "for name in nmdc_database_collection_instance_class_names():\n",
+    "    toplevel_classes |= set(getattr(NMDC, a) for a in schema_view.class_ancestors(name))\n",
+    "\n",
+    "# for each triple (s, p, o) in Graph, add all triples (s, p, o') where o' is a class ancestor of o.\n",
+    "for s, p, o in tqdm(g, total=len(g)):\n",
+    "    # get the local predicate name (eg mdb slot name) for that triple\n",
+    "    p_localname = p.removeprefix(str(NMDC))\n",
+    "    # skip triples whose predicate is not `type`; only type assertions are expanded here\n",
+    "    if p_localname != \"type\":\n",
+    "        continue\n",
+    "    # skip triple if the object is not a top-level class\n",
+    "    if o not in toplevel_classes:\n",
+    "        continue\n",
+    "    # for each triple where the object is a top-level class,\n",
+    "    # for each `class_ancestor` associated with that top-level class,\n",
+    "    # add the triple (s, `NMDC.type`, `class_ancestor`)\n",
+    "    for a in schema_view.class_ancestors(o.removeprefix(str(NMDC))):\n",
+    "        # print(f\"{a=}\")\n",
+    "        t = (s, NMDC.type, getattr(NMDC,a))\n",
+    "        # pprint(f\"{t=}\")\n",
+    "        g.add(t)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "092657c9-864c-4978-814b-6f587e92887d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Sanity check that we have the right number of ActivitySet records."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "id": "067a53a9-9220-4ee2-bcce-12d6007dab47",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "14889"
+      ]
+     },
+     "execution_count": 32,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "len([t for t in g.subjects(NMDC.type, NMDC.Activity)])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "91171cf6-f435-4815-970f-a67f51254997",
+   "metadata": {},
+   "source": [
+    "## Serialize and store as gzipped N-Triples file.\n",
+    "This can take a few minutes..."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 35,
+   "id": "125d2ad4-8433-45d8-86c4-d6a619ea5280",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "serializing Graph and writing to file...\n",
+      "success!\n"
+     ]
+    }
+   ],
+   "source": [
+    "import gzip\n",
+    "\n",
+    "with gzip.open('data/nmdc-db.nt.gz', 'wb') as f:\n",
+    "    print(\"Serializing graph and writing to file...\") \n",
+    "    f.write(g.serialize(format='nt').encode())\n",
+    "    print(\"Success!\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "48e6d45e-0262-4b3c-982c-478377184c2b",
+   "metadata": {},
+   "source": [
+    "## Load data into a dockerized fuseki server\n",
+    "\n",
+    "1. Add the following to `/nmdc-runtime/docker-compose.yaml`.\n",
+    "\n",
+    "```yml\n",
+    "  fuseki:\n",
+    "    container_name: fuseki\n",
+    "    build:\n",
+    "      dockerfile: nmdc_runtime/fuseki.Dockerfile\n",
+    "      context: .\n",
+    "    ports:\n",
+    "      - \"3030:3030\" # modify port if you already have a service running on localhost:3030\n",
+    "    volumes:\n",
+    "      - ./nmdc_runtime/site/fuseki/fuseki-config.ttl:/configuration/fuseki-config.ttl\n",
+    "      - ./nmdc_runtime/site/fuseki/shiro.ini:/fuseki/run/shiro.ini\n",
+    "      - nmdc_runtime_fuseki_data:/fuseki-base\n",
+    "\n",
+    "volumes:\n",
+    "  nmdc_runtime_fuseki_data:\n",
+    "    driver: local\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3bbf838c-70ec-48e6-ad31-55c61f196195",
+   "metadata": {},
+   "source": [
+    "2. Add the following to `/nmdc-runtime/nmdc_runtime/fuseki.Dockerfile`\n",
+    "\n",
+    "```Dockerfile\n",
+    "# Use an appropriate base image that includes Java and wget\n",
+    "FROM openjdk:11-jre-slim\n",
+    "\n",
+    "# Set environment variables\n",
+    "ENV FUSEKI_VERSION 4.9.0\n",
+    "ENV FUSEKI_HOME /fuseki\n",
+    "\n",
+    "# Install wget\n",
+    "RUN apt-get update && apt-get install -y wget && rm -rf /var/lib/apt/lists/*\n",
+    "\n",
+    "# Download and extract Fuseki\n",
+    "RUN wget -qO- https://archive.apache.org/dist/jena/binaries/apache-jena-fuseki-$FUSEKI_VERSION.tar.gz | tar xvz -C / && \\\n",
+    "    mv /apache-jena-fuseki-$FUSEKI_VERSION $FUSEKI_HOME\n",
+    "\n",
+    "# Expose the default port\n",
+    "EXPOSE 3030\n",
+    "\n",
+    "# Download and extract Jena Commands\n",
+    "RUN wget -qO- https://archive.apache.org/dist/jena/binaries/apache-jena-$FUSEKI_VERSION.tar.gz | tar xvz -C / && \\\n",
+    "    mv /apache-jena-$FUSEKI_VERSION $FUSEKI_HOME\n",
+    "\n",
+    "# Copy the Fuseki configuration file to the container\n",
+    "COPY ./nmdc_runtime/site/fuseki/fuseki-config.ttl $FUSEKI_HOME/configuration/\n",
+    "COPY ./nmdc_runtime/site/fuseki/shiro.ini $FUSEKI_HOME/run/\n",
+    "\n",
+    "# Set working directory\n",
+    "WORKDIR $FUSEKI_HOME\n",
+    "\n",
+    "# Command to start Fuseki server with preloaded data\n",
+    "CMD [\"./fuseki-server\", \"--config\", \"configuration/fuseki-config.ttl\"]\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b96560c3-a531-4f8f-be35-6e1a911a90ac",
+   "metadata": {},
+   "source": [
+    "3. Add the following to `/nmdc-runtime/nmdc_runtime/site/fuseki/shiro.ini`\n",
+    "```ini\n",
+    "[main]\n",
+    "localhost=org.apache.jena.fuseki.authz.LocalhostFilter\n",
+    "\n",
+    "[urls]\n",
+    "## Control functions open to anyone\n",
+    "/$/server = anon\n",
+    "/$/ping   = anon\n",
+    "/$/stats = anon\n",
+    "/$/stats/* = anon\n",
+    "## NOTE: the rest is also open to anyone (local development only); use `localhost` instead of `anon` to restrict\n",
+    "/$/** = anon\n",
+    "/**=anon\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c5d8c0a2-6f75-4dac-9bb9-ac48838ad2b8",
+   "metadata": {},
+   "source": [
+    "4. Add the following to `/nmdc-runtime/nmdc_runtime/site/fuseki/fuseki-config.ttl`\n",
+    "```ttl\n",
+    "@prefix afn: <http://jena.apache.org/ARQ/function#> .\n",
+    "@prefix fuseki: <http://jena.apache.org/fuseki#> .\n",
+    "@prefix ja: <http://jena.hpl.hp.com/2005/11/Assembler#> .\n",
+    "@prefix nmdc: <https://w3id.org/nmdc/> .\n",
+    "@prefix owl: <http://www.w3.org/2002/07/owl#> .\n",
+    "@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .\n",
+    "@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .\n",
+    "@prefix tdb: <http://jena.hpl.hp.com/2008/tdb#> .\n",
+    "@prefix xs: <http://www.w3.org/2001/XMLSchema#> .\n",
+    "\n",
+    "<https://api.microbiomedata.org/fuseki/#baseModel>\n",
+    "\ta tdb:GraphTDB ;\n",
+    "\ttdb:dataset <https://api.microbiomedata.org/fuseki/#tdbDataset> ;\n",
+    "\t.\n",
+    "\n",
+    "<https://api.microbiomedata.org/fuseki/#dataset>\n",
+    "\ta ja:RDFDataset ;\n",
+    "\tja:defaultGraph <https://api.microbiomedata.org/fuseki/#inferenceModel> ;\n",
+    "\t.\n",
+    "\n",
+    "<https://api.microbiomedata.org/fuseki/#inferenceModel>\n",
+    "\ta ja:InfModel ;\n",
+    "\tja:baseModel <https://api.microbiomedata.org/fuseki/#baseModel> ;\n",
+    "\tja:reasoner [\n",
+    "\t\tja:reasonerURL <http://jena.hpl.hp.com/2003/TransitiveReasoner> ;\n",
+    "\t] ;\n",
+    "\t.\n",
+    "\n",
+    "<https://api.microbiomedata.org/fuseki/#nmdc>\n",
+    "\ta fuseki:Service ;\n",
+    "\tfuseki:dataset <https://api.microbiomedata.org/fuseki/#dataset> ;\n",
+    "\tfuseki:name \"nmdc\" ;\n",
+    "\tfuseki:serviceQuery\n",
+    "\t\t\"query\" ,\n",
+    "\t\t\"sparql\"\n",
+    "\t\t;\n",
+    "\tfuseki:serviceReadWriteGraphStore \"data\" ;\n",
+    "\tfuseki:serviceUpdate \"update\" ;\n",
+    "\tfuseki:serviceUpload \"upload\" ;\n",
+    "\t.\n",
+    "\n",
+    "<https://api.microbiomedata.org/fuseki/#tdbDataset>\n",
+    "\ta tdb:DatasetTDB ;\n",
+    "\tja:context [\n",
+    "\t\trdfs:comment \"Query timeout on this dataset: 10s.\" ;\n",
+    "\t\tja:cxtName \"arq:queryTimeout\" ;\n",
+    "\t\tja:cxtValue \"10000\" ;\n",
+    "\t] ;\n",
+    "\ttdb:location \"/fuseki-base/nmdc-db.tdb\" ;\n",
+    "\t.\n",
+    "\n",
+    "[]\n",
+    "\ta fuseki:Server ;\n",
+    "\tfuseki:services (\n",
+    "\t\t<https://api.microbiomedata.org/fuseki/#nmdc>\n",
+    "\t) ;\n",
+    "\t.\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e1062b76-b7dc-4693-b5ad-91aa9aed490b",
+   "metadata": {},
+   "source": [
+    "5. Spin up a `fuseki` container. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 43,
+   "id": "ea0bdeee-6b3a-4074-bd73-cc9424569346",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001b[1A\u001b[1B\u001b[0G\u001b[?25l[+] Running 1/0\n",
+      " \u001b[32m✔\u001b[0m Container fuseki  \u001b[32mRunning\u001b[0m                                               \u001b[34m0.0s \u001b[0m\n",
+      "\u001b[?25h"
+     ]
+    }
+   ],
+   "source": [
+    "!docker compose up fuseki -d"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "79284de7-ef52-47c6-aeb1-1453bd4b5f59",
+   "metadata": {},
+   "source": [
+    "Wipe any existing persisted data, and copy new RDF data into the `fuseki` container.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 44,
+   "id": "9037026c-2653-43e3-bb92-2a0eea85b213",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001b[sPreparing to copy...\u001b[?25l\u001b[u\u001b[2KCopying to container - 0B\u001b[24G\u001b[0K14.5MB\u001b[24G\u001b[0K31.1MB\u001b[24G\u001b[0K46.9MB\u001b[24G\u001b[0K64.2MB\u001b[24G\u001b[0K81.1MB\u001b[24G\u001b[0K96.7MB\u001b[24G\u001b[0K109MB\u001b[24G\u001b[0K123MB\u001b[24G\u001b[0K139MB\u001b[24G\u001b[0K147MB\u001b[24G\u001b[0K156MB\u001b[24G\u001b[0K173MB\u001b[24G\u001b[0K190MB\u001b[24G\u001b[0K206MB\u001b[24G\u001b[0K217MB\u001b[24G\u001b[0K232MB\u001b[24G\u001b[0K247MB\u001b[24G\u001b[0K265MB\u001b[24G\u001b[0K280MB\u001b[24G\u001b[0K298MB\u001b[24G\u001b[0K312MB\u001b[24G\u001b[0K317MB\u001b[24G\u001b[0K337MB\u001b[24G\u001b[0K354MB\u001b[24G\u001b[0K373MB\u001b[24G\u001b[0K393MB\u001b[24G\u001b[0K407MB\u001b[24G\u001b[0K426MB\u001b[24G\u001b[0K442MB\u001b[24G\u001b[0K457MB\u001b[24G\u001b[0K475MB\u001b[24G\u001b[0K492MB\u001b[?25h\u001b[u\u001b[2KSuccessfully copied 502MB to fuseki:/fuseki-base/\n"
+     ]
+    }
+   ],
+   "source": [
+    "!docker exec fuseki rm -rf /fuseki-base/nmdc-db.tdb\n",
+    "!docker cp data/nmdc-db.nt.gz fuseki:/fuseki-base/"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4dca86f8-6752-4aba-8d3c-656810f3af3f",
+   "metadata": {},
+   "source": [
+    "Take server down in order to bulk-load data."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "16f0621c-cf98-4a27-9165-7a0a8711db77",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!docker compose down fuseki"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "fa4f9843-d5c0-4f8d-bcaf-ad2cf50c0264",
+   "metadata": {},
+   "source": [
+    "Bulk-load data."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a490caff-af8a-4537-8c0b-e4a4752645bc",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "!docker compose run fuseki ./apache-jena-4.9.0/bin/tdbloader --loc /fuseki-base/nmdc-db.tdb /fuseki-base/nmdc-db.nt.gz"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "69d0e50c-102a-4a8e-9bcd-ef23600afd66",
+   "metadata": {},
+   "source": [
+    "Start up server."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1a0bfb4b-e694-40b1-88af-4446e3fcc888",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!docker compose up fuseki -d"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8e528d6a-76b1-4629-82a1-58793ad6a481",
+   "metadata": {},
+   "source": [
+    "Now go to <http://localhost:3030/#/dataset/nmdc/query> and SPARQL it up."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8695001d-9722-48a0-98e8-9ac5000551ea",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 2024-03-14T09:40 : took <4min to run all the above."
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.13"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/metadata-translation/notebooks/repl_validation_referential_integrity-1715162638.ipynb b/metadata-translation/notebooks/repl_validation_referential_integrity-1715162638.ipynb
new file mode 100644
index 00000000..6d46e46b
--- /dev/null
+++ b/metadata-translation/notebooks/repl_validation_referential_integrity-1715162638.ipynb
@@ -0,0 +1,686 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "2a66b2dc",
+   "metadata": {
+    "jp-MarkdownHeadingCollapsed": true
+   },
+   "source": [
+    "# Referential integrity checker (prototype)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c892eac06fb1a86a",
+   "metadata": {},
+   "source": [
+    "## Prerequisites\n",
+    "\n",
+    "Before running this notebook, make sure you have done the following:\n",
+    "\n",
+    "1. Run `$ make up-dev`\n",
+    "2. Map `localhost:27018` to the Mongo server you want to use\n",
+    "3. Load a recent dump of the production Mongo database into that Mongo server (see `$ make mongorestore-nmdc-dev` for an example)\n",
+    "4. In the `.env` file, set `MONGO_HOST` to `mongodb://localhost:27018`\n",
+    "5. Run `$ export $(grep -v '^#' .env | xargs)` to load the environment variables defined in `.env` into your shell environment\n",
+    "\n",
+    "Once you've done all of those things, you can run this notebook (e.g. via `$ jupyter notebook`).\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8f03ce22",
+   "metadata": {},
+   "source": [
+    "## Enable automatic reloading of modules\n",
+    "\n",
+    "Reference: https://ipython.readthedocs.io/en/stable/config/extensions/autoreload.html#autoreload"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "f1c8bdb5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Ensure code changes in this notebook will be import-able  \n",
+    "# without needing to restart the kernel and lose state\n",
+    "%load_ext autoreload\n",
+    "%autoreload 2"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5121e612",
+   "metadata": {},
+   "source": [
+    "## Import Python modules"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "f7ff0664-1881-4eca-b018-4c5856dc2489",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "from linkml_runtime.utils.schemaview import SchemaView\n",
+    "from toolz import dissoc, assoc\n",
+    "from tqdm.notebook import tqdm\n",
+    "\n",
+    "from nmdc_runtime.api.db.mongo import get_mongo_db, nmdc_schema_collection_names\n",
+    "from nmdc_runtime.util import collection_name_to_class_names, nmdc_schema_view, nmdc_database_collection_instance_class_names\n",
+    "from nmdc_schema.nmdc_schema_accepting_legacy_ids import Database as NMDCDatabase\n",
+    "from nmdc_schema.get_nmdc_view import ViewGetter\n",
+    "\n",
+    "mdb = get_mongo_db()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "bcb5802b-8205-49b7-8784-dc137baff1a0",
+   "metadata": {},
+   "source": [
+    "## \"Pre-clean\" the data"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8ecb1950-eaec-469c-b7ac-949650825093",
+   "metadata": {},
+   "source": [
+    "Determine the name of each Mongo collection in which at least one document has a field named `id`.\n",
+    "\n",
+    "> **TODO:** Documents in the [`functional_annotation_agg` collection](https://microbiomedata.github.io/nmdc-schema/FunctionalAnnotationAggMember/) do not have a field named `id`, and so will not be included here. Document the author's rationale for omitting it.\n",
+    "\n",
+    "> **TODO:** The `nmdc_schema_collection_names` function combines the collection names in Mongo with the Database slots in the schema, and then omits some collection names. Document why the author took that approach."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "dde4c77e-5e06-4751-930a-95906cdf89c5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "collection_names = sorted(nmdc_schema_collection_names(mdb))\n",
+    "collection_names = [n for n in collection_names if mdb[n].find_one({\"id\": {\"$exists\": True}})]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "cddaaa54-262d-4549-a9a9-4c280a6a6341",
+   "metadata": {},
+   "source": [
+    "### Remove fields that contain null\n",
+    "\n",
+    "Remove specific fields from specific documents in the above collections, if the field's name appears in our hard-coded list (see the cell below for the list) and — in that document — the field consists of a null value.\n",
+    "\n",
+    "> **TODO:** Document how the author obtained this list and whether the list would require maintenance over time."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "b71ba7d2-ebd2-487d-a5cc-2a85ee14cb95",
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "NameError",
+     "evalue": "name 'tqdm' is not defined",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[1], line 4\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[38;5;66;03m# check these slots for null values for all docs in collection_names\u001b[39;00m\n\u001b[1;32m      2\u001b[0m props \u001b[38;5;241m=\u001b[39m [\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mused\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mgit_url\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mwas_associated_with\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mwas_generated_by\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcompression_type\u001b[39m\u001b[38;5;124m\"\u001b[39m, \n\u001b[1;32m      3\u001b[0m          \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmetagenome_annotation_id\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmetaproteomic_analysis_id\u001b[39m\u001b[38;5;124m\"\u001b[39m] \n\u001b[0;32m----> 4\u001b[0m pbar \u001b[38;5;241m=\u001b[39m \u001b[43mtqdm\u001b[49m(total\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mlen\u001b[39m(collection_names))\n\u001b[1;32m      5\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m p \u001b[38;5;129;01min\u001b[39;00m props:\n\u001b[1;32m      6\u001b[0m     \u001b[38;5;28;01mfor\u001b[39;00m coll_name \u001b[38;5;129;01min\u001b[39;00m collection_names:\n",
+      "\u001b[0;31mNameError\u001b[0m: name 'tqdm' is not defined"
+     ]
+    }
+   ],
+   "source": [
+    "# Check these slots for null values in every doc of every collection in `collection_names`.\n",
+    "props = [\"used\", \"git_url\", \"was_associated_with\", \"was_generated_by\", \"compression_type\",\n",
+    "         \"metagenome_annotation_id\", \"metaproteomic_analysis_id\"]\n",
+    "\n",
+    "# The loops tick once per (slot, collection) pair, so size the bar to match; `with` closes it afterwards.\n",
+    "with tqdm(total=len(props) * len(collection_names)) as pbar:\n",
+    "    for p in props:\n",
+    "        for coll_name in collection_names:\n",
+    "            pbar.set_description(f\"checking {coll_name}...\")\n",
+    "            # The {$type: 10} query matches for BSON Type Null, not just value `null`\n",
+    "            docs_broken = list(mdb[coll_name].find({p: {\"$type\": 10}}, [\"id\"]))\n",
+    "            if docs_broken:\n",
+    "                print(f\"removing {len(docs_broken)} null-valued {p} values for {coll_name}...\")\n",
+    "                mdb[coll_name].update_many(\n",
+    "                    {\"id\": {\"$in\": [d[\"id\"] for d in docs_broken]}}, {\"$unset\": {p: None}}\n",
+    "                )\n",
+    "            pbar.update(1)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "21c2f771-b8da-466a-90e8-2c17ac5e6388",
+   "metadata": {},
+   "source": [
+    "## Materialize single-collection view of database"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "56d6c224-ec80-4ac9-9dcf-bf04b33a61f9",
+   "metadata": {},
+   "source": [
+    "Check assumption that every populated collection currently has documents of one type only.\n",
+    "\n",
+    "> **TODO:** The \"class_names\" part of the `collection_name_to_class_names` dictionary does not list _descendant_ classes, even though the schema will allow instances of descendant classes to reside in those collections. Document why disregarding descendant classes here is OK."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "59176b24-2854-4387-891f-a6be2ceca4f2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for name in collection_names:\n",
+    "    assert len(collection_name_to_class_names[name]) == 1"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5ed95ee0-03b7-4dff-80e7-92a2b24bccf4",
+   "metadata": {},
+   "source": [
+    "Define a helper function that takes a class instance and returns a list of the names of its own class and its ancestor classes."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "4470c52a-81e4-4511-b549-768c04c3b45d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def class_hierarchy_as_list(obj) -> list[str]:\n",
+    "    \"\"\"\n",
+    "    Returns a list consisting of the name of the class of the instance passed in,\n",
+    "    followed by the names of all of its ancestor classes (stopping at `YAMLRoot`,\n",
+    "    whose name is not included).\n",
+    "\n",
+    "    Names appear in depth-first order: a class, then each of its bases in turn.\n",
+    "\n",
+    "    TODO: Consider renaming function to be a verb; e.g. `get_class_hierarchy_as_list`.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    class_names = []\n",
+    "    pending = [obj.__class__]  # stack of classes still to visit\n",
+    "\n",
+    "    while pending:\n",
+    "        cls = pending.pop()\n",
+    "        if cls.__name__ == \"YAMLRoot\":  # do not record or climb past the root\n",
+    "            continue\n",
+    "        class_names.append(cls.__name__)\n",
+    "        # Push bases in reverse so that the first base is visited first.\n",
+    "        pending.extend(reversed(cls.__bases__))\n",
+    "\n",
+    "    return class_names"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b962e3c8-a346-49c5-8470-915f3cf9eb07",
+   "metadata": {},
+   "source": [
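+    "Materialize the `alldocs` collection: copy every document from the collections identified above into it, replacing each document's `type` field with a list of the names of its class and all of that class's ancestor classes.\n",
+    "\n",
+    "> **Note:** That list of class names is computed once per collection (from the collection's first document) and applied to every document copied from that collection."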
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b2e618f3-78b9-42b6-8ea9-63d080b1b0f6",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "e69c4fd820114e33b11ebae47f9f3e4d",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "  0%|          | 0/224995 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "refreshed `alldocs` collection\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Drop any existing `alldocs` collection (e.g. from previous use of this notebook).\n",
+    "mdb.alldocs.drop()\n",
+    "\n",
+    "# Set up progress bar\n",
+    "n_docs_total = sum(mdb[name].estimated_document_count() for name in collection_names)\n",
+    "pbar = tqdm(total=n_docs_total)\n",
+    "\n",
+    "# for each collection name\n",
+    "for coll_name in collection_names:\n",
+    "    pbar.set_description(f\"processing {coll_name}...\")\n",
+    "    # Instantiate the schema `Database` class with this collection's first doc (minus the mongo-generated '_id' field)\n",
+    "    try:\n",
+    "        nmdcdb = NMDCDatabase(**{coll_name: [dissoc(mdb[coll_name].find_one(), '_id')]})\n",
+    "    except ValueError as e:\n",
+    "        print(f\"no {coll_name}!\")\n",
+    "        raise e\n",
+    "\n",
+    "    # Calculate class_hierarchy_as_list once per collection.\n",
+    "    #\n",
+    "    # Note: This seems to assume that the class hierarchy is identical for each document\n",
+    "    #       in a given collection, which may not be the case since a collection whose\n",
+    "    #       range is a \"parent\" class can store instances of descendant classes (and the\n",
+    "    #       class hierarchy of the latter would differ from that of the former).\n",
+    "    #\n",
+    "    exemplar = getattr(nmdcdb, coll_name)[0]  # get first instance (i.e. document) in list\n",
+    "    newdoc_type: list[str] = class_hierarchy_as_list(exemplar)\n",
+    "    \n",
+    "    # For each document in this collection, replace the value of the `type` field with\n",
+    "    # a _list_ of the document's own class and ancestor classes, remove the `_id` field,\n",
+    "    # and insert the resulting document into the `alldocs` collection. Note that we are not\n",
+    "    # relying on the original value of the `type` field, since it's unreliable (see below).\n",
+    "    \n",
+    "    # NOTE: `type` is currently a string, does not exist for all classes, and can have typos.\n",
+    "    # These issues are fixed in the berkeley schema, but it is risky to use at this time.\n",
+    "\n",
+    "    # TODO: Consider omitting fields that neither (a) are the `id` field, nor (b) have the potential\n",
+    "    #       to reference a document. Those fields aren't related to referential integrity.\n",
+    "    \n",
+    "    mdb.alldocs.insert_many([assoc(dissoc(doc, 'type', '_id'), 'type', newdoc_type) for doc in mdb[coll_name].find()])\n",
+    "    pbar.update(mdb[coll_name].estimated_document_count())\n",
+    "\n",
+    "pbar.close()\n",
+    "\n",
+    "# Prior to re-ID-ing, some IDs are not unique across Mongo collections (eg nmdc:0078a0f981ad3f92693c2bc3b6470791)\n",
+    "# Re-idx for `alldocs` collection\n",
+    "mdb.alldocs.create_index(\"id\")\n",
+    "print(\"refreshed `alldocs` collection\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f0569fde",
+   "metadata": {},
+   "source": [
+    "The resulting `alldocs` collection contains a copy of every document from every Mongo collection identified earlier. The copy is the same as the original document, except that its `type` field contains a list of the names of its own class and all of its ancestor classes (whereas, the original document's `type` field contains an unreliable string)."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ca194c0f-7417-41d2-bea8-a5a54392fee6",
+   "metadata": {},
+   "source": [
+    "## Validate"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ab859bb2-808c-48e2-8412-d8a3a79ca4e8",
+   "metadata": {},
+   "source": [
+    "Collect \"top level\" (`nmdc:Database` slot range) classes.\n",
+    "\n",
+    "Reference: https://linkml.io/linkml/developers/schemaview.html#linkml_runtime.utils.schemaview.SchemaView.class_ancestors"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "a2dbaf22-46e9-4de7-8288-05bc8cd2e5f8",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'Activity',\n",
+       " 'Biosample',\n",
+       " 'BiosampleProcessing',\n",
+       " 'CollectingBiosamplesFromSite',\n",
+       " 'DataObject',\n",
+       " 'Extraction',\n",
+       " 'FieldResearchSite',\n",
+       " 'FunctionalAnnotation',\n",
+       " 'FunctionalAnnotationAggMember',\n",
+       " 'GenomeFeature',\n",
+       " 'LibraryPreparation',\n",
+       " 'MagsAnalysisActivity',\n",
+       " 'MaterialEntity',\n",
+       " 'MetabolomicsAnalysisActivity',\n",
+       " 'MetagenomeAnnotationActivity',\n",
+       " 'MetagenomeAssembly',\n",
+       " 'MetagenomeSequencingActivity',\n",
+       " 'MetaproteomicsAnalysisActivity',\n",
+       " 'MetatranscriptomeActivity',\n",
+       " 'NamedThing',\n",
+       " 'NomAnalysisActivity',\n",
+       " 'OmicsProcessing',\n",
+       " 'PlannedProcess',\n",
+       " 'Pooling',\n",
+       " 'ProcessedSample',\n",
+       " 'ReadBasedTaxonomyAnalysisActivity',\n",
+       " 'ReadQcAnalysisActivity',\n",
+       " 'Site',\n",
+       " 'Study',\n",
+       " 'WorkflowExecutionActivity'}"
+      ]
+     },
+     "execution_count": 14,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Schema view used for the class and slot lookups below.\n",
+    "nmdc_view = nmdc_schema_view()\n",
+    "\n",
+    "# TODO: Document why class _ancestors_ are being included here.\n",
+    "#       A (hypothetical) collection whose range is \"Chihuahua\" wouldn't\n",
+    "#       be allowed to store non-\"Chihuahua\" instances of \"Dog\" or \"Animal\".\n",
+    "toplevel_classes = set()\n",
+    "for name in nmdc_database_collection_instance_class_names():\n",
+    "    # Add everything `class_ancestors` returns for this class to the running set.\n",
+    "    toplevel_classes.update(nmdc_view.class_ancestors(name))\n",
+    "\n",
+    "toplevel_classes"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8645690e-7a9d-4f1e-8e62-0cbdde825890",
+   "metadata": {},
+   "source": [
+    "### Check referential integrity\n",
+    "\n",
+    "In the next cell, we populate two lists:\n",
+    "\n",
+    "- `errors[\"not_found\"]`: a list of \"naive\" errors (no document with the referenced `id` was found)\n",
+    "- `errors[\"invalid_type\"]`: a list of (hierarchy-aware) type errors (document was found, but is of an invalid type)\n",
+    "\n",
+    "Reference: https://linkml.io/linkml/developers/schemaview.html#linkml_runtime.utils.schemaview.SchemaView.class_induced_slots"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "103d70b6-24ab-41bd-8b7f-d2faaa028bdf",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "5b6ac6cb87b44c28aa65e77f28e5900f",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "  0%|          | 0/224995 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "# Initialize error lists.\n",
+    "errors = {\"not_found\": [], \"invalid_type\": []}\n",
+    "\n",
+    "# Initialize progress bar.\n",
+    "#\n",
+    "# TODO: Explain why the author has opted to count (and then—later—iterate over) the documents\n",
+    "#       in the original collections, even though the `alldocs` collection exists already.\n",
+    "#\n",
+    "n_docs_total = sum(mdb[name].estimated_document_count() for name in collection_names)\n",
+    "pbar = tqdm(total=n_docs_total)\n",
+    "\n",
+    "# Iterate over each collection name.\n",
+    "for name in sorted(collection_names):\n",
+    "    # Note: We already confirmed (in a different cell of this notebook)\n",
+    "    #       that each `class_names` list has exactly one item.\n",
+    "    cls_name = collection_name_to_class_names[name][0]\n",
+    "    \n",
+    "    # Make a dictionary of slot names to slot definitions. The set of slots here is (to quote the\n",
+    "    # LinkML SchemaView documentation) \"all slots that are asserted or inferred for [the] class,\n",
+    "    # with their inferred semantics.\"\n",
+    "    slot_map = {\n",
+    "        slot.name: slot\n",
+    "        for slot in nmdc_view.class_induced_slots(cls_name)\n",
+    "    }\n",
+    "    pbar.set_description(f\"processing {name}...\")\n",
+    "    \n",
+    "    # Iterate over each document (as a dictionary) in this collection.\n",
+    "    for doc in mdb[name].find():\n",
+    "        doc = dissoc(doc, \"_id\")\n",
+    "\n",
+    "        # Iterate over each key/value pair in the dictionary (document).\n",
+    "        for field, value in doc.items():\n",
+    "            assert field in slot_map, f\"{name} doc {doc['id']}: field {field} not a valid slot\"\n",
+    "            slot_range = str(slot_map[field].range)\n",
+    "            assert slot_range, type(slot_range)\n",
+    "            if not slot_range in toplevel_classes:\n",
+    "                continue\n",
+    "            if not isinstance(value, list):\n",
+    "                value = [value]\n",
+    "            for v in value:\n",
+    "                if mdb.alldocs.find_one({\"id\": v}, [\"_id\"]) is None:\n",
+    "                    errors[\"not_found\"].append(f\"{name} doc {doc['id']}: field {field} referenced doc {v} not found\")\n",
+    "                elif mdb.alldocs.find_one({\"id\": v, \"type\": slot_range}, [\"_id\"]) is None:\n",
+    "                    errors[\"invalid_type\"].append(f\"{name} doc {doc['id']}: field {field} referenced doc {v} not of type {slot_range}\")\n",
+    "        pbar.update(1)\n",
+    "pbar.close()           "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9d2ce4a3-fb33-4b47-9c7f-a7919405ab65",
+   "metadata": {},
+   "source": [
+    "## Results\n",
+    "\n",
+    "Display the number of errors in each list."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "e01450d1-3369-4fc5-80be-9787e00a6597",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(4857, 23503)"
+      ]
+     },
+     "execution_count": 15,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "len(errors[\"not_found\"]), len(errors[\"invalid_type\"])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "54a560df",
+   "metadata": {},
+   "source": [
+    "Display a few errors from one of the lists, as an example."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "a25857f4-e26e-4896-9e5f-607e7b4bb07c",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['mags_activity_set doc nmdc:fdefb3fa15098906cf788f5cadf17bb3: field part_of referenced doc nmdc:mga0vx38 not found',\n",
+       " 'mags_activity_set doc nmdc:78f8bf24916f01d053378b1bd464cd8a: field has_input referenced doc nmdc:9003278a200d1e7921e978d4c59233c3 not found',\n",
+       " 'mags_activity_set doc nmdc:a57ecfc4dee4e6938a5517ad0961dcd8: field part_of referenced doc nmdc:mga08x19 not found',\n",
+       " 'mags_activity_set doc nmdc:3e0d8aae3b16d5bba2b3faec04391929: field part_of referenced doc nmdc:mga06z11 not found',\n",
+       " 'mags_activity_set doc nmdc:4417090e8ce0e96ff2867b85823d4b26: field part_of referenced doc nmdc:mga07m45 not found']"
+      ]
+     },
+     "execution_count": 16,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "errors[\"not_found\"][:5]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c55c7524",
+   "metadata": {},
+   "source": [
+    "Spot check one of those errors."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "855e232d-0e94-428e-96eb-0535c5135bee",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "True"
+      ]
+     },
+     "execution_count": 17,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "mdb.alldocs.find_one({\"id\": \"nmdc:mga0vx38\"}) is None"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2bd191cd",
+   "metadata": {},
+   "source": [
+    "Display a few errors from the other list, as an example."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "33516e3c-f10d-4c30-942b-0d01d06082f9",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['data_object_set doc emsl:output_570856: field was_generated_by referenced doc emsl:570856 not of type Activity',\n",
+       " 'data_object_set doc emsl:output_570991: field was_generated_by referenced doc emsl:570991 not of type Activity',\n",
+       " 'data_object_set doc emsl:output_570998: field was_generated_by referenced doc emsl:570998 not of type Activity',\n",
+       " 'data_object_set doc emsl:output_570855: field was_generated_by referenced doc emsl:570855 not of type Activity',\n",
+       " 'data_object_set doc emsl:output_570823: field was_generated_by referenced doc emsl:570823 not of type Activity']"
+      ]
+     },
+     "execution_count": 18,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "errors[\"invalid_type\"][:5]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d4abec53",
+   "metadata": {},
+   "source": [
+    "Spot check one of those errors."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "29ec7e82-d079-4525-bd7b-d770fd69d788",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'_id': ObjectId('663fbef9ba64633177320f59'),\n",
+       " 'id': 'emsl:570856',\n",
+       " 'name': 'Rachael_21T_04-15A_M_14Mar17_leopard_Infuse',\n",
+       " 'instrument_name': '21T Agilent',\n",
+       " 'has_input': ['emsl:2f71038a-5dd1-11ec-bf63-0242ac130002'],\n",
+       " 'has_output': ['emsl:output_570856'],\n",
+       " 'omics_type': {'has_raw_value': 'Organic Matter Characterization'},\n",
+       " 'part_of': ['gold:Gs0110138'],\n",
+       " 'description': 'High resolution MS spectra only',\n",
+       " 'processing_institution': 'EMSL',\n",
+       " 'gold_sequencing_project_identifiers': [],\n",
+       " 'type': ['OmicsProcessing', 'PlannedProcess', 'NamedThing']}"
+      ]
+     },
+     "execution_count": 19,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# OmicsProcessing is not subclass of Activity (!)\n",
+    "mdb.alldocs.find_one({\"id\": \"emsl:570856\"})"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.13"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}