From 7504bfd2a07479f04aa053c809540ed474f9d7e8 Mon Sep 17 00:00:00 2001 From: Donny Winston Date: Tue, 5 Mar 2024 14:22:05 -0500 Subject: [PATCH 01/18] feat: triplified schema mongo collections served by jena fuseki --- docker-compose.yml | 23 +- .../notebooks/ghissue_401_sparql.ipynb | 788 ++++++++++++++++++ nmdc_runtime/fuseki.Dockerfile | 30 + nmdc_runtime/site/fuseki/fuseki-config.ttl | 32 + nmdc_runtime/site/fuseki/shiro.ini | 13 + 5 files changed, 874 insertions(+), 12 deletions(-) create mode 100644 metadata-translation/notebooks/ghissue_401_sparql.ipynb create mode 100644 nmdc_runtime/fuseki.Dockerfile create mode 100644 nmdc_runtime/site/fuseki/fuseki-config.ttl create mode 100644 nmdc_runtime/site/fuseki/shiro.ini diff --git a/docker-compose.yml b/docker-compose.yml index 8b544298..dc492158 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -88,20 +88,17 @@ services: MONGO_INITDB_ROOT_USERNAME: admin MONGO_INITDB_ROOT_PASSWORD: root - terminus: - image: terminusdb/terminusdb-server:v11.0.6 - container_name: terminus + fuseki: + container_name: fuseki + build: + dockerfile: nmdc_runtime/fuseki.Dockerfile + context: . 
ports: - - "6364:6363" - tty: true + - "3030:3030" volumes: - - nmdc_runtime_terminus_data:/app/terminusdb/storage - restart: unless-stopped - environment: - TERMINUSDB_SERVER_PORT: 6363 - TERMINUSDB_ADMIN_PASS: root - TERMINUSDB_AUTOLOGIN_ENABLED: "true" - TERMINUSDB_HTTPS_ENABLED: "false" + - ./nmdc_runtime/site/fuseki/fuseki-config.ttl:/configuration/fuseki-config.ttl + - ./nmdc_runtime/site/fuseki/shiro.ini:/fuseki/run/shiro.ini + - nmdc_runtime_fuseki_data:/fuseki-base volumes: nmdc_runtime_postgres_data: @@ -110,6 +107,8 @@ volumes: driver: local nmdc_runtime_terminus_data: driver: local + nmdc_runtime_fuseki_data: + driver: local secrets: mongoKeyFile: diff --git a/metadata-translation/notebooks/ghissue_401_sparql.ipynb b/metadata-translation/notebooks/ghissue_401_sparql.ipynb new file mode 100644 index 00000000..b8504f40 --- /dev/null +++ b/metadata-translation/notebooks/ghissue_401_sparql.ipynb @@ -0,0 +1,788 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "51ea05af-7579-43ad-aa9c-3bf8b6da8fdb", + "metadata": {}, + "source": [ + "Pipeline to transform the set of nmdc-schema-compliant mongodb collections to an RDF dataset amenable to SPARQL queries." + ] + }, + { + "cell_type": "markdown", + "id": "ae2673a5-560b-47b0-9608-656aa3854466", + "metadata": {}, + "source": [ + "Ensure that changes to the code will be import-able in this notebook without needing restart the kernel and thus lose state." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "a362b42f-7ae0-40cf-91d4-8f19ca1087cf", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "markdown", + "id": "0b8b1fb7-2357-46ef-8d86-69cd1dce228d", + "metadata": {}, + "source": [ + "Connect to local dockerized dev environment." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "55932d03-802f-4efe-bceb-e1036cd35567", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MONGO_HOST=localhost:27018\n" + ] + } + ], + "source": [ + "from dotenv import load_dotenv\n", + "\n", + "load_dotenv(\".env.localhost\")\n", + "!env | grep MONGO_HOST" + ] + }, + { + "cell_type": "markdown", + "id": "3a146763-f03a-4d65-baa0-81ca15cba689", + "metadata": {}, + "source": [ + "Initialize a db connection." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "edb1bb42-005c-49ca-ba59-18c24833f93f", + "metadata": {}, + "outputs": [], + "source": [ + "from nmdc_runtime.api.db.mongo import get_mongo_db\n", + "\n", + "mdb = get_mongo_db()" + ] + }, + { + "cell_type": "markdown", + "id": "37dbc9a8-8cac-4798-8d4f-ccbd9c3560e9", + "metadata": {}, + "source": [ + "Get all populated nmdc-schema collections with entity `id`s." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "3a0dd489-74cc-47c4-b3e0-c97dd88f5b5f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['biosample_set',\n", + " 'data_object_set',\n", + " 'extraction_set',\n", + " 'field_research_site_set',\n", + " 'library_preparation_set',\n", + " 'mags_activity_set',\n", + " 'metabolomics_analysis_activity_set',\n", + " 'metagenome_annotation_activity_set',\n", + " 'metagenome_assembly_set',\n", + " 'metagenome_sequencing_activity_set',\n", + " 'metaproteomics_analysis_activity_set',\n", + " 'metatranscriptome_activity_set',\n", + " 'nom_analysis_activity_set',\n", + " 'omics_processing_set',\n", + " 'pooling_set',\n", + " 'processed_sample_set',\n", + " 'read_based_taxonomy_analysis_activity_set',\n", + " 'read_qc_analysis_activity_set',\n", + " 'study_set']" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from nmdc_runtime.util import schema_collection_names_with_id_field\n", + 
"\n", + "populated_collections = sorted([\n", + " name for name in set(schema_collection_names_with_id_field()) & set(mdb.list_collection_names())\n", + " if mdb[name].estimated_document_count() > 0\n", + "])\n", + "populated_collections" + ] + }, + { + "cell_type": "markdown", + "id": "f9a45de7-ba27-4b18-8ff4-9ba44eeb1091", + "metadata": {}, + "source": [ + "Get a JSON-LD context for the NMDC Schema, to serialize documents to RDF." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "9ed72826-b552-4429-8ab5-9f7126821822", + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "from pprint import pprint\n", + "\n", + "from linkml.generators.jsonldcontextgen import ContextGenerator\n", + "from nmdc_schema.nmdc_data import get_nmdc_schema_definition\n", + "\n", + "context = ContextGenerator(get_nmdc_schema_definition())\n", + "context = json.loads(context.serialize())[\"@context\"]\n", + "\n", + "for k, v in list(context.items()):\n", + " if isinstance(v, dict): #and v.get(\"@type\") == \"@id\":\n", + " v.pop(\"@id\", None) # use nmdc uri, not e.g. MIXS uri" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "648b4f70-34d6-4c70-8d0a-ef76e7e5d96d", + "metadata": {}, + "outputs": [], + "source": [ + "from rdflib import Graph\n", + "\n", + "g = Graph()" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "4d802017-2a7e-4614-b662-6a0cc027b8bc", + "metadata": {}, + "outputs": [], + "source": [ + "def split_chunk(seq, n: int):\n", + " \"\"\"\n", + " Split sequence into chunks of length n. 
Do not pad last chunk.\n", + " \n", + " >>> list(split_chunk(list(range(10)), 3))\n", + " [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]\n", + " \"\"\"\n", + " for i in range(0, len(seq), n):\n", + " yield seq[i : i + n]" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "4251e0b1-35dc-4f40-91e7-b9bc0d9d79e1", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "0ac75a239524499d901b1b5a25bc74f7", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/112 [00:00POS: 100,000 slots (Batch: 1,333,333 slots/s / Avg: 1,333,333 slots/s)\n", + "19:15:20 INFO loader :: Index SPO->POS: 200,000 slots (Batch: 952,380 slots/s / Avg: 1,111,111 slots/s)\n", + "19:15:20 INFO loader :: Index SPO->POS: 300,000 slots (Batch: 1,190,476 slots/s / Avg: 1,136,363 slots/s)\n", + "19:15:20 INFO loader :: Index SPO->POS: 400,000 slots (Batch: 1,123,595 slots/s / Avg: 1,133,144 slots/s)\n", + "19:15:20 INFO loader :: Index SPO->POS: 500,000 slots (Batch: 1,111,111 slots/s / Avg: 1,128,668 slots/s)\n", + "19:15:20 INFO loader :: Index SPO->POS: 600,000 slots (Batch: 862,068 slots/s / Avg: 1,073,345 slots/s)\n", + "19:15:20 INFO loader :: Index SPO->POS: 700,000 slots (Batch: 1,052,631 slots/s / Avg: 1,070,336 slots/s)\n", + "19:15:20 INFO loader :: Index SPO->POS: 800,000 slots (Batch: 1,176,470 slots/s / Avg: 1,082,543 slots/s)\n", + "19:15:20 INFO loader :: Index SPO->POS: 900,000 slots (Batch: 900,900 slots/s / Avg: 1,058,823 slots/s)\n", + "19:15:21 INFO loader :: Index SPO->POS: 1,000,000 slots (Batch: 1,149,425 slots/s / Avg: 1,067,235 slots/s)\n", + "19:15:21 INFO loader :: Elapsed: 49.21 seconds [2024/03/05 19:15:21 UTC]\n", + "19:15:21 INFO loader :: Index SPO->POS: 1,100,000 slots (Batch: 1,086,956 slots/s / Avg: 1,068,999 slots/s)\n", + "19:15:21 INFO loader :: Index SPO->POS: 1,200,000 slots (Batch: 840,336 slots/s / Avg: 1,045,296 slots/s)\n", + "19:15:21 INFO loader :: Index SPO->POS: 
1,300,000 slots (Batch: 970,873 slots/s / Avg: 1,039,168 slots/s)\n", + "19:15:21 INFO loader :: Index SPO->POS: 1,400,000 slots (Batch: 970,873 slots/s / Avg: 1,033,973 slots/s)\n", + "19:15:21 INFO loader :: Index SPO->POS: 1,500,000 slots (Batch: 952,380 slots/s / Avg: 1,028,101 slots/s)\n", + "19:15:21 INFO loader :: Index SPO->POS: 1,600,000 slots (Batch: 826,446 slots/s / Avg: 1,012,658 slots/s)\n", + "19:15:21 INFO loader :: Index SPO->POS: 1,700,000 slots (Batch: 1,030,927 slots/s / Avg: 1,013,714 slots/s)\n", + "19:15:21 INFO loader :: Index SPO->POS: 1,800,000 slots (Batch: 1,010,101 slots/s / Avg: 1,013,513 slots/s)\n", + "19:15:21 INFO loader :: Index SPO->POS: 1,900,000 slots (Batch: 1,010,101 slots/s / Avg: 1,013,333 slots/s)\n", + "19:15:22 INFO loader :: Index SPO->POS: 2,000,000 slots (Batch: 781,250 slots/s / Avg: 998,502 slots/s)\n", + "19:15:22 INFO loader :: Elapsed: 50.27 seconds [2024/03/05 19:15:22 UTC]\n", + "19:15:22 INFO loader :: Index SPO->POS: 2,100,000 slots (Batch: 961,538 slots/s / Avg: 996,677 slots/s)\n", + "19:15:22 INFO loader :: Index SPO->POS: 2,200,000 slots (Batch: 970,873 slots/s / Avg: 995,475 slots/s)\n", + "19:15:22 INFO loader :: Index SPO->POS: 2,300,000 slots (Batch: 943,396 slots/s / Avg: 993,091 slots/s)\n", + "19:15:22 INFO loader :: Index SPO->POS: 2,400,000 slots (Batch: 740,740 slots/s / Avg: 979,192 slots/s)\n", + "19:15:22 INFO loader :: Index SPO->POS: 2,500,000 slots (Batch: 892,857 slots/s / Avg: 975,419 slots/s)\n", + "19:15:22 INFO loader :: Index SPO->POS: 2,600,000 slots (Batch: 877,192 slots/s / Avg: 971,236 slots/s)\n", + "19:15:22 INFO loader :: Index SPO->POS: 2,700,000 slots (Batch: 917,431 slots/s / Avg: 969,131 slots/s)\n", + "19:15:22 INFO loader :: Index SPO->POS: 2,800,000 slots (Batch: 781,250 slots/s / Avg: 960,878 slots/s)\n", + "19:15:23 INFO loader :: Index SPO->POS: 2,900,000 slots (Batch: 877,192 slots/s / Avg: 957,727 slots/s)\n", + "19:15:23 INFO loader :: Index SPO->POS: 3,000,000 
slots (Batch: 847,457 slots/s / Avg: 953,591 slots/s)\n", + "19:15:23 INFO loader :: Elapsed: 51.41 seconds [2024/03/05 19:15:23 UTC]\n", + "19:15:23 INFO loader :: Index SPO->POS: 3,100,000 slots (Batch: 751,879 slots/s / Avg: 945,410 slots/s)\n", + "19:15:23 INFO loader :: Index SPO->POS: 3,200,000 slots (Batch: 884,955 slots/s / Avg: 943,396 slots/s)\n", + "19:15:23 INFO loader :: Index SPO->POS: 3,300,000 slots (Batch: 847,457 slots/s / Avg: 940,170 slots/s)\n", + "19:15:23 INFO loader :: Index SPO->POS: 3,400,000 slots (Batch: 877,192 slots/s / Avg: 938,189 slots/s)\n", + "19:15:23 INFO loader :: Index SPO->POS: 3,500,000 slots (Batch: 704,225 slots/s / Avg: 929,368 slots/s)\n", + "19:15:23 INFO loader :: Index SPO->POS: 3,600,000 slots (Batch: 819,672 slots/s / Avg: 925,925 slots/s)\n", + "19:15:24 INFO loader :: Index SPO->POS: 3,700,000 slots (Batch: 806,451 slots/s / Avg: 922,233 slots/s)\n", + "19:15:24 INFO loader :: Index SPO->POS: 3,800,000 slots (Batch: 847,457 slots/s / Avg: 920,096 slots/s)\n", + "19:15:24 INFO loader :: Index SPO->POS: 3,900,000 slots (Batch: 714,285 slots/s / Avg: 913,348 slots/s)\n", + "19:15:24 INFO loader :: Index SPO->POS: 4,000,000 slots (Batch: 840,336 slots/s / Avg: 911,369 slots/s)\n", + "19:15:24 INFO loader :: Elapsed: 52.66 seconds [2024/03/05 19:15:24 UTC]\n", + "19:15:24 INFO loader :: Index SPO->POS: 4,100,000 slots (Batch: 847,457 slots/s / Avg: 909,696 slots/s)\n", + "19:15:24 INFO loader :: Index SPO->POS: 4,200,000 slots (Batch: 847,457 slots/s / Avg: 908,108 slots/s)\n", + "19:15:24 INFO loader :: Index SPO->POS: 4,300,000 slots (Batch: 709,219 slots/s / Avg: 902,224 slots/s)\n", + "19:15:24 INFO loader :: Index SPO->POS: 4,400,000 slots (Batch: 854,700 slots/s / Avg: 901,085 slots/s)\n", + "19:15:25 INFO loader :: Index SPO->POS: 4,500,000 slots (Batch: 840,336 slots/s / Avg: 899,640 slots/s)\n", + "19:15:25 INFO loader :: Index SPO->POS: 4,600,000 slots (Batch: 813,008 slots/s / Avg: 897,560 slots/s)\n", + 
"19:15:25 INFO loader :: Index SPO->POS: 4,700,000 slots (Batch: 724,637 slots/s / Avg: 893,026 slots/s)\n", + "19:15:25 INFO loader :: Index SPO->POS: 4,800,000 slots (Batch: 833,333 slots/s / Avg: 891,696 slots/s)\n", + "19:15:25 INFO loader :: Index SPO->POS: 4,900,000 slots (Batch: 833,333 slots/s / Avg: 890,423 slots/s)\n", + "19:15:25 INFO loader :: Index SPO->POS: 5,000,000 slots (Batch: 781,250 slots/s / Avg: 887,941 slots/s)\n", + "19:15:25 INFO loader :: Elapsed: 53.90 seconds [2024/03/05 19:15:25 UTC]\n", + "19:15:25 INFO loader :: Index SPO->POS: 5,100,000 slots (Batch: 694,444 slots/s / Avg: 883,116 slots/s)\n", + "19:15:25 INFO loader :: Index SPO->POS: 5,200,000 slots (Batch: 763,358 slots/s / Avg: 880,460 slots/s)\n", + "19:15:26 INFO loader :: Index SPO->POS: 5,300,000 slots (Batch: 757,575 slots/s / Avg: 877,774 slots/s)\n", + "19:15:26 INFO loader :: Index SPO->POS: 5,400,000 slots (Batch: 746,268 slots/s / Avg: 874,918 slots/s)\n", + "19:15:26 INFO loader :: Index SPO->POS: 5,500,000 slots (Batch: 704,225 slots/s / Avg: 871,080 slots/s)\n", + "19:15:26 INFO loader :: Index SPO->POS: 5,600,000 slots (Batch: 793,650 slots/s / Avg: 869,565 slots/s)\n", + "19:15:26 INFO loader :: Index SPO->POS: 5,700,000 slots (Batch: 787,401 slots/s / Avg: 867,976 slots/s)\n", + "19:15:26 INFO loader :: Index SPO->POS: 5,800,000 slots (Batch: 800,000 slots/s / Avg: 866,706 slots/s)\n", + "19:15:26 INFO loader :: Index SPO->POS: 5,900,000 slots (Batch: 787,401 slots/s / Avg: 865,229 slots/s)\n", + "19:15:27 INFO loader :: Index SPO->POS: 6,000,000 slots (Batch: 813,008 slots/s / Avg: 864,304 slots/s)\n", + "19:15:27 INFO loader :: Elapsed: 55.21 seconds [2024/03/05 19:15:27 UTC]\n", + "19:15:27 INFO loader :: Index SPO->POS: 6,100,000 slots (Batch: 769,230 slots/s / Avg: 862,556 slots/s)\n", + "19:15:27 INFO loader :: Index SPO->POS: 6,200,000 slots (Batch: 787,401 slots/s / Avg: 861,230 slots/s)\n", + "19:15:27 INFO loader :: Index SPO->POS: 6,300,000 slots 
(Batch: 781,250 slots/s / Avg: 859,833 slots/s)\n", + "19:15:27 INFO loader :: Index SPO->POS: 6,400,000 slots (Batch: 806,451 slots/s / Avg: 858,945 slots/s)\n", + "19:15:27 INFO loader :: Index SPO->POS: 6,500,000 slots (Batch: 819,672 slots/s / Avg: 858,312 slots/s)\n", + "19:15:27 INFO loader :: ** Index SPO->POS: 6,546,004 slots indexed in 7.63 seconds [Rate: 858,042.19 per second]\n", + "19:15:27 INFO loader :: Index SPO->OSP: 100,000 slots (Batch: 1,923,076 slots/s / Avg: 1,923,076 slots/s)\n", + "19:15:27 INFO loader :: Index SPO->OSP: 200,000 slots (Batch: 1,538,461 slots/s / Avg: 1,709,401 slots/s)\n", + "19:15:27 INFO loader :: Index SPO->OSP: 300,000 slots (Batch: 1,538,461 slots/s / Avg: 1,648,351 slots/s)\n", + "19:15:27 INFO loader :: Index SPO->OSP: 400,000 slots (Batch: 1,449,275 slots/s / Avg: 1,593,625 slots/s)\n", + "19:15:28 INFO loader :: Index SPO->OSP: 500,000 slots (Batch: 1,351,351 slots/s / Avg: 1,538,461 slots/s)\n", + "19:15:28 INFO loader :: Index SPO->OSP: 600,000 slots (Batch: 1,250,000 slots/s / Avg: 1,481,481 slots/s)\n", + "19:15:28 INFO loader :: Index SPO->OSP: 700,000 slots (Batch: 1,219,512 slots/s / Avg: 1,437,371 slots/s)\n", + "19:15:28 INFO loader :: Index SPO->OSP: 800,000 slots (Batch: 1,063,829 slots/s / Avg: 1,376,936 slots/s)\n", + "19:15:28 INFO loader :: Index SPO->OSP: 900,000 slots (Batch: 1,075,268 slots/s / Avg: 1,335,311 slots/s)\n", + "19:15:28 INFO loader :: Index SPO->OSP: 1,000,000 slots (Batch: 1,075,268 slots/s / Avg: 1,303,780 slots/s)\n", + "19:15:28 INFO loader :: Elapsed: 56.66 seconds [2024/03/05 19:15:28 UTC]\n", + "19:15:28 INFO loader :: Index SPO->OSP: 1,100,000 slots (Batch: 1,098,901 slots/s / Avg: 1,282,051 slots/s)\n", + "19:15:28 INFO loader :: Index SPO->OSP: 1,200,000 slots (Batch: 1,111,111 slots/s / Avg: 1,265,822 slots/s)\n", + "19:15:28 INFO loader :: Index SPO->OSP: 1,300,000 slots (Batch: 1,136,363 slots/s / Avg: 1,254,826 slots/s)\n", + "19:15:28 INFO loader :: Index SPO->OSP: 
1,400,000 slots (Batch: 1,030,927 slots/s / Avg: 1,235,657 slots/s)\n", + "19:15:28 INFO loader :: Index SPO->OSP: 1,500,000 slots (Batch: 1,086,956 slots/s / Avg: 1,224,489 slots/s)\n", + "19:15:29 INFO loader :: Index SPO->OSP: 1,600,000 slots (Batch: 1,086,956 slots/s / Avg: 1,214,882 slots/s)\n", + "19:15:29 INFO loader :: Index SPO->OSP: 1,700,000 slots (Batch: 1,000,000 slots/s / Avg: 1,199,717 slots/s)\n", + "19:15:29 INFO loader :: Index SPO->OSP: 1,800,000 slots (Batch: 1,030,927 slots/s / Avg: 1,188,903 slots/s)\n", + "19:15:29 INFO loader :: Index SPO->OSP: 1,900,000 slots (Batch: 1,010,101 slots/s / Avg: 1,177,929 slots/s)\n", + "19:15:29 INFO loader :: Index SPO->OSP: 2,000,000 slots (Batch: 990,099 slots/s / Avg: 1,166,861 slots/s)\n", + "19:15:29 INFO loader :: Elapsed: 57.61 seconds [2024/03/05 19:15:29 UTC]\n", + "19:15:29 INFO loader :: Index SPO->OSP: 2,100,000 slots (Batch: 1,030,927 slots/s / Avg: 1,159,580 slots/s)\n", + "19:15:29 INFO loader :: Index SPO->OSP: 2,200,000 slots (Batch: 1,010,101 slots/s / Avg: 1,151,832 slots/s)\n", + "19:15:29 INFO loader :: Index SPO->OSP: 2,300,000 slots (Batch: 1,010,101 slots/s / Avg: 1,144,848 slots/s)\n", + "19:15:29 INFO loader :: Index SPO->OSP: 2,400,000 slots (Batch: 990,099 slots/s / Avg: 1,137,440 slots/s)\n", + "19:15:29 INFO loader :: Index SPO->OSP: 2,500,000 slots (Batch: 990,099 slots/s / Avg: 1,130,710 slots/s)\n", + "19:15:30 INFO loader :: Index SPO->OSP: 2,600,000 slots (Batch: 1,030,927 slots/s / Avg: 1,126,516 slots/s)\n", + "19:15:30 INFO loader :: Index SPO->OSP: 2,700,000 slots (Batch: 970,873 slots/s / Avg: 1,119,867 slots/s)\n", + "19:15:30 INFO loader :: Index SPO->OSP: 2,800,000 slots (Batch: 961,538 slots/s / Avg: 1,113,320 slots/s)\n", + "19:15:30 INFO loader :: Index SPO->OSP: 2,900,000 slots (Batch: 961,538 slots/s / Avg: 1,107,292 slots/s)\n", + "19:15:30 INFO loader :: Index SPO->OSP: 3,000,000 slots (Batch: 1,010,101 slots/s / Avg: 1,103,752 slots/s)\n", + "19:15:30 INFO 
loader :: Elapsed: 58.62 seconds [2024/03/05 19:15:30 UTC]\n", + "19:15:30 INFO loader :: Index SPO->OSP: 3,100,000 slots (Batch: 1,010,101 slots/s / Avg: 1,100,461 slots/s)\n", + "19:15:30 INFO loader :: Index SPO->OSP: 3,200,000 slots (Batch: 952,380 slots/s / Avg: 1,095,140 slots/s)\n", + "19:15:30 INFO loader :: Index SPO->OSP: 3,300,000 slots (Batch: 961,538 slots/s / Avg: 1,090,548 slots/s)\n", + "19:15:30 INFO loader :: Index SPO->OSP: 3,400,000 slots (Batch: 952,380 slots/s / Avg: 1,085,915 slots/s)\n", + "19:15:30 INFO loader :: Index SPO->OSP: 3,500,000 slots (Batch: 1,010,101 slots/s / Avg: 1,083,591 slots/s)\n", + "19:15:31 INFO loader :: Index SPO->OSP: 3,600,000 slots (Batch: 990,099 slots/s / Avg: 1,080,756 slots/s)\n", + "19:15:31 INFO loader :: Index SPO->OSP: 3,700,000 slots (Batch: 934,579 slots/s / Avg: 1,076,207 slots/s)\n", + "19:15:31 INFO loader :: Index SPO->OSP: 3,800,000 slots (Batch: 952,380 slots/s / Avg: 1,072,537 slots/s)\n", + "19:15:31 INFO loader :: Index SPO->OSP: 3,900,000 slots (Batch: 934,579 slots/s / Avg: 1,068,493 slots/s)\n", + "19:15:31 INFO loader :: Index SPO->OSP: 4,000,000 slots (Batch: 970,873 slots/s / Avg: 1,065,814 slots/s)\n", + "19:15:31 INFO loader :: Elapsed: 59.65 seconds [2024/03/05 19:15:31 UTC]\n", + "19:15:31 INFO loader :: Index SPO->OSP: 4,100,000 slots (Batch: 970,873 slots/s / Avg: 1,063,278 slots/s)\n", + "19:15:31 INFO loader :: Index SPO->OSP: 4,200,000 slots (Batch: 952,380 slots/s / Avg: 1,060,338 slots/s)\n", + "19:15:31 INFO loader :: Index SPO->OSP: 4,300,000 slots (Batch: 917,431 slots/s / Avg: 1,056,511 slots/s)\n", + "19:15:31 INFO loader :: Index SPO->OSP: 4,400,000 slots (Batch: 952,380 slots/s / Avg: 1,053,892 slots/s)\n", + "19:15:31 INFO loader :: Index SPO->OSP: 4,500,000 slots (Batch: 970,873 slots/s / Avg: 1,051,893 slots/s)\n", + "19:15:32 INFO loader :: Index SPO->OSP: 4,600,000 slots (Batch: 909,090 slots/s / Avg: 1,048,313 slots/s)\n", + "19:15:32 INFO loader :: Index SPO->OSP: 
4,700,000 slots (Batch: 909,090 slots/s / Avg: 1,044,908 slots/s)\n", + "19:15:32 INFO loader :: Index SPO->OSP: 4,800,000 slots (Batch: 909,090 slots/s / Avg: 1,041,666 slots/s)\n", + "19:15:32 INFO loader :: Index SPO->OSP: 4,900,000 slots (Batch: 917,431 slots/s / Avg: 1,038,795 slots/s)\n", + "19:15:32 INFO loader :: Index SPO->OSP: 5,000,000 slots (Batch: 877,192 slots/s / Avg: 1,034,982 slots/s)\n", + "19:15:32 INFO loader :: Elapsed: 60.73 seconds [2024/03/05 19:15:32 UTC]\n", + "19:15:32 INFO loader :: Index SPO->OSP: 5,100,000 slots (Batch: 869,565 slots/s / Avg: 1,031,136 slots/s)\n", + "19:15:32 INFO loader :: Index SPO->OSP: 5,200,000 slots (Batch: 909,090 slots/s / Avg: 1,028,481 slots/s)\n", + "19:15:32 INFO loader :: Index SPO->OSP: 5,300,000 slots (Batch: 869,565 slots/s / Avg: 1,024,946 slots/s)\n", + "19:15:32 INFO loader :: Index SPO->OSP: 5,400,000 slots (Batch: 854,700 slots/s / Avg: 1,021,180 slots/s)\n", + "19:15:33 INFO loader :: Index SPO->OSP: 5,500,000 slots (Batch: 854,700 slots/s / Avg: 1,017,576 slots/s)\n", + "19:15:33 INFO loader :: Index SPO->OSP: 5,600,000 slots (Batch: 854,700 slots/s / Avg: 1,014,125 slots/s)\n", + "19:15:33 INFO loader :: Index SPO->OSP: 5,700,000 slots (Batch: 862,068 slots/s / Avg: 1,010,996 slots/s)\n", + "19:15:33 INFO loader :: Index SPO->OSP: 5,800,000 slots (Batch: 934,579 slots/s / Avg: 1,009,573 slots/s)\n", + "19:15:33 INFO loader :: Index SPO->OSP: 5,900,000 slots (Batch: 892,857 slots/s / Avg: 1,007,341 slots/s)\n", + "19:15:33 INFO loader :: Index SPO->OSP: 6,000,000 slots (Batch: 892,857 slots/s / Avg: 1,005,193 slots/s)\n", + "19:15:33 INFO loader :: Elapsed: 61.87 seconds [2024/03/05 19:15:33 UTC]\n", + "19:15:33 INFO loader :: Index SPO->OSP: 6,100,000 slots (Batch: 884,955 slots/s / Avg: 1,002,959 slots/s)\n", + "19:15:33 INFO loader :: Index SPO->OSP: 6,200,000 slots (Batch: 900,900 slots/s / Avg: 1,001,130 slots/s)\n", + "19:15:34 INFO loader :: Index SPO->OSP: 6,300,000 slots (Batch: 884,955 
slots/s / Avg: 999,048 slots/s)\n", + "19:15:34 INFO loader :: Index SPO->OSP: 6,400,000 slots (Batch: 892,857 slots/s / Avg: 997,195 slots/s)\n", + "19:15:34 INFO loader :: Index SPO->OSP: 6,500,000 slots (Batch: 892,857 slots/s / Avg: 995,405 slots/s)\n", + "19:15:34 INFO loader :: ** Index SPO->OSP: 6,546,004 slots indexed in 6.58 seconds [Rate: 994,833.44 per second]\n", + "19:15:34 INFO loader :: -- Finish triples index phase\n", + "19:15:34 INFO loader :: ** 6,546,004 triples indexed in 14.21 seconds [Rate: 460,629.38 per second]\n", + "19:15:34 INFO loader :: -- Finish triples load\n", + "19:15:34 INFO loader :: ** Completed: 6,546,004 triples loaded in 62.48 seconds [Rate: 104,774.62 per second]\n", + "19:15:34 INFO loader :: -- Finish quads load\n" + ] + } + ], + "source": [ + "!docker compose run fuseki ./apache-jena-4.9.0/bin/tdbloader --loc /fuseki-base/nmdc-db.tdb /fuseki-base/nmdc-db.nt.gz" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "id": "1a0bfb4b-e694-40b1-88af-4446e3fcc888", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[1A\u001b[1B\u001b[0G\u001b[?25l[+] Running 1/0\n", + " \u001b[32m✔\u001b[0m Container fuseki \u001b[32mCreated\u001b[0m \u001b[34m0.0s \u001b[0m\n", + "\u001b[?25h\u001b[1A\u001b[1A\u001b[0G\u001b[?25l\u001b[34m[+] Running 1/1\u001b[0m\n", + " \u001b[32m✔\u001b[0m Container fuseki \u001b[32mStarted\u001b[0m \u001b[34m0.0s \u001b[0m\n", + "\u001b[?25h" + ] + } + ], + "source": [ + "!docker compose up fuseki -d" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "nmdc-runtime", + "language": "python", + "name": "nmdc-runtime" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git 
a/nmdc_runtime/fuseki.Dockerfile b/nmdc_runtime/fuseki.Dockerfile new file mode 100644 index 00000000..a6b60555 --- /dev/null +++ b/nmdc_runtime/fuseki.Dockerfile @@ -0,0 +1,30 @@ +# Use an appropriate base image that includes Java and wget +FROM openjdk:11-jre-slim + +# Set environment variables +ENV FUSEKI_VERSION 4.9.0 +ENV FUSEKI_HOME /fuseki + +# Install wget +RUN apt-get update && apt-get install -y wget && rm -rf /var/lib/apt/lists/* + +# Download and extract Fuseki +RUN wget -qO- https://archive.apache.org/dist/jena/binaries/apache-jena-fuseki-$FUSEKI_VERSION.tar.gz | tar xvz -C / && \ + mv /apache-jena-fuseki-$FUSEKI_VERSION $FUSEKI_HOME + +# Expose the default port +EXPOSE 3030 + +# Download and extract Jena Commands +RUN wget -qO- https://archive.apache.org/dist/jena/binaries/apache-jena-$FUSEKI_VERSION.tar.gz | tar xvz -C / && \ + mv /apache-jena-$FUSEKI_VERSION $FUSEKI_HOME + +# Copy the Fuseki configuration file to the container +COPY ./nmdc_runtime/site/fuseki/fuseki-config.ttl $FUSEKI_HOME/configuration/ +COPY ./nmdc_runtime/site/fuseki/shiro.ini $FUSEKI_HOME/run/ + +# Set working directory +WORKDIR $FUSEKI_HOME + +# Command to start Fuseki server with preloaded data +CMD ["./fuseki-server", "--config", "configuration/fuseki-config.ttl"] \ No newline at end of file diff --git a/nmdc_runtime/site/fuseki/fuseki-config.ttl b/nmdc_runtime/site/fuseki/fuseki-config.ttl new file mode 100644 index 00000000..b7cd9f15 --- /dev/null +++ b/nmdc_runtime/site/fuseki/fuseki-config.ttl @@ -0,0 +1,32 @@ +@prefix : <#> . +@prefix fuseki: . +@prefix rdf: . +@prefix rdfs: . +@prefix ja: . +@prefix tdb: . +@prefix afn: . +@prefix nmdc: . + +[] rdf:type fuseki:Server ; + fuseki:services ( + <#nmdc> + ) . + +<#nmdc> rdf:type fuseki:Service ; + fuseki:name "nmdc" ; + fuseki:serviceQuery "query" ; + fuseki:serviceQuery "sparql" ; + fuseki:serviceUpdate "update" ; + fuseki:serviceUpload "upload" ; + fuseki:serviceReadWriteGraphStore "data" ; + fuseki:dataset <#dataset> . 
+ +<#dataset> rdf:type tdb:DatasetTDB ; tdb:location "/fuseki-base/nmdc-db.tdb" . + +#<#dataset> rdf:type ja:RDFDataset ; +# ja:defaultGraph <#model> . +# +#<#model> rdf:type ja:MemoryModel ; +# . +# ja:content [ja:externalContent ] ; +# ja:content [ja:externalContent ] . diff --git a/nmdc_runtime/site/fuseki/shiro.ini b/nmdc_runtime/site/fuseki/shiro.ini new file mode 100644 index 00000000..284b4d1f --- /dev/null +++ b/nmdc_runtime/site/fuseki/shiro.ini @@ -0,0 +1,13 @@ +[main] +localhost=org.apache.jena.fuseki.authz.LocalhostFilter + +[urls] +## Control functions open to anyone +/$/server = anon +/$/ping = anon +/$/stats = anon +/$/stats/* = anon +## and the rest are restricted to localhost. +## See above for 'localhost' +/$/** = anon +/**=anon From cc9776d8e1a466d4f4393106bfb6b9ade37b5441 Mon Sep 17 00:00:00 2001 From: Donny Winston Date: Tue, 5 Mar 2024 14:28:21 -0500 Subject: [PATCH 02/18] fix: https uri --- nmdc_runtime/site/fuseki/fuseki-config.ttl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nmdc_runtime/site/fuseki/fuseki-config.ttl b/nmdc_runtime/site/fuseki/fuseki-config.ttl index b7cd9f15..49ad627f 100644 --- a/nmdc_runtime/site/fuseki/fuseki-config.ttl +++ b/nmdc_runtime/site/fuseki/fuseki-config.ttl @@ -5,7 +5,7 @@ @prefix ja: . @prefix tdb: . @prefix afn: . -@prefix nmdc: . +@prefix nmdc: . 
[] rdf:type fuseki:Server ; fuseki:services ( From 23b4a518c64d1a31b79829bea247e4b26d44ddd8 Mon Sep 17 00:00:00 2001 From: Donny Winston Date: Tue, 5 Mar 2024 16:25:15 -0500 Subject: [PATCH 03/18] fix: nmdc:type range is of type @id --- .../notebooks/ghissue_401_sparql.ipynb | 590 ++---------------- 1 file changed, 65 insertions(+), 525 deletions(-) diff --git a/metadata-translation/notebooks/ghissue_401_sparql.ipynb b/metadata-translation/notebooks/ghissue_401_sparql.ipynb index b8504f40..fbe1738a 100644 --- a/metadata-translation/notebooks/ghissue_401_sparql.ipynb +++ b/metadata-translation/notebooks/ghissue_401_sparql.ipynb @@ -18,7 +18,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "a362b42f-7ae0-40cf-91d4-8f19ca1087cf", "metadata": {}, "outputs": [], @@ -37,18 +37,10 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "55932d03-802f-4efe-bceb-e1036cd35567", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "MONGO_HOST=localhost:27018\n" - ] - } - ], + "outputs": [], "source": [ "from dotenv import load_dotenv\n", "\n", @@ -66,7 +58,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "edb1bb42-005c-49ca-ba59-18c24833f93f", "metadata": {}, "outputs": [], @@ -86,39 +78,10 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "3a0dd489-74cc-47c4-b3e0-c97dd88f5b5f", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['biosample_set',\n", - " 'data_object_set',\n", - " 'extraction_set',\n", - " 'field_research_site_set',\n", - " 'library_preparation_set',\n", - " 'mags_activity_set',\n", - " 'metabolomics_analysis_activity_set',\n", - " 'metagenome_annotation_activity_set',\n", - " 'metagenome_assembly_set',\n", - " 'metagenome_sequencing_activity_set',\n", - " 'metaproteomics_analysis_activity_set',\n", - " 'metatranscriptome_activity_set',\n", - " 
'nom_analysis_activity_set',\n", - " 'omics_processing_set',\n", - " 'pooling_set',\n", - " 'processed_sample_set',\n", - " 'read_based_taxonomy_analysis_activity_set',\n", - " 'read_qc_analysis_activity_set',\n", - " 'study_set']" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "from nmdc_runtime.util import schema_collection_names_with_id_field\n", "\n", @@ -139,7 +102,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "id": "9ed72826-b552-4429-8ab5-9f7126821822", "metadata": {}, "outputs": [], @@ -155,12 +118,23 @@ "\n", "for k, v in list(context.items()):\n", " if isinstance(v, dict): #and v.get(\"@type\") == \"@id\":\n", - " v.pop(\"@id\", None) # use nmdc uri, not e.g. MIXS uri" + " v.pop(\"@id\", None) # use nmdc uri, not e.g. MIXS uri\n", + "pprint(context)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "62a68c07-0706-4300-a48d-0ab628af87b1", + "metadata": {}, + "outputs": [], + "source": [ + "context['type'] = {'@type': '@id'}" ] }, { "cell_type": "code", - "execution_count": 31, + "execution_count": null, "id": "648b4f70-34d6-4c70-8d0a-ef76e7e5d96d", "metadata": {}, "outputs": [], @@ -172,7 +146,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": null, "id": "4d802017-2a7e-4614-b662-6a0cc027b8bc", "metadata": {}, "outputs": [], @@ -190,50 +164,10 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": null, "id": "4251e0b1-35dc-4f40-91e7-b9bc0d9d79e1", "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "0ac75a239524499d901b1b5a25bc74f7", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/112 [00:00POS: 100,000 slots (Batch: 1,333,333 slots/s / Avg: 1,333,333 slots/s)\n", - "19:15:20 INFO loader :: Index SPO->POS: 200,000 slots (Batch: 952,380 slots/s / Avg: 1,111,111 slots/s)\n", - "19:15:20 INFO 
loader :: Index SPO->POS: 300,000 slots (Batch: 1,190,476 slots/s / Avg: 1,136,363 slots/s)\n", - "19:15:20 INFO loader :: Index SPO->POS: 400,000 slots (Batch: 1,123,595 slots/s / Avg: 1,133,144 slots/s)\n", - "19:15:20 INFO loader :: Index SPO->POS: 500,000 slots (Batch: 1,111,111 slots/s / Avg: 1,128,668 slots/s)\n", - "19:15:20 INFO loader :: Index SPO->POS: 600,000 slots (Batch: 862,068 slots/s / Avg: 1,073,345 slots/s)\n", - "19:15:20 INFO loader :: Index SPO->POS: 700,000 slots (Batch: 1,052,631 slots/s / Avg: 1,070,336 slots/s)\n", - "19:15:20 INFO loader :: Index SPO->POS: 800,000 slots (Batch: 1,176,470 slots/s / Avg: 1,082,543 slots/s)\n", - "19:15:20 INFO loader :: Index SPO->POS: 900,000 slots (Batch: 900,900 slots/s / Avg: 1,058,823 slots/s)\n", - "19:15:21 INFO loader :: Index SPO->POS: 1,000,000 slots (Batch: 1,149,425 slots/s / Avg: 1,067,235 slots/s)\n", - "19:15:21 INFO loader :: Elapsed: 49.21 seconds [2024/03/05 19:15:21 UTC]\n", - "19:15:21 INFO loader :: Index SPO->POS: 1,100,000 slots (Batch: 1,086,956 slots/s / Avg: 1,068,999 slots/s)\n", - "19:15:21 INFO loader :: Index SPO->POS: 1,200,000 slots (Batch: 840,336 slots/s / Avg: 1,045,296 slots/s)\n", - "19:15:21 INFO loader :: Index SPO->POS: 1,300,000 slots (Batch: 970,873 slots/s / Avg: 1,039,168 slots/s)\n", - "19:15:21 INFO loader :: Index SPO->POS: 1,400,000 slots (Batch: 970,873 slots/s / Avg: 1,033,973 slots/s)\n", - "19:15:21 INFO loader :: Index SPO->POS: 1,500,000 slots (Batch: 952,380 slots/s / Avg: 1,028,101 slots/s)\n", - "19:15:21 INFO loader :: Index SPO->POS: 1,600,000 slots (Batch: 826,446 slots/s / Avg: 1,012,658 slots/s)\n", - "19:15:21 INFO loader :: Index SPO->POS: 1,700,000 slots (Batch: 1,030,927 slots/s / Avg: 1,013,714 slots/s)\n", - "19:15:21 INFO loader :: Index SPO->POS: 1,800,000 slots (Batch: 1,010,101 slots/s / Avg: 1,013,513 slots/s)\n", - "19:15:21 INFO loader :: Index SPO->POS: 1,900,000 slots (Batch: 1,010,101 slots/s / Avg: 1,013,333 slots/s)\n", - 
"19:15:22 INFO loader :: Index SPO->POS: 2,000,000 slots (Batch: 781,250 slots/s / Avg: 998,502 slots/s)\n", - "19:15:22 INFO loader :: Elapsed: 50.27 seconds [2024/03/05 19:15:22 UTC]\n", - "19:15:22 INFO loader :: Index SPO->POS: 2,100,000 slots (Batch: 961,538 slots/s / Avg: 996,677 slots/s)\n", - "19:15:22 INFO loader :: Index SPO->POS: 2,200,000 slots (Batch: 970,873 slots/s / Avg: 995,475 slots/s)\n", - "19:15:22 INFO loader :: Index SPO->POS: 2,300,000 slots (Batch: 943,396 slots/s / Avg: 993,091 slots/s)\n", - "19:15:22 INFO loader :: Index SPO->POS: 2,400,000 slots (Batch: 740,740 slots/s / Avg: 979,192 slots/s)\n", - "19:15:22 INFO loader :: Index SPO->POS: 2,500,000 slots (Batch: 892,857 slots/s / Avg: 975,419 slots/s)\n", - "19:15:22 INFO loader :: Index SPO->POS: 2,600,000 slots (Batch: 877,192 slots/s / Avg: 971,236 slots/s)\n", - "19:15:22 INFO loader :: Index SPO->POS: 2,700,000 slots (Batch: 917,431 slots/s / Avg: 969,131 slots/s)\n", - "19:15:22 INFO loader :: Index SPO->POS: 2,800,000 slots (Batch: 781,250 slots/s / Avg: 960,878 slots/s)\n", - "19:15:23 INFO loader :: Index SPO->POS: 2,900,000 slots (Batch: 877,192 slots/s / Avg: 957,727 slots/s)\n", - "19:15:23 INFO loader :: Index SPO->POS: 3,000,000 slots (Batch: 847,457 slots/s / Avg: 953,591 slots/s)\n", - "19:15:23 INFO loader :: Elapsed: 51.41 seconds [2024/03/05 19:15:23 UTC]\n", - "19:15:23 INFO loader :: Index SPO->POS: 3,100,000 slots (Batch: 751,879 slots/s / Avg: 945,410 slots/s)\n", - "19:15:23 INFO loader :: Index SPO->POS: 3,200,000 slots (Batch: 884,955 slots/s / Avg: 943,396 slots/s)\n", - "19:15:23 INFO loader :: Index SPO->POS: 3,300,000 slots (Batch: 847,457 slots/s / Avg: 940,170 slots/s)\n", - "19:15:23 INFO loader :: Index SPO->POS: 3,400,000 slots (Batch: 877,192 slots/s / Avg: 938,189 slots/s)\n", - "19:15:23 INFO loader :: Index SPO->POS: 3,500,000 slots (Batch: 704,225 slots/s / Avg: 929,368 slots/s)\n", - "19:15:23 INFO loader :: Index SPO->POS: 3,600,000 slots 
(Batch: 819,672 slots/s / Avg: 925,925 slots/s)\n", - "19:15:24 INFO loader :: Index SPO->POS: 3,700,000 slots (Batch: 806,451 slots/s / Avg: 922,233 slots/s)\n", - "19:15:24 INFO loader :: Index SPO->POS: 3,800,000 slots (Batch: 847,457 slots/s / Avg: 920,096 slots/s)\n", - "19:15:24 INFO loader :: Index SPO->POS: 3,900,000 slots (Batch: 714,285 slots/s / Avg: 913,348 slots/s)\n", - "19:15:24 INFO loader :: Index SPO->POS: 4,000,000 slots (Batch: 840,336 slots/s / Avg: 911,369 slots/s)\n", - "19:15:24 INFO loader :: Elapsed: 52.66 seconds [2024/03/05 19:15:24 UTC]\n", - "19:15:24 INFO loader :: Index SPO->POS: 4,100,000 slots (Batch: 847,457 slots/s / Avg: 909,696 slots/s)\n", - "19:15:24 INFO loader :: Index SPO->POS: 4,200,000 slots (Batch: 847,457 slots/s / Avg: 908,108 slots/s)\n", - "19:15:24 INFO loader :: Index SPO->POS: 4,300,000 slots (Batch: 709,219 slots/s / Avg: 902,224 slots/s)\n", - "19:15:24 INFO loader :: Index SPO->POS: 4,400,000 slots (Batch: 854,700 slots/s / Avg: 901,085 slots/s)\n", - "19:15:25 INFO loader :: Index SPO->POS: 4,500,000 slots (Batch: 840,336 slots/s / Avg: 899,640 slots/s)\n", - "19:15:25 INFO loader :: Index SPO->POS: 4,600,000 slots (Batch: 813,008 slots/s / Avg: 897,560 slots/s)\n", - "19:15:25 INFO loader :: Index SPO->POS: 4,700,000 slots (Batch: 724,637 slots/s / Avg: 893,026 slots/s)\n", - "19:15:25 INFO loader :: Index SPO->POS: 4,800,000 slots (Batch: 833,333 slots/s / Avg: 891,696 slots/s)\n", - "19:15:25 INFO loader :: Index SPO->POS: 4,900,000 slots (Batch: 833,333 slots/s / Avg: 890,423 slots/s)\n", - "19:15:25 INFO loader :: Index SPO->POS: 5,000,000 slots (Batch: 781,250 slots/s / Avg: 887,941 slots/s)\n", - "19:15:25 INFO loader :: Elapsed: 53.90 seconds [2024/03/05 19:15:25 UTC]\n", - "19:15:25 INFO loader :: Index SPO->POS: 5,100,000 slots (Batch: 694,444 slots/s / Avg: 883,116 slots/s)\n", - "19:15:25 INFO loader :: Index SPO->POS: 5,200,000 slots (Batch: 763,358 slots/s / Avg: 880,460 slots/s)\n", - "19:15:26 
INFO loader :: Index SPO->POS: 5,300,000 slots (Batch: 757,575 slots/s / Avg: 877,774 slots/s)\n", - "19:15:26 INFO loader :: Index SPO->POS: 5,400,000 slots (Batch: 746,268 slots/s / Avg: 874,918 slots/s)\n", - "19:15:26 INFO loader :: Index SPO->POS: 5,500,000 slots (Batch: 704,225 slots/s / Avg: 871,080 slots/s)\n", - "19:15:26 INFO loader :: Index SPO->POS: 5,600,000 slots (Batch: 793,650 slots/s / Avg: 869,565 slots/s)\n", - "19:15:26 INFO loader :: Index SPO->POS: 5,700,000 slots (Batch: 787,401 slots/s / Avg: 867,976 slots/s)\n", - "19:15:26 INFO loader :: Index SPO->POS: 5,800,000 slots (Batch: 800,000 slots/s / Avg: 866,706 slots/s)\n", - "19:15:26 INFO loader :: Index SPO->POS: 5,900,000 slots (Batch: 787,401 slots/s / Avg: 865,229 slots/s)\n", - "19:15:27 INFO loader :: Index SPO->POS: 6,000,000 slots (Batch: 813,008 slots/s / Avg: 864,304 slots/s)\n", - "19:15:27 INFO loader :: Elapsed: 55.21 seconds [2024/03/05 19:15:27 UTC]\n", - "19:15:27 INFO loader :: Index SPO->POS: 6,100,000 slots (Batch: 769,230 slots/s / Avg: 862,556 slots/s)\n", - "19:15:27 INFO loader :: Index SPO->POS: 6,200,000 slots (Batch: 787,401 slots/s / Avg: 861,230 slots/s)\n", - "19:15:27 INFO loader :: Index SPO->POS: 6,300,000 slots (Batch: 781,250 slots/s / Avg: 859,833 slots/s)\n", - "19:15:27 INFO loader :: Index SPO->POS: 6,400,000 slots (Batch: 806,451 slots/s / Avg: 858,945 slots/s)\n", - "19:15:27 INFO loader :: Index SPO->POS: 6,500,000 slots (Batch: 819,672 slots/s / Avg: 858,312 slots/s)\n", - "19:15:27 INFO loader :: ** Index SPO->POS: 6,546,004 slots indexed in 7.63 seconds [Rate: 858,042.19 per second]\n", - "19:15:27 INFO loader :: Index SPO->OSP: 100,000 slots (Batch: 1,923,076 slots/s / Avg: 1,923,076 slots/s)\n", - "19:15:27 INFO loader :: Index SPO->OSP: 200,000 slots (Batch: 1,538,461 slots/s / Avg: 1,709,401 slots/s)\n", - "19:15:27 INFO loader :: Index SPO->OSP: 300,000 slots (Batch: 1,538,461 slots/s / Avg: 1,648,351 slots/s)\n", - "19:15:27 INFO loader :: 
Index SPO->OSP: 400,000 slots (Batch: 1,449,275 slots/s / Avg: 1,593,625 slots/s)\n", - "19:15:28 INFO loader :: Index SPO->OSP: 500,000 slots (Batch: 1,351,351 slots/s / Avg: 1,538,461 slots/s)\n", - "19:15:28 INFO loader :: Index SPO->OSP: 600,000 slots (Batch: 1,250,000 slots/s / Avg: 1,481,481 slots/s)\n", - "19:15:28 INFO loader :: Index SPO->OSP: 700,000 slots (Batch: 1,219,512 slots/s / Avg: 1,437,371 slots/s)\n", - "19:15:28 INFO loader :: Index SPO->OSP: 800,000 slots (Batch: 1,063,829 slots/s / Avg: 1,376,936 slots/s)\n", - "19:15:28 INFO loader :: Index SPO->OSP: 900,000 slots (Batch: 1,075,268 slots/s / Avg: 1,335,311 slots/s)\n", - "19:15:28 INFO loader :: Index SPO->OSP: 1,000,000 slots (Batch: 1,075,268 slots/s / Avg: 1,303,780 slots/s)\n", - "19:15:28 INFO loader :: Elapsed: 56.66 seconds [2024/03/05 19:15:28 UTC]\n", - "19:15:28 INFO loader :: Index SPO->OSP: 1,100,000 slots (Batch: 1,098,901 slots/s / Avg: 1,282,051 slots/s)\n", - "19:15:28 INFO loader :: Index SPO->OSP: 1,200,000 slots (Batch: 1,111,111 slots/s / Avg: 1,265,822 slots/s)\n", - "19:15:28 INFO loader :: Index SPO->OSP: 1,300,000 slots (Batch: 1,136,363 slots/s / Avg: 1,254,826 slots/s)\n", - "19:15:28 INFO loader :: Index SPO->OSP: 1,400,000 slots (Batch: 1,030,927 slots/s / Avg: 1,235,657 slots/s)\n", - "19:15:28 INFO loader :: Index SPO->OSP: 1,500,000 slots (Batch: 1,086,956 slots/s / Avg: 1,224,489 slots/s)\n", - "19:15:29 INFO loader :: Index SPO->OSP: 1,600,000 slots (Batch: 1,086,956 slots/s / Avg: 1,214,882 slots/s)\n", - "19:15:29 INFO loader :: Index SPO->OSP: 1,700,000 slots (Batch: 1,000,000 slots/s / Avg: 1,199,717 slots/s)\n", - "19:15:29 INFO loader :: Index SPO->OSP: 1,800,000 slots (Batch: 1,030,927 slots/s / Avg: 1,188,903 slots/s)\n", - "19:15:29 INFO loader :: Index SPO->OSP: 1,900,000 slots (Batch: 1,010,101 slots/s / Avg: 1,177,929 slots/s)\n", - "19:15:29 INFO loader :: Index SPO->OSP: 2,000,000 slots (Batch: 990,099 slots/s / Avg: 1,166,861 slots/s)\n", - 
"19:15:29 INFO loader :: Elapsed: 57.61 seconds [2024/03/05 19:15:29 UTC]\n", - "19:15:29 INFO loader :: Index SPO->OSP: 2,100,000 slots (Batch: 1,030,927 slots/s / Avg: 1,159,580 slots/s)\n", - "19:15:29 INFO loader :: Index SPO->OSP: 2,200,000 slots (Batch: 1,010,101 slots/s / Avg: 1,151,832 slots/s)\n", - "19:15:29 INFO loader :: Index SPO->OSP: 2,300,000 slots (Batch: 1,010,101 slots/s / Avg: 1,144,848 slots/s)\n", - "19:15:29 INFO loader :: Index SPO->OSP: 2,400,000 slots (Batch: 990,099 slots/s / Avg: 1,137,440 slots/s)\n", - "19:15:29 INFO loader :: Index SPO->OSP: 2,500,000 slots (Batch: 990,099 slots/s / Avg: 1,130,710 slots/s)\n", - "19:15:30 INFO loader :: Index SPO->OSP: 2,600,000 slots (Batch: 1,030,927 slots/s / Avg: 1,126,516 slots/s)\n", - "19:15:30 INFO loader :: Index SPO->OSP: 2,700,000 slots (Batch: 970,873 slots/s / Avg: 1,119,867 slots/s)\n", - "19:15:30 INFO loader :: Index SPO->OSP: 2,800,000 slots (Batch: 961,538 slots/s / Avg: 1,113,320 slots/s)\n", - "19:15:30 INFO loader :: Index SPO->OSP: 2,900,000 slots (Batch: 961,538 slots/s / Avg: 1,107,292 slots/s)\n", - "19:15:30 INFO loader :: Index SPO->OSP: 3,000,000 slots (Batch: 1,010,101 slots/s / Avg: 1,103,752 slots/s)\n", - "19:15:30 INFO loader :: Elapsed: 58.62 seconds [2024/03/05 19:15:30 UTC]\n", - "19:15:30 INFO loader :: Index SPO->OSP: 3,100,000 slots (Batch: 1,010,101 slots/s / Avg: 1,100,461 slots/s)\n", - "19:15:30 INFO loader :: Index SPO->OSP: 3,200,000 slots (Batch: 952,380 slots/s / Avg: 1,095,140 slots/s)\n", - "19:15:30 INFO loader :: Index SPO->OSP: 3,300,000 slots (Batch: 961,538 slots/s / Avg: 1,090,548 slots/s)\n", - "19:15:30 INFO loader :: Index SPO->OSP: 3,400,000 slots (Batch: 952,380 slots/s / Avg: 1,085,915 slots/s)\n", - "19:15:30 INFO loader :: Index SPO->OSP: 3,500,000 slots (Batch: 1,010,101 slots/s / Avg: 1,083,591 slots/s)\n", - "19:15:31 INFO loader :: Index SPO->OSP: 3,600,000 slots (Batch: 990,099 slots/s / Avg: 1,080,756 slots/s)\n", - "19:15:31 INFO 
loader :: Index SPO->OSP: 3,700,000 slots (Batch: 934,579 slots/s / Avg: 1,076,207 slots/s)\n", - "19:15:31 INFO loader :: Index SPO->OSP: 3,800,000 slots (Batch: 952,380 slots/s / Avg: 1,072,537 slots/s)\n", - "19:15:31 INFO loader :: Index SPO->OSP: 3,900,000 slots (Batch: 934,579 slots/s / Avg: 1,068,493 slots/s)\n", - "19:15:31 INFO loader :: Index SPO->OSP: 4,000,000 slots (Batch: 970,873 slots/s / Avg: 1,065,814 slots/s)\n", - "19:15:31 INFO loader :: Elapsed: 59.65 seconds [2024/03/05 19:15:31 UTC]\n", - "19:15:31 INFO loader :: Index SPO->OSP: 4,100,000 slots (Batch: 970,873 slots/s / Avg: 1,063,278 slots/s)\n", - "19:15:31 INFO loader :: Index SPO->OSP: 4,200,000 slots (Batch: 952,380 slots/s / Avg: 1,060,338 slots/s)\n", - "19:15:31 INFO loader :: Index SPO->OSP: 4,300,000 slots (Batch: 917,431 slots/s / Avg: 1,056,511 slots/s)\n", - "19:15:31 INFO loader :: Index SPO->OSP: 4,400,000 slots (Batch: 952,380 slots/s / Avg: 1,053,892 slots/s)\n", - "19:15:31 INFO loader :: Index SPO->OSP: 4,500,000 slots (Batch: 970,873 slots/s / Avg: 1,051,893 slots/s)\n", - "19:15:32 INFO loader :: Index SPO->OSP: 4,600,000 slots (Batch: 909,090 slots/s / Avg: 1,048,313 slots/s)\n", - "19:15:32 INFO loader :: Index SPO->OSP: 4,700,000 slots (Batch: 909,090 slots/s / Avg: 1,044,908 slots/s)\n", - "19:15:32 INFO loader :: Index SPO->OSP: 4,800,000 slots (Batch: 909,090 slots/s / Avg: 1,041,666 slots/s)\n", - "19:15:32 INFO loader :: Index SPO->OSP: 4,900,000 slots (Batch: 917,431 slots/s / Avg: 1,038,795 slots/s)\n", - "19:15:32 INFO loader :: Index SPO->OSP: 5,000,000 slots (Batch: 877,192 slots/s / Avg: 1,034,982 slots/s)\n", - "19:15:32 INFO loader :: Elapsed: 60.73 seconds [2024/03/05 19:15:32 UTC]\n", - "19:15:32 INFO loader :: Index SPO->OSP: 5,100,000 slots (Batch: 869,565 slots/s / Avg: 1,031,136 slots/s)\n", - "19:15:32 INFO loader :: Index SPO->OSP: 5,200,000 slots (Batch: 909,090 slots/s / Avg: 1,028,481 slots/s)\n", - "19:15:32 INFO loader :: Index SPO->OSP: 
5,300,000 slots (Batch: 869,565 slots/s / Avg: 1,024,946 slots/s)\n", - "19:15:32 INFO loader :: Index SPO->OSP: 5,400,000 slots (Batch: 854,700 slots/s / Avg: 1,021,180 slots/s)\n", - "19:15:33 INFO loader :: Index SPO->OSP: 5,500,000 slots (Batch: 854,700 slots/s / Avg: 1,017,576 slots/s)\n", - "19:15:33 INFO loader :: Index SPO->OSP: 5,600,000 slots (Batch: 854,700 slots/s / Avg: 1,014,125 slots/s)\n", - "19:15:33 INFO loader :: Index SPO->OSP: 5,700,000 slots (Batch: 862,068 slots/s / Avg: 1,010,996 slots/s)\n", - "19:15:33 INFO loader :: Index SPO->OSP: 5,800,000 slots (Batch: 934,579 slots/s / Avg: 1,009,573 slots/s)\n", - "19:15:33 INFO loader :: Index SPO->OSP: 5,900,000 slots (Batch: 892,857 slots/s / Avg: 1,007,341 slots/s)\n", - "19:15:33 INFO loader :: Index SPO->OSP: 6,000,000 slots (Batch: 892,857 slots/s / Avg: 1,005,193 slots/s)\n", - "19:15:33 INFO loader :: Elapsed: 61.87 seconds [2024/03/05 19:15:33 UTC]\n", - "19:15:33 INFO loader :: Index SPO->OSP: 6,100,000 slots (Batch: 884,955 slots/s / Avg: 1,002,959 slots/s)\n", - "19:15:33 INFO loader :: Index SPO->OSP: 6,200,000 slots (Batch: 900,900 slots/s / Avg: 1,001,130 slots/s)\n", - "19:15:34 INFO loader :: Index SPO->OSP: 6,300,000 slots (Batch: 884,955 slots/s / Avg: 999,048 slots/s)\n", - "19:15:34 INFO loader :: Index SPO->OSP: 6,400,000 slots (Batch: 892,857 slots/s / Avg: 997,195 slots/s)\n", - "19:15:34 INFO loader :: Index SPO->OSP: 6,500,000 slots (Batch: 892,857 slots/s / Avg: 995,405 slots/s)\n", - "19:15:34 INFO loader :: ** Index SPO->OSP: 6,546,004 slots indexed in 6.58 seconds [Rate: 994,833.44 per second]\n", - "19:15:34 INFO loader :: -- Finish triples index phase\n", - "19:15:34 INFO loader :: ** 6,546,004 triples indexed in 14.21 seconds [Rate: 460,629.38 per second]\n", - "19:15:34 INFO loader :: -- Finish triples load\n", - "19:15:34 INFO loader :: ** Completed: 6,546,004 triples loaded in 62.48 seconds [Rate: 104,774.62 per second]\n", - "19:15:34 INFO loader :: -- Finish 
quads load\n" - ] - } - ], + "metadata": {}, + "outputs": [], "source": [ "!docker compose run fuseki ./apache-jena-4.9.0/bin/tdbloader --loc /fuseki-base/nmdc-db.tdb /fuseki-base/nmdc-db.nt.gz" ] }, { "cell_type": "code", - "execution_count": 57, + "execution_count": null, "id": "1a0bfb4b-e694-40b1-88af-4446e3fcc888", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[1A\u001b[1B\u001b[0G\u001b[?25l[+] Running 1/0\n", - " \u001b[32m✔\u001b[0m Container fuseki \u001b[32mCreated\u001b[0m \u001b[34m0.0s \u001b[0m\n", - "\u001b[?25h\u001b[1A\u001b[1A\u001b[0G\u001b[?25l\u001b[34m[+] Running 1/1\u001b[0m\n", - " \u001b[32m✔\u001b[0m Container fuseki \u001b[32mStarted\u001b[0m \u001b[34m0.0s \u001b[0m\n", - "\u001b[?25h" - ] - } - ], + "outputs": [], "source": [ "!docker compose up fuseki -d" ] From 7ad4e90afc91152a9247ff460394b4f6f65ec7d3 Mon Sep 17 00:00:00 2001 From: Donny Winston Date: Wed, 6 Mar 2024 11:49:35 -0500 Subject: [PATCH 04/18] feat: provide for reasoning during graph search --- .../notebooks/ghissue_401_sparql.ipynb | 161 ++++++++++++++++-- nmdc_runtime/site/fuseki/fuseki-config.ttl | 82 ++++++--- 2 files changed, 201 insertions(+), 42 deletions(-) diff --git a/metadata-translation/notebooks/ghissue_401_sparql.ipynb b/metadata-translation/notebooks/ghissue_401_sparql.ipynb index fbe1738a..9b21487b 100644 --- a/metadata-translation/notebooks/ghissue_401_sparql.ipynb +++ b/metadata-translation/notebooks/ghissue_401_sparql.ipynb @@ -5,7 +5,7 @@ "id": "51ea05af-7579-43ad-aa9c-3bf8b6da8fdb", "metadata": {}, "source": [ - "Pipeline to transform the set of nmdc-schema-compliant mongodb collections to an RDF dataset amenable to SPARQL queries." + "# Pipeline to transform the set of nmdc-schema-compliant mongodb collections to an RDF dataset amenable to SPARQL queries." 
] }, { @@ -122,6 +122,14 @@ "pprint(context)" ] }, + { + "cell_type": "markdown", + "id": "0800c5b9-d09e-4be1-899d-62fcf40a2c0e", + "metadata": {}, + "source": [ + "Ensure `nmdc:type` has a `URIRef` range, i.e. `nmdc:type a owl:ObjectProperty`." + ] + }, { "cell_type": "code", "execution_count": null, @@ -132,6 +140,14 @@ "context['type'] = {'@type': '@id'}" ] }, + { + "cell_type": "markdown", + "id": "63fe4d54-0a41-4170-9310-45e5f47a6cb5", + "metadata": {}, + "source": [ + "Initialize an in-memory graph to store triples, prior to serializing to disk." + ] + }, { "cell_type": "code", "execution_count": null, @@ -144,6 +160,14 @@ "g = Graph()" ] }, + { + "cell_type": "markdown", + "id": "05cb8fd0-b847-49fc-a472-a8df2426168a", + "metadata": {}, + "source": [ + "Define a helper function to speed up triplification process." + ] + }, { "cell_type": "code", "execution_count": null, @@ -162,6 +186,14 @@ " yield seq[i : i + n]" ] }, + { + "cell_type": "markdown", + "id": "dfd91d37-b1c7-46ab-b30d-de80132ec091", + "metadata": {}, + "source": [ + "Use `rdflib` JSON-LD parsing to ingest mongo docs to in-memory graph." + ] + }, { "cell_type": "code", "execution_count": null, @@ -184,17 +216,16 @@ " for chunk in chunks:\n", " doc_jsonld = {\"@context\": context, \"@graph\": chunk}\n", " g.parse(data=json.dumps(doc_jsonld), format='json-ld')\n", - " pbar.update(1)" + " pbar.update(1)\n", + "print(f\"{len(g):,} triples loaded\")" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "d25e85c4-b695-4a03-beaf-f34f5e73b66d", + "cell_type": "markdown", + "id": "7140ef42-f94c-45c5-a0c1-31b05718aa4f", "metadata": {}, - "outputs": [], "source": [ - "print(f\"{len(g):,}\")" + "Correct crazy URIs that end with newlines, which messes up graph serialization." 
] }, { @@ -222,6 +253,75 @@ " g.add((s, p, URIRef(o_str_fixed)))" ] }, + { + "cell_type": "markdown", + "id": "71893efc-8e19-465e-a33d-3fe6ee475e05", + "metadata": {}, + "source": [ + "Now, we want to add OWL axioms to support fetching all \"top-level\" schema collection objects connected to a given schema collection object. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d402b739-4ab8-4d93-b00f-76f677313c66", + "metadata": {}, + "outputs": [], + "source": [ + "from linkml_runtime.utils.schemaview import SchemaView\n", + "\n", + "from nmdc_runtime.util import nmdc_schema_view, nmdc_database_collection_instance_class_names\n", + "\n", + "schema_view = nmdc_schema_view()\n", + "slots = schema_view.all_slots()\n", + "\n", + "collection_instance_class_names = nmdc_database_collection_instance_class_names()\n", + "\n", + "toplevel_object_connectors = set()\n", + "for k, v in context.items():\n", + " if isinstance(v, dict) and \"@type\" in v and v[\"@type\"] == \"@id\":\n", + " if slots[k].range in toplevel_objects and slots[k].domain != \"Database\":\n", + " toplevel_object_connectors.add(k)\n", + "print(toplevel_object_connectors)" + ] + }, + { + "cell_type": "markdown", + "id": "63cb2cc8-ef99-4d5f-9ddf-9eb2949e9c06", + "metadata": {}, + "source": [ + "Let's invent a symmetric, transitive property so that an OWL reasoner connected to our triplestore can help us traverse the graph without needing to know any specific property names." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cc830d77-5ac2-482e-a4f9-dc2eed3f2ef9", + "metadata": {}, + "outputs": [], + "source": [ + "from rdflib import PROV, RDFS, RDF, OWL\n", + "\n", + "superprop = URIRef(\"https://api.microbiomedata.org/fuseki/#connected\")\n", + "g.add((superprop, RDF.type, OWL.SymmetricProperty))\n", + "g.add((superprop, RDF.type, OWL.TransitiveProperty))\n", + "\n", + "\n", + "for suffix in toplevel_object_connectors:\n", + " prop = URIRef(\"https://w3id.org/nmdc/\" + suffix)\n", + " g.add((prop, RDFS.subClassOf, superprop))\n", + "\n", + "print(f\"{len(g):,} triples in total\")" + ] + }, + { + "cell_type": "markdown", + "id": "91171cf6-f435-4815-970f-a67f51254997", + "metadata": {}, + "source": [ + "Serialize and store as gzipped N-Triples file." + ] + }, { "cell_type": "code", "execution_count": null, @@ -250,17 +350,16 @@ "metadata": {}, "outputs": [], "source": [ - "!docker volume rm nmdc-runtime_nmdc_runtime_fuseki_data" + "!docker compose up fuseki -d\n", + "!docker exec fuseki rm -rf /fuseki-base/nmdc-db.tdb" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "8fa9f685-e981-48ab-a434-57057fd39b8b", + "cell_type": "markdown", + "id": "79284de7-ef52-47c6-aeb1-1453bd4b5f59", "metadata": {}, - "outputs": [], "source": [ - "!docker compose up fuseki -d" + "Ensure data is present to load." ] }, { @@ -273,6 +372,14 @@ "!docker cp data/nmdc-db.nt.gz fuseki:/fuseki-base/" ] }, + { + "cell_type": "markdown", + "id": "4dca86f8-6752-4aba-8d3c-656810f3af3f", + "metadata": {}, + "source": [ + "Take server down in order to bulk-load data." + ] + }, { "cell_type": "code", "execution_count": null, @@ -283,16 +390,34 @@ "!docker compose down fuseki" ] }, + { + "cell_type": "markdown", + "id": "fa4f9843-d5c0-4f8d-bcaf-ad2cf50c0264", + "metadata": {}, + "source": [ + "Bulk-load data." 
+ ] + }, { "cell_type": "code", "execution_count": null, "id": "a490caff-af8a-4537-8c0b-e4a4752645bc", - "metadata": {}, + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ "!docker compose run fuseki ./apache-jena-4.9.0/bin/tdbloader --loc /fuseki-base/nmdc-db.tdb /fuseki-base/nmdc-db.nt.gz" ] }, + { + "cell_type": "markdown", + "id": "69d0e50c-102a-4a8e-9bcd-ef23600afd66", + "metadata": {}, + "source": [ + "Start up server." + ] + }, { "cell_type": "code", "execution_count": null, @@ -302,6 +427,14 @@ "source": [ "!docker compose up fuseki -d" ] + }, + { + "cell_type": "markdown", + "id": "8e528d6a-76b1-4629-82a1-58793ad6a481", + "metadata": {}, + "source": [ + "Now go to and SPARQL it up." + ] } ], "metadata": { diff --git a/nmdc_runtime/site/fuseki/fuseki-config.ttl b/nmdc_runtime/site/fuseki/fuseki-config.ttl index 49ad627f..1ea9ad41 100644 --- a/nmdc_runtime/site/fuseki/fuseki-config.ttl +++ b/nmdc_runtime/site/fuseki/fuseki-config.ttl @@ -1,32 +1,58 @@ -@prefix : <#> . -@prefix fuseki: . -@prefix rdf: . -@prefix rdfs: . -@prefix ja: . -@prefix tdb: . -@prefix afn: . -@prefix nmdc: . +@prefix afn: . +@prefix fuseki: . +@prefix ja: . +@prefix nmdc: . +@prefix owl: . +@prefix rdf: . +@prefix rdfs: . +@prefix tdb: . +@prefix xs: . -[] rdf:type fuseki:Server ; - fuseki:services ( - <#nmdc> - ) . + + a tdb:GraphTDB ; + tdb:dataset ; + . -<#nmdc> rdf:type fuseki:Service ; - fuseki:name "nmdc" ; - fuseki:serviceQuery "query" ; - fuseki:serviceQuery "sparql" ; - fuseki:serviceUpdate "update" ; - fuseki:serviceUpload "upload" ; - fuseki:serviceReadWriteGraphStore "data" ; - fuseki:dataset <#dataset> . + + a ja:RDFDataset ; + ja:defaultGraph ; + . -<#dataset> rdf:type tdb:DatasetTDB ; tdb:location "/fuseki-base/nmdc-db.tdb" . + + a ja:InfModel ; + ja:baseModel ; + ja:reasoner [ + ja:reasonerURL ; + ] ; + . 
+ + + a fuseki:Service ; + fuseki:dataset ; + fuseki:name "nmdc" ; + fuseki:serviceQuery + "query" , + "sparql" + ; + fuseki:serviceReadWriteGraphStore "data" ; + fuseki:serviceUpdate "update" ; + fuseki:serviceUpload "upload" ; + . + + + a tdb:DatasetTDB ; + ja:context [ + rdfs:comment "Query timeout on this dataset: 10s." ; + ja:cxtName "arq:queryTimeout" ; + ja:cxtValue "10000" ; + ] ; + tdb:location "/fuseki-base/nmdc-db.tdb" ; + . + +[] + a fuseki:Server ; + fuseki:services ( + + ) ; + . -#<#dataset> rdf:type ja:RDFDataset ; -# ja:defaultGraph <#model> . -# -#<#model> rdf:type ja:MemoryModel ; -# . -# ja:content [ja:externalContent ] ; -# ja:content [ja:externalContent ] . From 58fd505e60a01ec223ce8efb2d1dea8a7a00c8aa Mon Sep 17 00:00:00 2001 From: Donny Winston Date: Thu, 7 Mar 2024 15:20:24 -0500 Subject: [PATCH 05/18] feat(graph): run-through of ETL with prov:wasInfluencedBy reification --- .../notebooks/ghissue_401_sparql.ipynb | 1872 ++++++++++++++++- 1 file changed, 1825 insertions(+), 47 deletions(-) diff --git a/metadata-translation/notebooks/ghissue_401_sparql.ipynb b/metadata-translation/notebooks/ghissue_401_sparql.ipynb index 9b21487b..ef4fab18 100644 --- a/metadata-translation/notebooks/ghissue_401_sparql.ipynb +++ b/metadata-translation/notebooks/ghissue_401_sparql.ipynb @@ -18,7 +18,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "a362b42f-7ae0-40cf-91d4-8f19ca1087cf", "metadata": {}, "outputs": [], @@ -37,10 +37,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "55932d03-802f-4efe-bceb-e1036cd35567", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MONGO_HOST=localhost:27018\n" + ] + } + ], "source": [ "from dotenv import load_dotenv\n", "\n", @@ -58,7 +66,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "edb1bb42-005c-49ca-ba59-18c24833f93f", "metadata": {}, 
"outputs": [], @@ -78,10 +86,39 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "3a0dd489-74cc-47c4-b3e0-c97dd88f5b5f", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "['biosample_set',\n", + " 'data_object_set',\n", + " 'extraction_set',\n", + " 'field_research_site_set',\n", + " 'library_preparation_set',\n", + " 'mags_activity_set',\n", + " 'metabolomics_analysis_activity_set',\n", + " 'metagenome_annotation_activity_set',\n", + " 'metagenome_assembly_set',\n", + " 'metagenome_sequencing_activity_set',\n", + " 'metaproteomics_analysis_activity_set',\n", + " 'metatranscriptome_activity_set',\n", + " 'nom_analysis_activity_set',\n", + " 'omics_processing_set',\n", + " 'pooling_set',\n", + " 'processed_sample_set',\n", + " 'read_based_taxonomy_analysis_activity_set',\n", + " 'read_qc_analysis_activity_set',\n", + " 'study_set']" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "from nmdc_runtime.util import schema_collection_names_with_id_field\n", "\n", @@ -102,10 +139,1188 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "9ed72826-b552-4429-8ab5-9f7126821822", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'@vocab': 'https://w3id.org/nmdc/',\n", + " 'CATH': 'https://bioregistry.io/cath:',\n", + " 'CHEBI': {'@prefix': True},\n", + " 'CHEMBL.COMPOUND': 'https://bioregistry.io/chembl.compound:',\n", + " 'CHMO': {'@prefix': True},\n", + " 'COG': 'https://bioregistry.io/cog:',\n", + " 'Contaminant': 'http://example.org/contaminant/',\n", + " 'CreditAssociation': {},\n", + " 'DRUGBANK': 'https://bioregistry.io/drugbank:',\n", + " 'EC': 'https://bioregistry.io/eccode:',\n", + " 'EFO': 'http://www.ebi.ac.uk/efo/',\n", + " 'EGGNOG': 'https://bioregistry.io/eggnog:',\n", + " 'ENVO': {'@prefix': True},\n", + " 'FBcv': {'@prefix': True},\n", 
+ " 'FMA': {'@prefix': True},\n", + " 'GO': {'@prefix': True},\n", + " 'HMDB': 'https://bioregistry.io/hmdb:',\n", + " 'ISA': 'http://example.org/isa/',\n", + " 'KEGG.COMPOUND': 'https://bioregistry.io/kegg.compound:',\n", + " 'KEGG.ORTHOLOGY': 'https://bioregistry.io/kegg.orthology:',\n", + " 'KEGG.REACTION': 'https://bioregistry.io/kegg.reaction:',\n", + " 'KEGG_PATHWAY': 'https://bioregistry.io/kegg.pathway:',\n", + " 'MASSIVE': 'https://bioregistry.io/reference/massive:',\n", + " 'MESH': 'https://bioregistry.io/mesh:',\n", + " 'MIXS': 'https://w3id.org/mixs/',\n", + " 'MIXS_yaml': 'https://raw.githubusercontent.com/microbiomedata/mixs/main/model/schema/',\n", + " 'MS': {'@prefix': True},\n", + " 'MetaCyc': 'https://bioregistry.io/metacyc.compound:',\n", + " 'MetaNetX': 'http://example.org/metanetx/',\n", + " 'NCBITaxon': {'@prefix': True},\n", + " 'NCIT': {'@prefix': True},\n", + " 'OBI': {'@prefix': True},\n", + " 'ORCID': 'https://orcid.org/',\n", + " 'PANTHER.FAMILY': 'https://bioregistry.io/panther.family:',\n", + " 'PATO': {'@prefix': True},\n", + " 'PFAM': 'https://bioregistry.io/pfam:',\n", + " 'PO': {'@prefix': True},\n", + " 'PR': {'@prefix': True},\n", + " 'PUBCHEM.COMPOUND': 'https://bioregistry.io/pubchem.compound:',\n", + " 'PlannedProcess': {},\n", + " 'RHEA': 'https://bioregistry.io/rhea:',\n", + " 'RO': {'@prefix': True},\n", + " 'RetroRules': 'http://example.org/retrorules/',\n", + " 'SEED': 'https://bioregistry.io/seed:',\n", + " 'SIO': {'@prefix': True},\n", + " 'SUPFAM': 'https://bioregistry.io/supfam:',\n", + " 'TIGRFAM': 'https://bioregistry.io/tigrfam:',\n", + " 'UBERON': {'@prefix': True},\n", + " 'UO': {'@prefix': True},\n", + " 'UniProtKB': 'https://bioregistry.io/uniprot:',\n", + " 'abs_air_humidity': {'@type': '@id'},\n", + " 'activity_set': {'@type': '@id'},\n", + " 'add_recov_method': {'@type': '@id'},\n", + " 'additional_info': {'@type': '@id'},\n", + " 'address': {'@type': '@id'},\n", + " 'adj_room': {'@type': '@id'},\n", + " 
'aero_struc': {'@type': '@id'},\n", + " 'agrochem_addition': {'@type': '@id'},\n", + " 'air_PM_concen': {'@type': '@id'},\n", + " 'air_temp': {'@type': '@id'},\n", + " 'air_temp_regm': {'@type': '@id'},\n", + " 'al_sat': {'@type': '@id'},\n", + " 'al_sat_meth': {'@type': '@id'},\n", + " 'alkalinity': {'@type': '@id'},\n", + " 'alkalinity_method': {'@type': '@id'},\n", + " 'alkyl_diethers': {'@type': '@id'},\n", + " 'all_proteins': {'@type': '@id'},\n", + " 'alt': {'@type': '@id'},\n", + " 'alternative_identifiers': {'@type': '@id'},\n", + " 'aminopept_act': {'@type': '@id'},\n", + " 'ammonium': {'@type': '@id'},\n", + " 'ammonium_nitrogen': {'@type': '@id'},\n", + " 'amount_light': {'@type': '@id'},\n", + " 'analysis_identifiers': {'@type': '@id'},\n", + " 'analysis_type': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'ances_data': {'@type': '@id'},\n", + " 'annual_precpt': {'@type': '@id'},\n", + " 'annual_temp': {'@type': '@id'},\n", + " 'antibiotic_regm': {'@type': '@id'},\n", + " 'api': {'@type': '@id'},\n", + " 'applied_roles': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'applies_to_person': {'@type': '@id'},\n", + " 'arch_struc': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'aromatics_pc': {'@type': '@id'},\n", + " 'asm_score': {'@type': 'xsd:float'},\n", + " 'asphaltenes_pc': {'@type': '@id'},\n", + " 'associated_dois': {'@type': '@id'},\n", + " 'atmospheric_data': {'@type': '@id'},\n", + " 'avg_dew_point': {'@type': '@id'},\n", + " 'avg_occup': {'@type': '@id'},\n", + " 'avg_temp': {'@type': '@id'},\n", + " 'bac_prod': {'@type': '@id'},\n", + " 'bac_resp': {'@type': '@id'},\n", + " 'bacteria_carb_prod': {'@type': '@id'},\n", + " 'barometric_press': {'@type': 
'@id'},\n", + " 'basin': {'@type': '@id'},\n", + " 'bathroom_count': {'@type': '@id'},\n", + " 'bedroom_count': {'@type': '@id'},\n", + " 'benzene': {'@type': '@id'},\n", + " 'best_protein': {'@type': '@id'},\n", + " 'binned_contig_num': {'@type': 'xsd:integer'},\n", + " 'biochem_oxygen_dem': {'@type': '@id'},\n", + " 'biocide': {'@type': '@id'},\n", + " 'biocide_admin_method': {'@type': '@id'},\n", + " 'biol_stat': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'biolink': 'https://w3id.org/biolink/vocab/',\n", + " 'biomass': {'@type': '@id'},\n", + " 'biomaterial_purity': {'@type': '@id'},\n", + " 'bioproject': 'https://identifiers.org/bioproject:',\n", + " 'biosample': 'https://bioregistry.io/biosample:',\n", + " 'biosample_categories': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'biosample_identifiers': {'@type': '@id'},\n", + " 'biosample_set': {'@type': '@id'},\n", + " 'biotic_regm': {'@type': '@id'},\n", + " 'biotic_relationship': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'bishomohopanol': {'@type': '@id'},\n", + " 'blood_press_diast': {'@type': '@id'},\n", + " 'blood_press_syst': {'@type': '@id'},\n", + " 'bromide': {'@type': '@id'},\n", + " 'build_docs': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'build_occup_type': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'building_setting': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'built_struc_age': {'@type': '@id'},\n", + " 
'built_struc_set': {'@type': '@id'},\n", + " 'built_struc_type': {'@type': '@id'},\n", + " 'bulk_elect_conductivity': {'@type': '@id'},\n", + " 'calcium': {'@type': '@id'},\n", + " 'carb_dioxide': {'@type': '@id'},\n", + " 'carb_monoxide': {'@type': '@id'},\n", + " 'carb_nitro_ratio': {'@type': '@id'},\n", + " 'cas': 'https://bioregistry.io/cas:',\n", + " 'ceil_area': {'@type': '@id'},\n", + " 'ceil_cond': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'ceil_finish_mat': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'ceil_struc': {'@type': '@id'},\n", + " 'ceil_texture': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'ceil_thermal_mass': {'@type': '@id'},\n", + " 'ceil_type': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'ceil_water_mold': {'@type': '@id'},\n", + " 'chem_administration': {'@type': '@id'},\n", + " 'chem_mutagen': {'@type': '@id'},\n", + " 'chem_oxygen_dem': {'@type': '@id'},\n", + " 'chem_treat_method': {},\n", + " 'chem_treatment': {'@type': '@id'},\n", + " 'chemical': {'@type': '@id'},\n", + " 'chimera_check': {'@type': '@id'},\n", + " 'chloride': {'@type': '@id'},\n", + " 'chlorophyll': {'@type': '@id'},\n", + " 'climate_environment': {'@type': '@id'},\n", + " 'collected_from': {'@type': '@id'},\n", + " 'collecting_biosamples_from_site_set': {'@type': '@id'},\n", + " 'collection_date': {'@type': '@id'},\n", + " 'completeness': {'@type': 'xsd:float'},\n", + " 'compound': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'concentration': {'@type': '@id'},\n", + " 'conduc': 
{'@type': '@id'},\n", + " 'contained_in': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'container_size': {'@type': '@id'},\n", + " 'contamination': {'@type': 'xsd:float'},\n", + " 'contig_bp': {'@type': 'xsd:float'},\n", + " 'contigs': {'@type': 'xsd:float'},\n", + " 'cool_syst_id': {'@type': '@id'},\n", + " 'count': {'@type': 'xsd:integer'},\n", + " 'crop_rotation': {'@type': '@id'},\n", + " 'ctg_l50': {'@type': 'xsd:float'},\n", + " 'ctg_l90': {'@type': 'xsd:float'},\n", + " 'ctg_logsum': {'@type': 'xsd:float'},\n", + " 'ctg_max': {'@type': 'xsd:float'},\n", + " 'ctg_n50': {'@type': 'xsd:float'},\n", + " 'ctg_n90': {'@type': 'xsd:float'},\n", + " 'ctg_powsum': {'@type': 'xsd:float'},\n", + " 'cult_root_med': {'@type': '@id'},\n", + " 'cur_land_use': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'cur_vegetation': {'@type': '@id'},\n", + " 'cur_vegetation_meth': {'@type': '@id'},\n", + " 'data_object_set': {'@type': '@id'},\n", + " 'data_object_type': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'date_last_rain': {'@type': '@id'},\n", + " 'dcterms': 'http://purl.org/dc/terms/',\n", + " 'density': {'@type': '@id'},\n", + " 'depos_env': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'depth': {'@type': '@id'},\n", + " 'description': {},\n", + " 'designated_class': {'@type': '@id'},\n", + " 'dew_point': {'@type': '@id'},\n", + " 'diether_lipids': {'@type': '@id'},\n", + " 'display_order': {'@type': 'xsd:integer'},\n", + " 'diss_carb_dioxide': {'@type': '@id'},\n", + " 'diss_hydrogen': {'@type': '@id'},\n", + " 'diss_inorg_carb': {'@type': '@id'},\n", + " 'diss_inorg_nitro': {'@type': 
'@id'},\n", + " 'diss_inorg_phosp': {'@type': '@id'},\n", + " 'diss_iron': {'@type': '@id'},\n", + " 'diss_org_carb': {'@type': '@id'},\n", + " 'diss_org_nitro': {'@type': '@id'},\n", + " 'diss_oxygen': {'@type': '@id'},\n", + " 'diss_oxygen_fluid': {'@type': '@id'},\n", + " 'dna_absorb1': {'@type': 'xsd:float'},\n", + " 'dna_absorb2': {'@type': 'xsd:float'},\n", + " 'dna_concentration': {'@type': 'xsd:float'},\n", + " 'dna_cont_type': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'dna_dnase': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'dna_sample_format': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'dna_volume': {'@type': 'xsd:float'},\n", + " 'dnase_rna': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'doi': 'https://bioregistry.io/doi:',\n", + " 'doi_category': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'doi_provider': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'doi_value': {'@type': '@id'},\n", + " 'door_comp_type': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'door_cond': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'door_direct': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'door_loc': 
{'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'door_mat': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'door_move': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'door_size': {'@type': '@id'},\n", + " 'door_type': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'door_type_metal': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'door_type_wood': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'door_water_mold': {'@type': '@id'},\n", + " 'down_par': {'@type': '@id'},\n", + " 'drainage_class': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'drawings': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'duration': {'@type': '@id'},\n", + " 'edam.data': {'@prefix': True},\n", + " 'efficiency_percent': {'@type': '@id'},\n", + " 'elev': {'@type': '@id'},\n", + " 'elevator': {'@type': '@id'},\n", + " 'email': {},\n", + " 'embargoed': {'@type': 'xsd:boolean'},\n", + " 'emsl': 'http://example.org/emsl_in_mongodb/',\n", + " 'emsl.project': 'https://bioregistry.io/emsl.project:',\n", + " 'emsl_biosample_identifiers': {'@type': '@id'},\n", + " 'emsl_project_identifiers': {'@type': '@id'},\n", + " 'emsl_uuid_like': 'http://example.org/emsl_uuid_like/',\n", + " 'emulsions': {'@type': '@id'},\n", + " 'encodes': {'@type': 
'@id'},\n", + " 'end': {'@type': 'xsd:integer'},\n", + " 'env_broad_scale': {'@type': '@id'},\n", + " 'env_local_scale': {'@type': '@id'},\n", + " 'env_medium': {'@type': '@id'},\n", + " 'env_package': {'@type': '@id'},\n", + " 'escalator': {'@type': '@id'},\n", + " 'ethylbenzene': {'@type': '@id'},\n", + " 'exp_duct': {'@type': '@id'},\n", + " 'exp_pipe': {'@type': '@id'},\n", + " 'experimental_factor': {'@type': '@id'},\n", + " 'ext_door': {'@type': '@id'},\n", + " 'ext_wall_orient': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'ext_window_orient': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'external_database_identifiers': {'@type': '@id'},\n", + " 'extractant': {'@type': '@id'},\n", + " 'extraction_method': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'extraction_set': {'@type': '@id'},\n", + " 'extraction_target': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'extreme_event': {'@type': '@id'},\n", + " 'fao_class': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'fertilizer_regm': {'@type': '@id'},\n", + " 'field': {'@type': '@id'},\n", + " 'field_research_site_set': {'@type': '@id'},\n", + " 'file_size_bytes': {'@type': 'xsd:long'},\n", + " 'filter_pore_size': {'@type': '@id'},\n", + " 'filter_type': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'fire': {'@type': '@id'},\n", + " 'fireplace_type': {'@type': '@id'},\n", + " 'flooding': {'@type': '@id'},\n", + " 'floor_age': {'@type': 
'@id'},\n", + " 'floor_area': {'@type': '@id'},\n", + " 'floor_cond': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'floor_count': {'@type': '@id'},\n", + " 'floor_finish_mat': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'floor_struc': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'floor_thermal_mass': {'@type': '@id'},\n", + " 'floor_water_mold': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'fluor': {'@type': '@id'},\n", + " 'freq_clean': {'@type': '@id'},\n", + " 'freq_cook': {'@type': '@id'},\n", + " 'functional_annotation_agg': {'@type': '@id'},\n", + " 'functional_annotation_set': {'@type': '@id'},\n", + " 'fungicide_regm': {'@type': '@id'},\n", + " 'furniture': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'gap_pct': {'@type': 'xsd:float'},\n", + " 'gaseous_environment': {'@type': '@id'},\n", + " 'gaseous_substances': {'@type': '@id'},\n", + " 'gc_avg': {'@type': 'xsd:float'},\n", + " 'gc_std': {'@type': 'xsd:float'},\n", + " 'gender_restroom': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'gene_count': {'@type': 'xsd:integer'},\n", + " 'gene_function_id': {'@type': '@id'},\n", + " 'generic': 'https://example.org/generic/',\n", + " 'genetic_mod': {'@type': '@id'},\n", + " 'genome_feature_set': {'@type': '@id'},\n", + " 'geo_loc_name': {'@type': '@id'},\n", + " 'gff_coordinate': {'@type': 'xsd:integer'},\n", + " 'glucosidase_act': {'@type': '@id'},\n", + " 'gnps.task': 
'https://bioregistry.io/gnps.task:',\n", + " 'gnps_task_identifiers': {'@type': '@id'},\n", + " 'gold': 'https://bioregistry.io/gold:',\n", + " 'gold_analysis_project_identifiers': {'@type': '@id'},\n", + " 'gold_biosample_identifiers': {'@type': '@id'},\n", + " 'gold_sequencing_project_identifiers': {'@type': '@id'},\n", + " 'gold_study_identifiers': {'@type': '@id'},\n", + " 'gravidity': {'@type': '@id'},\n", + " 'gravity': {'@type': '@id'},\n", + " 'growth_facil': {'@type': '@id'},\n", + " 'growth_habit': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'growth_hormone_regm': {'@type': '@id'},\n", + " 'gtpo': 'http://example.org/gtpo/',\n", + " 'hall_count': {'@type': '@id'},\n", + " 'handidness': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'has_boolean_value': {'@type': 'xsd:boolean'},\n", + " 'has_credit_associations': {'@type': '@id'},\n", + " 'has_failure_categorization': {'@type': '@id'},\n", + " 'has_input': {'@type': '@id'},\n", + " 'has_maximum_numeric_value': {'@type': 'xsd:float'},\n", + " 'has_metabolite_quantifications': {'@type': '@id'},\n", + " 'has_minimum_numeric_value': {'@type': 'xsd:float'},\n", + " 'has_numeric_value': {'@type': 'xsd:float'},\n", + " 'has_output': {'@type': '@id'},\n", + " 'has_part': {'@type': '@id'},\n", + " 'has_peptide_quantifications': {'@type': '@id'},\n", + " 'has_solution_components': {'@type': '@id'},\n", + " 'hc_produced': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'hcr': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'hcr_fw_salinity': {'@type': '@id'},\n", + " 'hcr_geol_age': {'@context': {'@vocab': '@null',\n", + " 'description': 
'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'hcr_pressure': {'@type': '@id'},\n", + " 'hcr_temp': {'@type': '@id'},\n", + " 'heat_cool_type': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'heat_deliv_loc': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'heat_sys_deliv_meth': {},\n", + " 'heat_system_id': {'@type': '@id'},\n", + " 'heavy_metals': {'@type': '@id'},\n", + " 'heavy_metals_meth': {'@type': '@id'},\n", + " 'height_carper_fiber': {'@type': '@id'},\n", + " 'herbicide_regm': {'@type': '@id'},\n", + " 'highest_similarity_score': {'@type': 'xsd:float'},\n", + " 'horizon_meth': {'@type': '@id'},\n", + " 'host_age': {'@type': '@id'},\n", + " 'host_body_habitat': {'@type': '@id'},\n", + " 'host_body_product': {'@type': '@id'},\n", + " 'host_body_site': {'@type': '@id'},\n", + " 'host_body_temp': {'@type': '@id'},\n", + " 'host_color': {'@type': '@id'},\n", + " 'host_common_name': {'@type': '@id'},\n", + " 'host_diet': {'@type': '@id'},\n", + " 'host_disease_stat': {'@type': '@id'},\n", + " 'host_dry_mass': {'@type': '@id'},\n", + " 'host_family_relation': {},\n", + " 'host_genotype': {'@type': '@id'},\n", + " 'host_growth_cond': {'@type': '@id'},\n", + " 'host_height': {'@type': '@id'},\n", + " 'host_last_meal': {'@type': '@id'},\n", + " 'host_length': {'@type': '@id'},\n", + " 'host_life_stage': {'@type': '@id'},\n", + " 'host_phenotype': {'@type': '@id'},\n", + " 'host_sex': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'host_shape': {'@type': '@id'},\n", + " 'host_subject_id': {'@type': '@id'},\n", + " 'host_subspecf_genlin': {},\n", + " 'host_substrate': {'@type': '@id'},\n", + " 'host_symbiont': {},\n", + " 'host_taxid': {'@type': 
'@id'},\n", + " 'host_tot_mass': {'@type': '@id'},\n", + " 'host_wet_mass': {'@type': '@id'},\n", + " 'humidity': {'@type': '@id'},\n", + " 'humidity_regm': {'@type': '@id'},\n", + " 'id': '@id',\n", + " 'igsn': 'https://app.geosamples.org/sample/igsn/',\n", + " 'igsn_biosample_identifiers': {'@type': '@id'},\n", + " 'img.taxon': 'https://bioregistry.io/img.taxon:',\n", + " 'img_identifiers': {'@type': '@id'},\n", + " 'indoor_space': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'indoor_surf': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'indust_eff_percent': {'@type': '@id'},\n", + " 'inorg_particles': {'@type': '@id'},\n", + " 'input_base_count': {'@type': 'xsd:float'},\n", + " 'input_contig_num': {'@type': 'xsd:integer'},\n", + " 'input_mass': {'@type': '@id'},\n", + " 'input_read_bases': {'@type': 'xsd:float'},\n", + " 'input_read_count': {'@type': 'xsd:float'},\n", + " 'input_volume': {'@type': '@id'},\n", + " 'insdc_analysis_identifiers': {'@type': '@id'},\n", + " 'insdc_bioproject_identifiers': {'@type': '@id'},\n", + " 'insdc_biosample_identifiers': {'@type': '@id'},\n", + " 'insdc_experiment_identifiers': {'@type': '@id'},\n", + " 'insdc_secondary_sample_identifiers': {'@type': '@id'},\n", + " 'insdc_sra_ena_study_identifiers': {'@type': '@id'},\n", + " 'inside_lux': {'@type': '@id'},\n", + " 'int_wall_cond': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'is_balanced': {'@type': 'xsd:boolean'},\n", + " 'is_diastereoselective': {'@type': 'xsd:boolean'},\n", + " 'is_fully_characterized': {'@type': 'xsd:boolean'},\n", + " 'is_pressurized': {'@type': 'xsd:boolean'},\n", + " 'is_stereo': {'@type': 'xsd:boolean'},\n", + " 'is_transport': {'@type': 'xsd:boolean'},\n", + " 
'iw_bt_date_well': {'@type': '@id'},\n", + " 'iwf': {'@type': '@id'},\n", + " 'jgi': 'http://example.org/jgi/',\n", + " 'jgi.proposal': 'https://bioregistry.io/jgi.proposal:',\n", + " 'jgi_portal_study_identifiers': {'@type': '@id'},\n", + " 'kegg': 'https://bioregistry.io/kegg:',\n", + " 'language': {'@type': 'xsd:language'},\n", + " 'last_clean': {'@type': '@id'},\n", + " 'lat_lon': {'@type': '@id'},\n", + " 'latitude': {'@type': 'xsd:decimal'},\n", + " 'lbc_thirty': {'@type': '@id'},\n", + " 'lbceq': {'@type': '@id'},\n", + " 'left_participants': {'@type': '@id'},\n", + " 'library_preparation_set': {'@type': '@id'},\n", + " 'library_type': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'light_intensity': {'@type': '@id'},\n", + " 'light_regm': {'@type': '@id'},\n", + " 'light_type': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'link_addit_analys': {'@type': '@id'},\n", + " 'link_class_info': {'@type': '@id'},\n", + " 'link_climate_info': {'@type': '@id'},\n", + " 'linkml': 'https://w3id.org/linkml/',\n", + " 'lithology': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'local_class': {'@type': '@id'},\n", + " 'local_class_meth': {'@type': '@id'},\n", + " 'longitude': {'@type': 'xsd:decimal'},\n", + " 'low_depth_contig_num': {'@type': 'xsd:integer'},\n", + " 'magnesium': {'@type': '@id'},\n", + " 'mags_activity_set': {'@type': '@id'},\n", + " 'mags_list': {'@type': '@id'},\n", + " 'manganese': {'@type': '@id'},\n", + " 'mass': {'@type': '@id'},\n", + " 'max_occup': {'@type': '@id'},\n", + " 'mean_frict_vel': {'@type': '@id'},\n", + " 'mean_peak_frict_vel': {'@type': '@id'},\n", + " 'mech_struc': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 
'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'mechanical_damage': {'@type': '@id'},\n", + " 'metabolite_quantified': {'@type': '@id'},\n", + " 'metabolomics_analysis_activity_set': {'@type': '@id'},\n", + " 'metagenome_annotation_activity_set': {'@type': '@id'},\n", + " 'metagenome_annotation_id': {'@type': '@id'},\n", + " 'metagenome_assembly_set': {'@type': '@id'},\n", + " 'metagenome_sequencing_activity_set': {'@type': '@id'},\n", + " 'metaproteomics_analysis_activity_set': {'@type': '@id'},\n", + " 'metatranscriptome_activity_set': {'@type': '@id'},\n", + " 'methane': {'@type': '@id'},\n", + " 'mgnify.proj': 'https://bioregistry.io/mgnify.proj:',\n", + " 'mgnify_analysis_identifiers': {'@type': '@id'},\n", + " 'mgnify_project_identifiers': {'@type': '@id'},\n", + " 'micro_biomass_meth': {},\n", + " 'microbial_biomass': {'@type': '@id'},\n", + " 'min_q_value': {'@type': 'xsd:float'},\n", + " 'mineral_nutr_regm': {'@type': '@id'},\n", + " 'misc_param': {'@type': '@id'},\n", + " 'model': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'my_emsl': 'https://release.my.emsl.pnnl.gov/released_data/',\n", + " 'n_alkanes': {'@type': '@id'},\n", + " 'neon.identifier': 'http://example.org/neon/identifier/',\n", + " 'neon.schema': 'http://example.org/neon/schema/',\n", + " 'neon_biosample_identifiers': {'@type': '@id'},\n", + " 'neon_study_identifiers': {'@type': '@id'},\n", + " 'nitrate': {'@type': '@id'},\n", + " 'nitrate_nitrogen': {'@type': '@id'},\n", + " 'nitrite': {'@type': '@id'},\n", + " 'nitrite_nitrogen': {'@type': '@id'},\n", + " 'nitro': {'@type': '@id'},\n", + " 'nmdc': 'https://w3id.org/nmdc/',\n", + " 'nom_analysis_activity_set': {'@type': '@id'},\n", + " 'non_min_nutr_regm': {},\n", + " 'nucl_acid_amp': {'@type': '@id'},\n", + " 'nucl_acid_ext': {'@type': '@id'},\n", + " 'num_16s': {'@type': 'xsd:integer'},\n", + " 'num_23s': {'@type': 
'xsd:integer'},\n", + " 'num_5s': {'@type': 'xsd:integer'},\n", + " 'num_aligned_reads': {'@type': 'xsd:float'},\n", + " 'num_input_reads': {'@type': 'xsd:float'},\n", + " 'num_t_rna': {'@type': 'xsd:integer'},\n", + " 'number_of_contig': {'@type': 'xsd:integer'},\n", + " 'number_pets': {'@type': '@id'},\n", + " 'number_plants': {'@type': '@id'},\n", + " 'number_resident': {'@type': '@id'},\n", + " 'occup_density_samp': {'@type': '@id'},\n", + " 'occup_document': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'occup_samp': {'@type': '@id'},\n", + " 'omics_processing_identifiers': {'@type': '@id'},\n", + " 'omics_processing_set': {'@type': '@id'},\n", + " 'omics_type': {'@type': '@id'},\n", + " 'ordered_mobile_phases': {'@type': '@id'},\n", + " 'org_carb': {'@type': '@id'},\n", + " 'org_count_qpcr_info': {},\n", + " 'org_matter': {'@type': '@id'},\n", + " 'org_nitro': {'@type': '@id'},\n", + " 'org_particles': {'@type': '@id'},\n", + " 'organism_count': {'@type': '@id'},\n", + " 'output_base_count': {'@type': 'xsd:float'},\n", + " 'output_read_bases': {'@type': 'xsd:float'},\n", + " 'output_read_count': {'@type': 'xsd:float'},\n", + " 'owc_tvdss': {'@type': '@id'},\n", + " 'oxy_stat_samp': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'oxygen': {'@type': '@id'},\n", + " 'part_of': {'@type': '@id'},\n", + " 'part_org_carb': {'@type': '@id'},\n", + " 'part_org_nitro': {'@type': '@id'},\n", + " 'particle_class': {'@type': '@id'},\n", + " 'pcr_cond': {'@type': '@id'},\n", + " 'pcr_cycles': {'@type': 'xsd:integer'},\n", + " 'pcr_primers': {'@type': '@id'},\n", + " 'peptide_sequence_count': {'@type': 'xsd:integer'},\n", + " 'peptide_spectral_count': {'@type': 'xsd:integer'},\n", + " 'peptide_sum_masic_abundance': {'@type': 'xsd:integer'},\n", + " 'permeability': {'@type': 
'@id'},\n", + " 'perturbation': {'@type': '@id'},\n", + " 'pesticide_regm': {'@type': '@id'},\n", + " 'petroleum_hydrocarb': {'@type': '@id'},\n", + " 'ph': {'@type': 'xsd:double'},\n", + " 'ph_meth': {'@type': '@id'},\n", + " 'ph_regm': {'@type': '@id'},\n", + " 'phaeopigments': {'@type': '@id'},\n", + " 'phase': {'@type': 'xsd:integer'},\n", + " 'phosphate': {'@type': '@id'},\n", + " 'phosplipid_fatt_acid': {'@type': '@id'},\n", + " 'photon_flux': {'@type': '@id'},\n", + " 'planned_process_set': {'@type': '@id'},\n", + " 'plant_growth_med': {'@type': '@id'},\n", + " 'plant_product': {'@type': '@id'},\n", + " 'plant_sex': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'plant_struc': {'@type': '@id'},\n", + " 'pollutants': {'@type': '@id'},\n", + " 'pool_dna_extracts': {'@type': '@id'},\n", + " 'pooling_set': {'@type': '@id'},\n", + " 'porosity': {'@type': '@id'},\n", + " 'potassium': {'@type': '@id'},\n", + " 'pour_point': {'@type': '@id'},\n", + " 'pre_treatment': {'@type': '@id'},\n", + " 'pres_animal_insect': {},\n", + " 'pressure': {'@type': '@id'},\n", + " 'prev_land_use_meth': {},\n", + " 'previous_land_use': {'@type': '@id'},\n", + " 'primary_prod': {'@type': '@id'},\n", + " 'primary_treatment': {'@type': '@id'},\n", + " 'principal_investigator': {'@type': '@id'},\n", + " 'processed_sample_set': {'@type': '@id'},\n", + " 'processing_institution': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'prod_rate': {'@type': '@id'},\n", + " 'prod_start_date': {'@type': '@id'},\n", + " 'profile_position': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'protein_spectral_count': {'@type': 'xsd:integer'},\n", + " 'protein_sum_masic_abundance': {'@type': 'xsd:integer'},\n", + " 
'protocol_link': {'@type': '@id'},\n", + " 'prov': 'http://www.w3.org/ns/prov#',\n", + " 'qc_failure_what': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'qc_failure_where': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'qc_status': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'quad_pos': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'qud': 'http://qudt.org/1.1/schema/qudt#',\n", + " 'radiation_regm': {'@type': '@id'},\n", + " 'rainfall_regm': {'@type': '@id'},\n", + " 'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',\n", + " 'rdfs': 'http://www.w3.org/2000/01/rdf-schema#',\n", + " 'reactor_type': {'@type': '@id'},\n", + " 'read_based_taxonomy_analysis_activity_set': {'@type': '@id'},\n", + " 'read_qc_analysis_activity_set': {'@type': '@id'},\n", + " 'redox_potential': {'@type': '@id'},\n", + " 'rel_air_humidity': {'@type': '@id'},\n", + " 'rel_humidity_out': {'@type': '@id'},\n", + " 'rel_samp_loc': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'reservoir': {'@type': '@id'},\n", + " 'resins_pc': {'@type': '@id'},\n", + " 'right_participants': {'@type': '@id'},\n", + " 'rna_absorb1': {'@type': 'xsd:float'},\n", + " 'rna_absorb2': {'@type': 'xsd:float'},\n", + " 'rna_concentration': {'@type': 'xsd:float'},\n", + " 'rna_cont_type': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'rna_sample_format': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': 
'@id',\n", + " 'text': 'skos:notation'}},\n", + " 'rna_volume': {'@type': 'xsd:float'},\n", + " 'room_air_exch_rate': {'@type': '@id'},\n", + " 'room_architec_elem': {},\n", + " 'room_condt': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'room_connected': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'room_count': {'@type': '@id'},\n", + " 'room_dim': {'@type': '@id'},\n", + " 'room_door_dist': {'@type': '@id'},\n", + " 'room_door_share': {'@type': '@id'},\n", + " 'room_hallway': {'@type': '@id'},\n", + " 'room_loc': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'room_moist_dam_hist': {'@type': 'xsd:integer'},\n", + " 'room_net_area': {'@type': '@id'},\n", + " 'room_occup': {'@type': '@id'},\n", + " 'room_samp_pos': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'room_type': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'room_vol': {'@type': '@id'},\n", + " 'room_wall_share': {'@type': '@id'},\n", + " 'room_window_count': {'@type': 'xsd:integer'},\n", + " 'root_cond': {'@type': '@id'},\n", + " 'root_med_carbon': {'@type': '@id'},\n", + " 'root_med_macronutr': {'@type': '@id'},\n", + " 'root_med_micronutr': {'@type': '@id'},\n", + " 'root_med_ph': {'@type': '@id'},\n", + " 'root_med_regl': {'@type': '@id'},\n", + " 'root_med_solid': {'@type': '@id'},\n", + " 'root_med_suppl': {'@type': '@id'},\n", + " 'salinity': {'@type': '@id'},\n", + " 'salinity_meth': {'@type': '@id'},\n", + " 'salt_regm': {'@type': '@id'},\n", + " 'samp_capt_status': {'@context': {'@vocab': '@null',\n", + " 
'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'samp_collec_device': {},\n", + " 'samp_collec_method': {},\n", + " 'samp_collect_point': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'samp_dis_stage': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'samp_floor': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'samp_loc_corr_rate': {'@type': '@id'},\n", + " 'samp_mat_process': {'@type': '@id'},\n", + " 'samp_md': {'@type': '@id'},\n", + " 'samp_name': {},\n", + " 'samp_preserv': {'@type': '@id'},\n", + " 'samp_room_id': {'@type': '@id'},\n", + " 'samp_size': {'@type': '@id'},\n", + " 'samp_sort_meth': {'@type': '@id'},\n", + " 'samp_store_dur': {'@type': '@id'},\n", + " 'samp_store_loc': {'@type': '@id'},\n", + " 'samp_store_temp': {'@type': '@id'},\n", + " 'samp_subtype': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'samp_taxon_id': {'@type': '@id'},\n", + " 'samp_time_out': {'@type': '@id'},\n", + " 'samp_transport_cond': {'@type': '@id'},\n", + " 'samp_tvdss': {'@type': '@id'},\n", + " 'samp_type': {'@type': '@id'},\n", + " 'samp_vol_we_dna_ext': {'@type': '@id'},\n", + " 'samp_weather': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'samp_well_name': {'@type': '@id'},\n", + " 'sample_collection_day': {'@type': 'xsd:integer'},\n", + " 'sample_collection_hour': {'@type': 'xsd:integer'},\n", + " 'sample_collection_minute': {'@type': 'xsd:integer'},\n", + " 'sample_collection_year': {'@type': 'xsd:integer'},\n", + " 'sample_type': 
{'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'saturates_pc': {'@type': '@id'},\n", + " 'scaf_bp': {'@type': 'xsd:float'},\n", + " 'scaf_l50': {'@type': 'xsd:float'},\n", + " 'scaf_l90': {'@type': 'xsd:float'},\n", + " 'scaf_l_gt50k': {'@type': 'xsd:float'},\n", + " 'scaf_logsum': {'@type': 'xsd:float'},\n", + " 'scaf_max': {'@type': 'xsd:float'},\n", + " 'scaf_n50': {'@type': 'xsd:float'},\n", + " 'scaf_n90': {'@type': 'xsd:float'},\n", + " 'scaf_n_gt50k': {'@type': 'xsd:float'},\n", + " 'scaf_pct_gt50k': {'@type': 'xsd:float'},\n", + " 'scaf_powsum': {'@type': 'xsd:float'},\n", + " 'scaffolds': {'@type': 'xsd:float'},\n", + " 'schema': 'http://schema.org/',\n", + " 'season': {'@type': '@id'},\n", + " 'season_environment': {'@type': '@id'},\n", + " 'season_precpt': {'@type': '@id'},\n", + " 'season_temp': {'@type': '@id'},\n", + " 'season_use': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'secondary_treatment': {'@type': '@id'},\n", + " 'sediment_type': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'separation_method': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'seq_meth': {'@type': '@id'},\n", + " 'seq_quality_check': {'@type': '@id'},\n", + " 'sewage_type': {'@type': '@id'},\n", + " 'shad_dev_water_mold': {},\n", + " 'shading_device_cond': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'shading_device_loc': {'@type': '@id'},\n", + " 'shading_device_mat': {'@type': '@id'},\n", + " 'shading_device_type': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 
'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'shex': 'http://www.w3.org/ns/shex#',\n", + " 'sieving': {'@type': '@id'},\n", + " 'silicate': {'@type': '@id'},\n", + " 'size_frac': {'@type': '@id'},\n", + " 'size_frac_low': {'@type': '@id'},\n", + " 'size_frac_up': {'@type': '@id'},\n", + " 'skos': 'http://www.w3.org/2004/02/skos/core#',\n", + " 'slope_aspect': {'@type': '@id'},\n", + " 'slope_gradient': {'@type': '@id'},\n", + " 'sludge_retent_time': {'@type': '@id'},\n", + " 'sodium': {'@type': '@id'},\n", + " 'soil_horizon': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'soil_text_measure': {'@type': '@id'},\n", + " 'soil_texture_meth': {},\n", + " 'soil_type': {'@type': '@id'},\n", + " 'soil_type_meth': {'@type': '@id'},\n", + " 'solar_irradiance': {'@type': '@id'},\n", + " 'soluble_inorg_mat': {'@type': '@id'},\n", + " 'soluble_org_mat': {'@type': '@id'},\n", + " 'soluble_react_phosp': {'@type': '@id'},\n", + " 'source_mat_id': {'@type': '@id'},\n", + " 'space_typ_state': {'@type': '@id'},\n", + " 'specific': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'specific_humidity': {'@type': '@id'},\n", + " 'sr_dep_env': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'sr_geol_age': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'sr_kerog_type': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'sr_lithology': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'standing_water_regm': {'@type': 
'@id'},\n", + " 'start': {'@type': 'xsd:integer'},\n", + " 'stationary_phase': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'stoichiometry': {'@type': 'xsd:integer'},\n", + " 'store_cond': {'@type': '@id'},\n", + " 'study_category': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'study_identifiers': {'@type': '@id'},\n", + " 'study_image': {'@type': '@id'},\n", + " 'study_set': {'@type': '@id'},\n", + " 'subject': {'@type': '@id'},\n", + " 'substructure_type': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'subsurface_depth': {'@type': '@id'},\n", + " 'sulfate': {'@type': '@id'},\n", + " 'sulfate_fw': {'@type': '@id'},\n", + " 'sulfide': {'@type': '@id'},\n", + " 'surf_air_cont': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'surf_humidity': {'@type': '@id'},\n", + " 'surf_material': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'surf_moisture': {'@type': '@id'},\n", + " 'surf_moisture_ph': {'@type': 'xsd:double'},\n", + " 'surf_temp': {'@type': '@id'},\n", + " 'suspend_part_matter': {'@type': '@id'},\n", + " 'suspend_solids': {'@type': '@id'},\n", + " 'tan': {'@type': '@id'},\n", + " 'target_gene': {'@type': '@id'},\n", + " 'target_subfragment': {'@type': '@id'},\n", + " 'temp': {'@type': '@id'},\n", + " 'temp_out': {'@type': '@id'},\n", + " 'temperature': {'@type': '@id'},\n", + " 'term': {'@type': '@id'},\n", + " 'tertiary_treatment': {'@type': '@id'},\n", + " 'tidal_stage': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 
'text': 'skos:notation'}},\n", + " 'tillage': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'tiss_cult_growth_med': {'@type': '@id'},\n", + " 'toluene': {'@type': '@id'},\n", + " 'too_short_contig_num': {'@type': 'xsd:integer'},\n", + " 'tot_carb': {'@type': '@id'},\n", + " 'tot_depth_water_col': {'@type': '@id'},\n", + " 'tot_diss_nitro': {'@type': '@id'},\n", + " 'tot_inorg_nitro': {'@type': '@id'},\n", + " 'tot_iron': {'@type': '@id'},\n", + " 'tot_nitro': {'@type': '@id'},\n", + " 'tot_nitro_cont_meth': {},\n", + " 'tot_nitro_content': {'@type': '@id'},\n", + " 'tot_org_c_meth': {'@type': '@id'},\n", + " 'tot_org_carb': {'@type': '@id'},\n", + " 'tot_part_carb': {'@type': '@id'},\n", + " 'tot_phosp': {'@type': '@id'},\n", + " 'tot_phosphate': {'@type': '@id'},\n", + " 'tot_sulfur': {'@type': '@id'},\n", + " 'total_bases': {'@type': 'xsd:integer'},\n", + " 'train_line': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'train_stat_loc': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'train_stop_loc': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'turbidity': {'@type': '@id'},\n", + " 'tvdss_of_hcr_press': {'@type': '@id'},\n", + " 'tvdss_of_hcr_temp': {'@type': '@id'},\n", + " 'typ_occup_density': {'@type': 'xsd:double'},\n", + " 'unbinned_contig_num': {'@type': 'xsd:integer'},\n", + " 'value': {'@type': '@id'},\n", + " 'vendor': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'ventilation_rate': {'@type': '@id'},\n", + " 'ventilation_type': {'@type': '@id'},\n", + " 'vfa': {'@type': 
'@id'},\n", + " 'vfa_fw': {'@type': '@id'},\n", + " 'vis_media': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'viscosity': {'@type': '@id'},\n", + " 'volatile_org_comp': {'@type': '@id'},\n", + " 'volume': {'@type': '@id'},\n", + " 'wall_area': {'@type': '@id'},\n", + " 'wall_const_type': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'wall_finish_mat': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'wall_height': {'@type': '@id'},\n", + " 'wall_loc': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'wall_surf_treatment': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'wall_texture': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'wall_thermal_mass': {'@type': '@id'},\n", + " 'wall_water_mold': {'@type': '@id'},\n", + " 'was_generated_by': {'@type': '@id'},\n", + " 'was_informed_by': {'@type': '@id'},\n", + " 'wastewater_type': {'@type': '@id'},\n", + " 'water_cont_soil_meth': {},\n", + " 'water_content': {'@type': '@id'},\n", + " 'water_current': {'@type': '@id'},\n", + " 'water_cut': {'@type': '@id'},\n", + " 'water_feat_size': {'@type': '@id'},\n", + " 'water_feat_type': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'water_prod_rate': {'@type': '@id'},\n", + " 'water_temp_regm': {'@type': '@id'},\n", + " 'watering_regm': {'@type': '@id'},\n", + " 'weekday': {'@context': {'@vocab': '@null',\n", + " 
'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'wgs84': 'http://www.w3.org/2003/01/geo/wgs84_pos#',\n", + " 'wikidata': 'http://www.wikidata.org/entity/',\n", + " 'win': {'@type': '@id'},\n", + " 'wind_direction': {'@type': '@id'},\n", + " 'wind_speed': {'@type': '@id'},\n", + " 'window_cond': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'window_cover': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'window_horiz_pos': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'window_loc': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'window_mat': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'window_open_freq': {'@type': '@id'},\n", + " 'window_size': {'@type': '@id'},\n", + " 'window_status': {'@type': '@id'},\n", + " 'window_type': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'window_vert_pos': {'@context': {'@vocab': '@null',\n", + " 'description': 'skos:prefLabel',\n", + " 'meaning': '@id',\n", + " 'text': 'skos:notation'}},\n", + " 'window_water_mold': {'@type': '@id'},\n", + " 'xsd': 'http://www.w3.org/2001/XMLSchema#',\n", + " 'xylene': {'@type': '@id'},\n", + " 'zinc': {'@type': '@id'}}\n" + ] + } + ], "source": [ "import json\n", "from pprint import pprint\n", @@ -132,7 +1347,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "62a68c07-0706-4300-a48d-0ab628af87b1", "metadata": {}, "outputs": [], @@ 
-150,7 +1365,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "648b4f70-34d6-4c70-8d0a-ef76e7e5d96d", "metadata": {}, "outputs": [], @@ -170,7 +1385,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "id": "4d802017-2a7e-4614-b662-6a0cc027b8bc", "metadata": {}, "outputs": [], @@ -196,10 +1411,51 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "id": "4251e0b1-35dc-4f40-91e7-b9bc0d9d79e1", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "911ca45777004986ba5c7e8328f1373e", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/112 [00:00POS: 100,000 slots (Batch: 1,388,888 slots/s / Avg: 1,388,888 slots/s)\n", + "20:08:42 INFO loader :: Index SPO->POS: 200,000 slots (Batch: 1,265,822 slots/s / Avg: 1,324,503 slots/s)\n", + "20:08:42 INFO loader :: Index SPO->POS: 300,000 slots (Batch: 1,219,512 slots/s / Avg: 1,287,553 slots/s)\n", + "20:08:42 INFO loader :: Index SPO->POS: 400,000 slots (Batch: 900,900 slots/s / Avg: 1,162,790 slots/s)\n", + "20:08:42 INFO loader :: Index SPO->POS: 500,000 slots (Batch: 1,136,363 slots/s / Avg: 1,157,407 slots/s)\n", + "20:08:42 INFO loader :: Index SPO->POS: 600,000 slots (Batch: 1,075,268 slots/s / Avg: 1,142,857 slots/s)\n", + "20:08:42 INFO loader :: Index SPO->POS: 700,000 slots (Batch: 840,336 slots/s / Avg: 1,086,956 slots/s)\n", + "20:08:42 INFO loader :: Index SPO->POS: 800,000 slots (Batch: 1,111,111 slots/s / Avg: 1,089,918 slots/s)\n", + "20:08:42 INFO loader :: Index SPO->POS: 900,000 slots (Batch: 1,098,901 slots/s / Avg: 1,090,909 slots/s)\n", + "20:08:42 INFO loader :: Index SPO->POS: 1,000,000 slots (Batch: 900,900 slots/s / Avg: 1,068,376 slots/s)\n", + "20:08:42 INFO loader :: Elapsed: 55.14 seconds [2024/03/07 20:08:42 UTC]\n", + "20:08:42 INFO loader :: Index SPO->POS: 1,100,000 slots (Batch: 1,123,595 slots/s / 
Avg: 1,073,170 slots/s)\n", + "20:08:43 INFO loader :: Index SPO->POS: 1,200,000 slots (Batch: 1,086,956 slots/s / Avg: 1,074,306 slots/s)\n", + "20:08:43 INFO loader :: Index SPO->POS: 1,300,000 slots (Batch: 787,401 slots/s / Avg: 1,045,016 slots/s)\n", + "20:08:43 INFO loader :: Index SPO->POS: 1,400,000 slots (Batch: 1,000,000 slots/s / Avg: 1,041,666 slots/s)\n", + "20:08:43 INFO loader :: Index SPO->POS: 1,500,000 slots (Batch: 917,431 slots/s / Avg: 1,032,346 slots/s)\n", + "20:08:43 INFO loader :: Index SPO->POS: 1,600,000 slots (Batch: 632,911 slots/s / Avg: 993,171 slots/s)\n", + "20:08:43 INFO loader :: Index SPO->POS: 1,700,000 slots (Batch: 657,894 slots/s / Avg: 964,265 slots/s)\n", + "20:08:43 INFO loader :: Index SPO->POS: 1,800,000 slots (Batch: 952,380 slots/s / Avg: 963,597 slots/s)\n", + "20:08:43 INFO loader :: Index SPO->POS: 1,900,000 slots (Batch: 952,380 slots/s / Avg: 963,000 slots/s)\n", + "20:08:44 INFO loader :: Index SPO->POS: 2,000,000 slots (Batch: 909,090 slots/s / Avg: 960,153 slots/s)\n", + "20:08:44 INFO loader :: Elapsed: 56.29 seconds [2024/03/07 20:08:44 UTC]\n", + "20:08:44 INFO loader :: Index SPO->POS: 2,100,000 slots (Batch: 793,650 slots/s / Avg: 950,656 slots/s)\n", + "20:08:44 INFO loader :: Index SPO->POS: 2,200,000 slots (Batch: 862,068 slots/s / Avg: 946,236 slots/s)\n", + "20:08:44 INFO loader :: Index SPO->POS: 2,300,000 slots (Batch: 854,700 slots/s / Avg: 941,850 slots/s)\n", + "20:08:44 INFO loader :: Index SPO->POS: 2,400,000 slots (Batch: 709,219 slots/s / Avg: 929,152 slots/s)\n", + "20:08:44 INFO loader :: Index SPO->POS: 2,500,000 slots (Batch: 793,650 slots/s / Avg: 922,849 slots/s)\n", + "20:08:44 INFO loader :: Index SPO->POS: 2,600,000 slots (Batch: 819,672 slots/s / Avg: 918,403 slots/s)\n", + "20:08:44 INFO loader :: Index SPO->POS: 2,700,000 slots (Batch: 869,565 slots/s / Avg: 916,496 slots/s)\n", + "20:08:45 INFO loader :: Index SPO->POS: 2,800,000 slots (Batch: 751,879 slots/s / Avg: 909,386 
slots/s)\n", + "20:08:45 INFO loader :: Index SPO->POS: 2,900,000 slots (Batch: 917,431 slots/s / Avg: 909,661 slots/s)\n", + "20:08:45 INFO loader :: Index SPO->POS: 3,000,000 slots (Batch: 806,451 slots/s / Avg: 905,797 slots/s)\n", + "20:08:45 INFO loader :: Elapsed: 57.52 seconds [2024/03/07 20:08:45 UTC]\n", + "20:08:45 INFO loader :: Index SPO->POS: 3,100,000 slots (Batch: 793,650 slots/s / Avg: 901,687 slots/s)\n", + "20:08:45 INFO loader :: Index SPO->POS: 3,200,000 slots (Batch: 709,219 slots/s / Avg: 894,104 slots/s)\n", + "20:08:45 INFO loader :: Index SPO->POS: 3,300,000 slots (Batch: 781,250 slots/s / Avg: 890,207 slots/s)\n", + "20:08:45 INFO loader :: Index SPO->POS: 3,400,000 slots (Batch: 775,193 slots/s / Avg: 886,339 slots/s)\n", + "20:08:45 INFO loader :: Index SPO->POS: 3,500,000 slots (Batch: 628,930 slots/s / Avg: 876,095 slots/s)\n", + "20:08:46 INFO loader :: Index SPO->POS: 3,600,000 slots (Batch: 869,565 slots/s / Avg: 875,912 slots/s)\n", + "20:08:46 INFO loader :: Index SPO->POS: 3,700,000 slots (Batch: 862,068 slots/s / Avg: 875,532 slots/s)\n", + "20:08:46 INFO loader :: Index SPO->POS: 3,800,000 slots (Batch: 763,358 slots/s / Avg: 872,159 slots/s)\n", + "20:08:46 INFO loader :: Index SPO->POS: 3,900,000 slots (Batch: 694,444 slots/s / Avg: 866,474 slots/s)\n", + "20:08:46 INFO loader :: Index SPO->POS: 4,000,000 slots (Batch: 787,401 slots/s / Avg: 864,304 slots/s)\n", + "20:08:46 INFO loader :: Elapsed: 58.83 seconds [2024/03/07 20:08:46 UTC]\n", + "20:08:46 INFO loader :: Index SPO->POS: 4,100,000 slots (Batch: 769,230 slots/s / Avg: 861,706 slots/s)\n", + "20:08:46 INFO loader :: Index SPO->POS: 4,200,000 slots (Batch: 800,000 slots/s / Avg: 860,126 slots/s)\n", + "20:08:46 INFO loader :: Index SPO->POS: 4,300,000 slots (Batch: 719,424 slots/s / Avg: 856,232 slots/s)\n", + "20:08:47 INFO loader :: Index SPO->POS: 4,400,000 slots (Batch: 813,008 slots/s / Avg: 855,199 slots/s)\n", + "20:08:47 INFO loader :: Index SPO->POS: 
4,500,000 slots (Batch: 800,000 slots/s / Avg: 853,889 slots/s)\n", + "20:08:47 INFO loader :: Index SPO->POS: 4,600,000 slots (Batch: 735,294 slots/s / Avg: 850,906 slots/s)\n", + "20:08:47 INFO loader :: Index SPO->POS: 4,700,000 slots (Batch: 689,655 slots/s / Avg: 846,694 slots/s)\n", + "20:08:47 INFO loader :: Index SPO->POS: 4,800,000 slots (Batch: 769,230 slots/s / Avg: 844,921 slots/s)\n", + "20:08:47 INFO loader :: Index SPO->POS: 4,900,000 slots (Batch: 729,927 slots/s / Avg: 842,213 slots/s)\n", + "20:08:47 INFO loader :: Index SPO->POS: 5,000,000 slots (Batch: 769,230 slots/s / Avg: 840,618 slots/s)\n", + "20:08:47 INFO loader :: Elapsed: 60.15 seconds [2024/03/07 20:08:47 UTC]\n", + "20:08:48 INFO loader :: Index SPO->POS: 5,100,000 slots (Batch: 740,740 slots/s / Avg: 838,402 slots/s)\n", + "20:08:48 INFO loader :: Index SPO->POS: 5,200,000 slots (Batch: 781,250 slots/s / Avg: 837,224 slots/s)\n", + "20:08:48 INFO loader :: Index SPO->POS: 5,300,000 slots (Batch: 769,230 slots/s / Avg: 835,830 slots/s)\n", + "20:08:48 INFO loader :: Index SPO->POS: 5,400,000 slots (Batch: 729,927 slots/s / Avg: 833,590 slots/s)\n", + "20:08:48 INFO loader :: Index SPO->POS: 5,500,000 slots (Batch: 704,225 slots/s / Avg: 830,815 slots/s)\n", + "20:08:48 INFO loader :: Index SPO->POS: 5,600,000 slots (Batch: 775,193 slots/s / Avg: 829,752 slots/s)\n", + "20:08:48 INFO loader :: Index SPO->POS: 5,700,000 slots (Batch: 781,250 slots/s / Avg: 828,849 slots/s)\n", + "20:08:48 INFO loader :: Index SPO->POS: 5,800,000 slots (Batch: 819,672 slots/s / Avg: 828,689 slots/s)\n", + "20:08:49 INFO loader :: Index SPO->POS: 5,900,000 slots (Batch: 781,250 slots/s / Avg: 827,837 slots/s)\n", + "20:08:49 INFO loader :: Index SPO->POS: 6,000,000 slots (Batch: 819,672 slots/s / Avg: 827,700 slots/s)\n", + "20:08:49 INFO loader :: Elapsed: 61.46 seconds [2024/03/07 20:08:49 UTC]\n", + "20:08:49 INFO loader :: Index SPO->POS: 6,100,000 slots (Batch: 740,740 slots/s / Avg: 826,110 
slots/s)\n", + "20:08:49 INFO loader :: Index SPO->POS: 6,200,000 slots (Batch: 724,637 slots/s / Avg: 824,248 slots/s)\n", + "20:08:49 INFO loader :: Index SPO->POS: 6,300,000 slots (Batch: 740,740 slots/s / Avg: 822,776 slots/s)\n", + "20:08:49 INFO loader :: Index SPO->POS: 6,400,000 slots (Batch: 724,637 slots/s / Avg: 821,039 slots/s)\n", + "20:08:49 INFO loader :: Index SPO->POS: 6,500,000 slots (Batch: 787,401 slots/s / Avg: 820,499 slots/s)\n", + "20:08:50 INFO loader :: Index SPO->POS: 6,600,000 slots (Batch: 751,879 slots/s / Avg: 819,366 slots/s)\n", + "20:08:50 INFO loader :: Index SPO->POS: 6,700,000 slots (Batch: 724,637 slots/s / Avg: 817,771 slots/s)\n", + "20:08:50 INFO loader :: Index SPO->POS: 6,800,000 slots (Batch: 724,637 slots/s / Avg: 816,228 slots/s)\n", + "20:08:50 INFO loader :: Index SPO->POS: 6,900,000 slots (Batch: 746,268 slots/s / Avg: 815,121 slots/s)\n", + "20:08:50 INFO loader :: Index SPO->POS: 7,000,000 slots (Batch: 781,250 slots/s / Avg: 814,616 slots/s)\n", + "20:08:50 INFO loader :: Elapsed: 62.80 seconds [2024/03/07 20:08:50 UTC]\n", + "20:08:50 INFO loader :: ** Index SPO->POS: 7,017,664 slots indexed in 8.61 seconds [Rate: 814,586.63 per second]\n", + "20:08:50 INFO loader :: Index SPO->OSP: 100,000 slots (Batch: 2,000,000 slots/s / Avg: 2,000,000 slots/s)\n", + "20:08:50 INFO loader :: Index SPO->OSP: 200,000 slots (Batch: 1,612,903 slots/s / Avg: 1,785,714 slots/s)\n", + "20:08:50 INFO loader :: Index SPO->OSP: 300,000 slots (Batch: 1,515,151 slots/s / Avg: 1,685,393 slots/s)\n", + "20:08:50 INFO loader :: Index SPO->OSP: 400,000 slots (Batch: 1,470,588 slots/s / Avg: 1,626,016 slots/s)\n", + "20:08:50 INFO loader :: Index SPO->OSP: 500,000 slots (Batch: 1,470,588 slots/s / Avg: 1,592,356 slots/s)\n", + "20:08:50 INFO loader :: Index SPO->OSP: 600,000 slots (Batch: 1,333,333 slots/s / Avg: 1,542,416 slots/s)\n", + "20:08:51 INFO loader :: Index SPO->OSP: 700,000 slots (Batch: 1,265,822 slots/s / Avg: 1,495,726 
slots/s)\n", + "20:08:51 INFO loader :: Index SPO->OSP: 800,000 slots (Batch: 1,219,512 slots/s / Avg: 1,454,545 slots/s)\n", + "20:08:51 INFO loader :: Index SPO->OSP: 900,000 slots (Batch: 1,250,000 slots/s / Avg: 1,428,571 slots/s)\n", + "20:08:51 INFO loader :: Index SPO->OSP: 1,000,000 slots (Batch: 1,098,901 slots/s / Avg: 1,386,962 slots/s)\n", + "20:08:51 INFO loader :: Elapsed: 63.54 seconds [2024/03/07 20:08:51 UTC]\n", + "20:08:51 INFO loader :: Index SPO->OSP: 1,100,000 slots (Batch: 1,063,829 slots/s / Avg: 1,349,693 slots/s)\n", + "20:08:51 INFO loader :: Index SPO->OSP: 1,200,000 slots (Batch: 1,123,595 slots/s / Avg: 1,327,433 slots/s)\n", + "20:08:51 INFO loader :: Index SPO->OSP: 1,300,000 slots (Batch: 1,123,595 slots/s / Avg: 1,309,164 slots/s)\n", + "20:08:51 INFO loader :: Index SPO->OSP: 1,400,000 slots (Batch: 1,086,956 slots/s / Avg: 1,290,322 slots/s)\n", + "20:08:51 INFO loader :: Index SPO->OSP: 1,500,000 slots (Batch: 1,041,666 slots/s / Avg: 1,270,110 slots/s)\n", + "20:08:51 INFO loader :: Index SPO->OSP: 1,600,000 slots (Batch: 1,149,425 slots/s / Avg: 1,261,829 slots/s)\n", + "20:08:51 INFO loader :: Index SPO->OSP: 1,700,000 slots (Batch: 1,075,268 slots/s / Avg: 1,249,081 slots/s)\n", + "20:08:52 INFO loader :: Index SPO->OSP: 1,800,000 slots (Batch: 1,000,000 slots/s / Avg: 1,232,032 slots/s)\n", + "20:08:52 INFO loader :: Index SPO->OSP: 1,900,000 slots (Batch: 1,041,666 slots/s / Avg: 1,220,295 slots/s)\n", + "20:08:52 INFO loader :: Index SPO->OSP: 2,000,000 slots (Batch: 1,041,666 slots/s / Avg: 1,209,921 slots/s)\n", + "20:08:52 INFO loader :: Elapsed: 64.47 seconds [2024/03/07 20:08:52 UTC]\n", + "20:08:52 INFO loader :: Index SPO->OSP: 2,100,000 slots (Batch: 990,099 slots/s / Avg: 1,197,263 slots/s)\n", + "20:08:52 INFO loader :: Index SPO->OSP: 2,200,000 slots (Batch: 952,380 slots/s / Avg: 1,183,431 slots/s)\n", + "20:08:52 INFO loader :: Index SPO->OSP: 2,300,000 slots (Batch: 1,010,101 slots/s / Avg: 1,174,668 
slots/s)\n", + "20:08:52 INFO loader :: Index SPO->OSP: 2,400,000 slots (Batch: 961,538 slots/s / Avg: 1,163,918 slots/s)\n", + "20:08:52 INFO loader :: Index SPO->OSP: 2,500,000 slots (Batch: 934,579 slots/s / Avg: 1,152,604 slots/s)\n", + "20:08:52 INFO loader :: Index SPO->OSP: 2,600,000 slots (Batch: 1,030,927 slots/s / Avg: 1,147,396 slots/s)\n", + "20:08:52 INFO loader :: Index SPO->OSP: 2,700,000 slots (Batch: 1,075,268 slots/s / Avg: 1,144,552 slots/s)\n", + "20:08:53 INFO loader :: Index SPO->OSP: 2,800,000 slots (Batch: 970,873 slots/s / Avg: 1,137,286 slots/s)\n", + "20:08:53 INFO loader :: Index SPO->OSP: 2,900,000 slots (Batch: 934,579 slots/s / Avg: 1,128,843 slots/s)\n", + "20:08:53 INFO loader :: Index SPO->OSP: 3,000,000 slots (Batch: 990,099 slots/s / Avg: 1,123,595 slots/s)\n", + "20:08:53 INFO loader :: Elapsed: 65.49 seconds [2024/03/07 20:08:53 UTC]\n", + "20:08:53 INFO loader :: Index SPO->OSP: 3,100,000 slots (Batch: 961,538 slots/s / Avg: 1,117,519 slots/s)\n", + "20:08:53 INFO loader :: Index SPO->OSP: 3,200,000 slots (Batch: 884,955 slots/s / Avg: 1,108,417 slots/s)\n", + "20:08:53 INFO loader :: Index SPO->OSP: 3,300,000 slots (Batch: 892,857 slots/s / Avg: 1,100,366 slots/s)\n", + "20:08:53 INFO loader :: Index SPO->OSP: 3,400,000 slots (Batch: 925,925 slots/s / Avg: 1,094,303 slots/s)\n", + "20:08:53 INFO loader :: Index SPO->OSP: 3,500,000 slots (Batch: 892,857 slots/s / Avg: 1,087,294 slots/s)\n", + "20:08:53 INFO loader :: Index SPO->OSP: 3,600,000 slots (Batch: 961,538 slots/s / Avg: 1,083,358 slots/s)\n", + "20:08:54 INFO loader :: Index SPO->OSP: 3,700,000 slots (Batch: 763,358 slots/s / Avg: 1,071,221 slots/s)\n", + "20:08:54 INFO loader :: Index SPO->OSP: 3,800,000 slots (Batch: 925,925 slots/s / Avg: 1,066,816 slots/s)\n", + "20:08:54 INFO loader :: Index SPO->OSP: 3,900,000 slots (Batch: 884,955 slots/s / Avg: 1,061,224 slots/s)\n", + "20:08:54 INFO loader :: Index SPO->OSP: 4,000,000 slots (Batch: 884,955 slots/s / Avg: 
1,055,966 slots/s)\n", + "20:08:54 INFO loader :: Elapsed: 66.61 seconds [2024/03/07 20:08:54 UTC]\n", + "20:08:54 INFO loader :: Index SPO->OSP: 4,100,000 slots (Batch: 632,911 slots/s / Avg: 1,039,026 slots/s)\n", + "20:08:54 INFO loader :: Index SPO->OSP: 4,200,000 slots (Batch: 909,090 slots/s / Avg: 1,035,502 slots/s)\n", + "20:08:54 INFO loader :: Index SPO->OSP: 4,300,000 slots (Batch: 877,192 slots/s / Avg: 1,031,175 slots/s)\n", + "20:08:54 INFO loader :: Index SPO->OSP: 4,400,000 slots (Batch: 952,380 slots/s / Avg: 1,029,239 slots/s)\n", + "20:08:54 INFO loader :: Index SPO->OSP: 4,500,000 slots (Batch: 943,396 slots/s / Avg: 1,027,162 slots/s)\n", + "20:08:55 INFO loader :: Index SPO->OSP: 4,600,000 slots (Batch: 943,396 slots/s / Avg: 1,025,183 slots/s)\n", + "20:08:55 INFO loader :: Index SPO->OSP: 4,700,000 slots (Batch: 925,925 slots/s / Avg: 1,022,850 slots/s)\n", + "20:08:55 INFO loader :: Index SPO->OSP: 4,800,000 slots (Batch: 884,955 slots/s / Avg: 1,019,541 slots/s)\n", + "20:08:55 INFO loader :: Index SPO->OSP: 4,900,000 slots (Batch: 869,565 slots/s / Avg: 1,015,965 slots/s)\n", + "20:08:55 INFO loader :: Index SPO->OSP: 5,000,000 slots (Batch: 909,090 slots/s / Avg: 1,013,581 slots/s)\n", + "20:08:55 INFO loader :: Elapsed: 67.75 seconds [2024/03/07 20:08:55 UTC]\n", + "20:08:55 INFO loader :: Index SPO->OSP: 5,100,000 slots (Batch: 892,857 slots/s / Avg: 1,010,901 slots/s)\n", + "20:08:55 INFO loader :: Index SPO->OSP: 5,200,000 slots (Batch: 862,068 slots/s / Avg: 1,007,556 slots/s)\n", + "20:08:55 INFO loader :: Index SPO->OSP: 5,300,000 slots (Batch: 917,431 slots/s / Avg: 1,005,692 slots/s)\n", + "20:08:55 INFO loader :: Index SPO->OSP: 5,400,000 slots (Batch: 925,925 slots/s / Avg: 1,004,090 slots/s)\n", + "20:08:56 INFO loader :: Index SPO->OSP: 5,500,000 slots (Batch: 925,925 slots/s / Avg: 1,002,551 slots/s)\n", + "20:08:56 INFO loader :: Index SPO->OSP: 5,600,000 slots (Batch: 952,380 slots/s / Avg: 1,001,609 slots/s)\n", + 
"20:08:56 INFO loader :: Index SPO->OSP: 5,700,000 slots (Batch: 869,565 slots/s / Avg: 998,948 slots/s)\n", + "20:08:56 INFO loader :: Index SPO->OSP: 5,800,000 slots (Batch: 862,068 slots/s / Avg: 996,221 slots/s)\n", + "20:08:56 INFO loader :: Index SPO->OSP: 5,900,000 slots (Batch: 892,857 slots/s / Avg: 994,270 slots/s)\n", + "20:08:56 INFO loader :: Index SPO->OSP: 6,000,000 slots (Batch: 854,700 slots/s / Avg: 991,571 slots/s)\n", + "20:08:56 INFO loader :: Elapsed: 68.87 seconds [2024/03/07 20:08:56 UTC]\n", + "20:08:56 INFO loader :: Index SPO->OSP: 6,100,000 slots (Batch: 892,857 slots/s / Avg: 989,777 slots/s)\n", + "20:08:56 INFO loader :: Index SPO->OSP: 6,200,000 slots (Batch: 943,396 slots/s / Avg: 988,993 slots/s)\n", + "20:08:56 INFO loader :: Index SPO->OSP: 6,300,000 slots (Batch: 925,925 slots/s / Avg: 987,925 slots/s)\n", + "20:08:57 INFO loader :: Index SPO->OSP: 6,400,000 slots (Batch: 952,380 slots/s / Avg: 987,349 slots/s)\n", + "20:08:57 INFO loader :: Index SPO->OSP: 6,500,000 slots (Batch: 934,579 slots/s / Avg: 986,492 slots/s)\n", + "20:08:57 INFO loader :: Index SPO->OSP: 6,600,000 slots (Batch: 862,068 slots/s / Avg: 984,340 slots/s)\n", + "20:08:57 INFO loader :: Index SPO->OSP: 6,700,000 slots (Batch: 869,565 slots/s / Avg: 982,404 slots/s)\n", + "20:08:57 INFO loader :: Index SPO->OSP: 6,800,000 slots (Batch: 847,457 slots/s / Avg: 980,109 slots/s)\n", + "20:08:57 INFO loader :: Index SPO->OSP: 6,900,000 slots (Batch: 862,068 slots/s / Avg: 978,168 slots/s)\n", + "20:08:57 INFO loader :: Index SPO->OSP: 7,000,000 slots (Batch: 862,068 slots/s / Avg: 976,290 slots/s)\n", + "20:08:57 INFO loader :: Elapsed: 69.99 seconds [2024/03/07 20:08:57 UTC]\n", + "20:08:57 INFO loader :: ** Index SPO->OSP: 7,017,664 slots indexed in 7.19 seconds [Rate: 976,166.94 per second]\n", + "20:08:57 INFO loader :: -- Finish triples index phase\n", + "20:08:57 INFO loader :: ** 7,017,664 triples indexed in 15.81 seconds [Rate: 443,987.34 per second]\n", 
+ "20:08:57 INFO loader :: -- Finish triples load\n", + "20:08:57 INFO loader :: ** Completed: 7,017,664 triples loaded in 70.01 seconds [Rate: 100,238.02 per second]\n", + "20:08:57 INFO loader :: -- Finish quads load\n" + ] + } + ], "source": [ "!docker compose run fuseki ./apache-jena-4.9.0/bin/tdbloader --loc /fuseki-base/nmdc-db.tdb /fuseki-base/nmdc-db.nt.gz" ] @@ -420,10 +2184,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "id": "1a0bfb4b-e694-40b1-88af-4446e3fcc888", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[1A\u001b[1B\u001b[0G\u001b[?25l[+] Running 1/0\n", + " \u001b[32m✔\u001b[0m Container fuseki \u001b[32mCreated\u001b[0m \u001b[34m0.0s \u001b[0m\n", + "\u001b[?25h\u001b[1A\u001b[1A\u001b[0G\u001b[?25l\u001b[34m[+] Running 1/1\u001b[0m\n", + " \u001b[32m✔\u001b[0m Container fuseki \u001b[32mCreated\u001b[0m \u001b[34m0.0s \u001b[0m\n", + "\u001b[?25h\u001b[1A\u001b[1A\u001b[0G\u001b[?25l\u001b[34m[+] Running 1/1\u001b[0m\n", + " \u001b[32m✔\u001b[0m Container fuseki \u001b[32mStarted\u001b[0m \u001b[34m0.0s \u001b[0m\n", + "\u001b[?25h" + ] + } + ], "source": [ "!docker compose up fuseki -d" ] From 52df39e1048ea4a352cbf17fd8ad93aa3bc01c20 Mon Sep 17 00:00:00 2001 From: Donny Winston Date: Tue, 12 Mar 2024 10:24:37 -0400 Subject: [PATCH 06/18] feat(graph): nmdc:depends_on --- .../notebooks/ghissue_401_sparql.ipynb | 1001 ++++++++++------- 1 file changed, 587 insertions(+), 414 deletions(-) diff --git a/metadata-translation/notebooks/ghissue_401_sparql.ipynb b/metadata-translation/notebooks/ghissue_401_sparql.ipynb index ef4fab18..4c89ee9f 100644 --- a/metadata-translation/notebooks/ghissue_401_sparql.ipynb +++ b/metadata-translation/notebooks/ghissue_401_sparql.ipynb @@ -141,7 +141,9 @@ "cell_type": "code", "execution_count": 5, "id": "9ed72826-b552-4429-8ab5-9f7126821822", - "metadata": {}, + "metadata": { + "scrolled": true + }, 
"outputs": [ { "name": "stdout", @@ -1412,13 +1414,32 @@ { "cell_type": "code", "execution_count": 9, + "id": "86ff7261-e255-415d-a589-67637292dbdd", + "metadata": {}, + "outputs": [], + "source": [ + "from nmdc_runtime.util import collection_name_to_class_names\n", + "\n", + "def ensure_type(doc, collection_name):\n", + " if \"type\" in doc:\n", + " return doc\n", + "\n", + " class_names = collection_name_to_class_names[collection_name]\n", + " if len(class_names) > 1:\n", + " raise Exception(\"cannot unambiguously infer class of document\")\n", + " return assoc(doc, \"type\", class_names[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 10, "id": "4251e0b1-35dc-4f40-91e7-b9bc0d9d79e1", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "911ca45777004986ba5c7e8328f1373e", + "model_id": "2bc6ca60aa784afa8a3e8c1947be3901", "version_major": 2, "version_minor": 0 }, @@ -1465,11 +1486,12 @@ "\n", "pbar = tqdm(total=total)\n", "\n", - "for name in populated_collections:\n", - " print(name)\n", - " docs = [dissoc(doc, \"_id\") for doc in mdb[name].find()]\n", + "for collection_name in populated_collections:\n", + " print(collection_name)\n", + " docs = [dissoc(doc, \"_id\") for doc in mdb[collection_name].find()]\n", " chunks = list(split_chunk(docs, chunk_size))\n", " for chunk in chunks:\n", + " typed_chunk = [ensure_type(doc, collection_name) for doc in chunk]\n", " doc_jsonld = {\"@context\": context, \"@graph\": chunk}\n", " g.parse(data=json.dumps(doc_jsonld), format='json-ld')\n", " pbar.update(1)\n", @@ -1486,7 +1508,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "id": "ba832848-2cc9-4d1d-bf5f-966a73e26658", "metadata": {}, "outputs": [], @@ -1521,7 +1543,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "id": "831cbf19-8331-4f2d-814c-89d86d060029", "metadata": {}, "outputs": [ @@ -1560,7 +1582,7 @@ " 'WorkflowExecutionActivity'}" ] }, - 
"execution_count": 11, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -1587,7 +1609,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "id": "d402b739-4ab8-4d93-b00f-76f677313c66", "metadata": {}, "outputs": [ @@ -1595,7 +1617,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "{'part_of', 'has_input', 'was_informed_by', 'has_output', 'was_generated_by', 'collected_from', 'metagenome_annotation_id'}\n" + "{'was_informed_by', 'collected_from', 'part_of', 'metagenome_annotation_id', 'was_generated_by', 'has_input', 'has_output'}\n" ] } ], @@ -1610,39 +1632,202 @@ "print(toplevel_entity_connectors)" ] }, + { + "cell_type": "markdown", + "id": "40e58127-013e-40e2-a839-c9317e14c488", + "metadata": {}, + "source": [ + "Let's construct an entity-relationship diagram to visualize relationships." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "c99cdd8d-5fd2-44eb-9090-af6f51770fbd", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "classDiagram\n", + "\n", + "Activity --> Activity : was_informed_by\n", + "Biosample --> FieldResearchSite : collected_from\n", + "NamedThing --> NamedThing : part_of\n", + "FunctionalAnnotationAggMember --> WorkflowExecutionActivity : metagenome_annotation_id\n", + "NamedThing --> Activity : was_generated_by\n", + "NamedThing --> NamedThing : has_input\n", + "NamedThing --> NamedThing : has_output\n", + "\n", + "PlannedProcess <|-- BiosampleProcessing\n", + "NamedThing <|-- Extraction\n", + "Activity <|-- NomAnalysisActivity\n", + "Activity <|-- MagsAnalysisActivity\n", + "NamedThing <|-- ProcessedSample\n", + "NamedThing <|-- Site\n", + "PlannedProcess <|-- Extraction\n", + "Activity <|-- MetagenomeAnnotationActivity\n", + "Activity <|-- MetatranscriptomeActivity\n", + "Activity <|-- MetabolomicsAnalysisActivity\n", + "WorkflowExecutionActivity <|-- MetabolomicsAnalysisActivity\n", + 
"NamedThing <|-- LibraryPreparation\n", + "Activity <|-- MetagenomeAssembly\n", + "WorkflowExecutionActivity <|-- NomAnalysisActivity\n", + "NamedThing <|-- Pooling\n", + "WorkflowExecutionActivity <|-- ReadQcAnalysisActivity\n", + "NamedThing <|-- Biosample\n", + "NamedThing <|-- FieldResearchSite\n", + "MaterialEntity <|-- FieldResearchSite\n", + "NamedThing <|-- PlannedProcess\n", + "NamedThing <|-- BiosampleProcessing\n", + "Site <|-- FieldResearchSite\n", + "Activity <|-- ReadQcAnalysisActivity\n", + "NamedThing <|-- Study\n", + "PlannedProcess <|-- OmicsProcessing\n", + "Activity <|-- ReadBasedTaxonomyAnalysisActivity\n", + "WorkflowExecutionActivity <|-- MetagenomeAnnotationActivity\n", + "WorkflowExecutionActivity <|-- MagsAnalysisActivity\n", + "NamedThing <|-- MaterialEntity\n", + "WorkflowExecutionActivity <|-- MetaproteomicsAnalysisActivity\n", + "BiosampleProcessing <|-- LibraryPreparation\n", + "NamedThing <|-- DataObject\n", + "MaterialEntity <|-- ProcessedSample\n", + "MaterialEntity <|-- Site\n", + "WorkflowExecutionActivity <|-- MetagenomeAssembly\n", + "WorkflowExecutionActivity <|-- ReadBasedTaxonomyAnalysisActivity\n", + "NamedThing <|-- CollectingBiosamplesFromSite\n", + "BiosampleProcessing <|-- Pooling\n", + "PlannedProcess <|-- CollectingBiosamplesFromSite\n", + "Activity <|-- MetaproteomicsAnalysisActivity\n", + "NamedThing <|-- OmicsProcessing\n", + "Activity <|-- MetagenomeSequencingActivity\n", + "MaterialEntity <|-- Biosample\n", + "WorkflowExecutionActivity <|-- MetatranscriptomeActivity\n", + "PlannedProcess <|-- LibraryPreparation\n", + "WorkflowExecutionActivity <|-- MetagenomeSequencingActivity\n", + "PlannedProcess <|-- Pooling\n", + "Activity <|-- WorkflowExecutionActivity\n" + ] + } + ], + "source": [ + "print(\"classDiagram\\n\")\n", + "for slot_name in toplevel_entity_connectors:\n", + " slot = slots[slot_name]\n", + " domain = slot.domain or \"NamedThing\"\n", + " range = slot.range\n", + " print(f\"{domain} --> {range} : 
{slot_name}\")\n", + "\n", + "print()\n", + "\n", + "inheritance_links = set()\n", + "for cls in toplevel_classes:\n", + " ancestors = schema_view.class_ancestors(cls)\n", + " for a in ancestors:\n", + " if a != cls:\n", + " inheritance_links.add(f\"{a} <|-- {cls}\")\n", + "\n", + "for link in inheritance_links:\n", + " print(link)" + ] + }, { "cell_type": "markdown", "id": "63cb2cc8-ef99-4d5f-9ddf-9eb2949e9c06", "metadata": {}, "source": [ - "Now, let's assert a common symmetric relation for all entities connected by these slots so that we can traverse the graph of top-level entities without needing to specify any specific slot names." + "Now, let's assert a common `depends_on` relation for all entities connected by these slots so that we can traverse the graph of top-level entities without needing to specify any specific slot names." ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 18, "id": "cc830d77-5ac2-482e-a4f9-dc2eed3f2ef9", "metadata": {}, "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "2e6e481cc9a142598b447b765c7a4773", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/6784535 [00:00POS: 100,000 slots (Batch: 1,388,888 slots/s / Avg: 1,388,888 slots/s)\n", - "20:08:42 INFO loader :: Index SPO->POS: 200,000 slots (Batch: 1,265,822 slots/s / Avg: 1,324,503 slots/s)\n", - "20:08:42 INFO loader :: Index SPO->POS: 300,000 slots (Batch: 1,219,512 slots/s / Avg: 1,287,553 slots/s)\n", - "20:08:42 INFO loader :: Index SPO->POS: 400,000 slots (Batch: 900,900 slots/s / Avg: 1,162,790 slots/s)\n", - "20:08:42 INFO loader :: Index SPO->POS: 500,000 slots (Batch: 1,136,363 slots/s / Avg: 1,157,407 slots/s)\n", - "20:08:42 INFO loader :: Index SPO->POS: 600,000 slots (Batch: 1,075,268 slots/s / Avg: 1,142,857 slots/s)\n", - "20:08:42 INFO loader :: Index SPO->POS: 700,000 slots (Batch: 840,336 slots/s / Avg: 1,086,956 slots/s)\n", - "20:08:42 INFO loader :: Index SPO->POS: 
800,000 slots (Batch: 1,111,111 slots/s / Avg: 1,089,918 slots/s)\n", - "20:08:42 INFO loader :: Index SPO->POS: 900,000 slots (Batch: 1,098,901 slots/s / Avg: 1,090,909 slots/s)\n", - "20:08:42 INFO loader :: Index SPO->POS: 1,000,000 slots (Batch: 900,900 slots/s / Avg: 1,068,376 slots/s)\n", - "20:08:42 INFO loader :: Elapsed: 55.14 seconds [2024/03/07 20:08:42 UTC]\n", - "20:08:42 INFO loader :: Index SPO->POS: 1,100,000 slots (Batch: 1,123,595 slots/s / Avg: 1,073,170 slots/s)\n", - "20:08:43 INFO loader :: Index SPO->POS: 1,200,000 slots (Batch: 1,086,956 slots/s / Avg: 1,074,306 slots/s)\n", - "20:08:43 INFO loader :: Index SPO->POS: 1,300,000 slots (Batch: 787,401 slots/s / Avg: 1,045,016 slots/s)\n", - "20:08:43 INFO loader :: Index SPO->POS: 1,400,000 slots (Batch: 1,000,000 slots/s / Avg: 1,041,666 slots/s)\n", - "20:08:43 INFO loader :: Index SPO->POS: 1,500,000 slots (Batch: 917,431 slots/s / Avg: 1,032,346 slots/s)\n", - "20:08:43 INFO loader :: Index SPO->POS: 1,600,000 slots (Batch: 632,911 slots/s / Avg: 993,171 slots/s)\n", - "20:08:43 INFO loader :: Index SPO->POS: 1,700,000 slots (Batch: 657,894 slots/s / Avg: 964,265 slots/s)\n", - "20:08:43 INFO loader :: Index SPO->POS: 1,800,000 slots (Batch: 952,380 slots/s / Avg: 963,597 slots/s)\n", - "20:08:43 INFO loader :: Index SPO->POS: 1,900,000 slots (Batch: 952,380 slots/s / Avg: 963,000 slots/s)\n", - "20:08:44 INFO loader :: Index SPO->POS: 2,000,000 slots (Batch: 909,090 slots/s / Avg: 960,153 slots/s)\n", - "20:08:44 INFO loader :: Elapsed: 56.29 seconds [2024/03/07 20:08:44 UTC]\n", - "20:08:44 INFO loader :: Index SPO->POS: 2,100,000 slots (Batch: 793,650 slots/s / Avg: 950,656 slots/s)\n", - "20:08:44 INFO loader :: Index SPO->POS: 2,200,000 slots (Batch: 862,068 slots/s / Avg: 946,236 slots/s)\n", - "20:08:44 INFO loader :: Index SPO->POS: 2,300,000 slots (Batch: 854,700 slots/s / Avg: 941,850 slots/s)\n", - "20:08:44 INFO loader :: Index SPO->POS: 2,400,000 slots (Batch: 709,219 slots/s / 
Avg: 929,152 slots/s)\n", - "20:08:44 INFO loader :: Index SPO->POS: 2,500,000 slots (Batch: 793,650 slots/s / Avg: 922,849 slots/s)\n", - "20:08:44 INFO loader :: Index SPO->POS: 2,600,000 slots (Batch: 819,672 slots/s / Avg: 918,403 slots/s)\n", - "20:08:44 INFO loader :: Index SPO->POS: 2,700,000 slots (Batch: 869,565 slots/s / Avg: 916,496 slots/s)\n", - "20:08:45 INFO loader :: Index SPO->POS: 2,800,000 slots (Batch: 751,879 slots/s / Avg: 909,386 slots/s)\n", - "20:08:45 INFO loader :: Index SPO->POS: 2,900,000 slots (Batch: 917,431 slots/s / Avg: 909,661 slots/s)\n", - "20:08:45 INFO loader :: Index SPO->POS: 3,000,000 slots (Batch: 806,451 slots/s / Avg: 905,797 slots/s)\n", - "20:08:45 INFO loader :: Elapsed: 57.52 seconds [2024/03/07 20:08:45 UTC]\n", - "20:08:45 INFO loader :: Index SPO->POS: 3,100,000 slots (Batch: 793,650 slots/s / Avg: 901,687 slots/s)\n", - "20:08:45 INFO loader :: Index SPO->POS: 3,200,000 slots (Batch: 709,219 slots/s / Avg: 894,104 slots/s)\n", - "20:08:45 INFO loader :: Index SPO->POS: 3,300,000 slots (Batch: 781,250 slots/s / Avg: 890,207 slots/s)\n", - "20:08:45 INFO loader :: Index SPO->POS: 3,400,000 slots (Batch: 775,193 slots/s / Avg: 886,339 slots/s)\n", - "20:08:45 INFO loader :: Index SPO->POS: 3,500,000 slots (Batch: 628,930 slots/s / Avg: 876,095 slots/s)\n", - "20:08:46 INFO loader :: Index SPO->POS: 3,600,000 slots (Batch: 869,565 slots/s / Avg: 875,912 slots/s)\n", - "20:08:46 INFO loader :: Index SPO->POS: 3,700,000 slots (Batch: 862,068 slots/s / Avg: 875,532 slots/s)\n", - "20:08:46 INFO loader :: Index SPO->POS: 3,800,000 slots (Batch: 763,358 slots/s / Avg: 872,159 slots/s)\n", - "20:08:46 INFO loader :: Index SPO->POS: 3,900,000 slots (Batch: 694,444 slots/s / Avg: 866,474 slots/s)\n", - "20:08:46 INFO loader :: Index SPO->POS: 4,000,000 slots (Batch: 787,401 slots/s / Avg: 864,304 slots/s)\n", - "20:08:46 INFO loader :: Elapsed: 58.83 seconds [2024/03/07 20:08:46 UTC]\n", - "20:08:46 INFO loader :: Index 
SPO->POS: 4,100,000 slots (Batch: 769,230 slots/s / Avg: 861,706 slots/s)\n", - "20:08:46 INFO loader :: Index SPO->POS: 4,200,000 slots (Batch: 800,000 slots/s / Avg: 860,126 slots/s)\n", - "20:08:46 INFO loader :: Index SPO->POS: 4,300,000 slots (Batch: 719,424 slots/s / Avg: 856,232 slots/s)\n", - "20:08:47 INFO loader :: Index SPO->POS: 4,400,000 slots (Batch: 813,008 slots/s / Avg: 855,199 slots/s)\n", - "20:08:47 INFO loader :: Index SPO->POS: 4,500,000 slots (Batch: 800,000 slots/s / Avg: 853,889 slots/s)\n", - "20:08:47 INFO loader :: Index SPO->POS: 4,600,000 slots (Batch: 735,294 slots/s / Avg: 850,906 slots/s)\n", - "20:08:47 INFO loader :: Index SPO->POS: 4,700,000 slots (Batch: 689,655 slots/s / Avg: 846,694 slots/s)\n", - "20:08:47 INFO loader :: Index SPO->POS: 4,800,000 slots (Batch: 769,230 slots/s / Avg: 844,921 slots/s)\n", - "20:08:47 INFO loader :: Index SPO->POS: 4,900,000 slots (Batch: 729,927 slots/s / Avg: 842,213 slots/s)\n", - "20:08:47 INFO loader :: Index SPO->POS: 5,000,000 slots (Batch: 769,230 slots/s / Avg: 840,618 slots/s)\n", - "20:08:47 INFO loader :: Elapsed: 60.15 seconds [2024/03/07 20:08:47 UTC]\n", - "20:08:48 INFO loader :: Index SPO->POS: 5,100,000 slots (Batch: 740,740 slots/s / Avg: 838,402 slots/s)\n", - "20:08:48 INFO loader :: Index SPO->POS: 5,200,000 slots (Batch: 781,250 slots/s / Avg: 837,224 slots/s)\n", - "20:08:48 INFO loader :: Index SPO->POS: 5,300,000 slots (Batch: 769,230 slots/s / Avg: 835,830 slots/s)\n", - "20:08:48 INFO loader :: Index SPO->POS: 5,400,000 slots (Batch: 729,927 slots/s / Avg: 833,590 slots/s)\n", - "20:08:48 INFO loader :: Index SPO->POS: 5,500,000 slots (Batch: 704,225 slots/s / Avg: 830,815 slots/s)\n", - "20:08:48 INFO loader :: Index SPO->POS: 5,600,000 slots (Batch: 775,193 slots/s / Avg: 829,752 slots/s)\n", - "20:08:48 INFO loader :: Index SPO->POS: 5,700,000 slots (Batch: 781,250 slots/s / Avg: 828,849 slots/s)\n", - "20:08:48 INFO loader :: Index SPO->POS: 5,800,000 slots 
(Batch: 819,672 slots/s / Avg: 828,689 slots/s)\n", - "20:08:49 INFO loader :: Index SPO->POS: 5,900,000 slots (Batch: 781,250 slots/s / Avg: 827,837 slots/s)\n", - "20:08:49 INFO loader :: Index SPO->POS: 6,000,000 slots (Batch: 819,672 slots/s / Avg: 827,700 slots/s)\n", - "20:08:49 INFO loader :: Elapsed: 61.46 seconds [2024/03/07 20:08:49 UTC]\n", - "20:08:49 INFO loader :: Index SPO->POS: 6,100,000 slots (Batch: 740,740 slots/s / Avg: 826,110 slots/s)\n", - "20:08:49 INFO loader :: Index SPO->POS: 6,200,000 slots (Batch: 724,637 slots/s / Avg: 824,248 slots/s)\n", - "20:08:49 INFO loader :: Index SPO->POS: 6,300,000 slots (Batch: 740,740 slots/s / Avg: 822,776 slots/s)\n", - "20:08:49 INFO loader :: Index SPO->POS: 6,400,000 slots (Batch: 724,637 slots/s / Avg: 821,039 slots/s)\n", - "20:08:49 INFO loader :: Index SPO->POS: 6,500,000 slots (Batch: 787,401 slots/s / Avg: 820,499 slots/s)\n", - "20:08:50 INFO loader :: Index SPO->POS: 6,600,000 slots (Batch: 751,879 slots/s / Avg: 819,366 slots/s)\n", - "20:08:50 INFO loader :: Index SPO->POS: 6,700,000 slots (Batch: 724,637 slots/s / Avg: 817,771 slots/s)\n", - "20:08:50 INFO loader :: Index SPO->POS: 6,800,000 slots (Batch: 724,637 slots/s / Avg: 816,228 slots/s)\n", - "20:08:50 INFO loader :: Index SPO->POS: 6,900,000 slots (Batch: 746,268 slots/s / Avg: 815,121 slots/s)\n", - "20:08:50 INFO loader :: Index SPO->POS: 7,000,000 slots (Batch: 781,250 slots/s / Avg: 814,616 slots/s)\n", - "20:08:50 INFO loader :: Elapsed: 62.80 seconds [2024/03/07 20:08:50 UTC]\n", - "20:08:50 INFO loader :: ** Index SPO->POS: 7,017,664 slots indexed in 8.61 seconds [Rate: 814,586.63 per second]\n", - "20:08:50 INFO loader :: Index SPO->OSP: 100,000 slots (Batch: 2,000,000 slots/s / Avg: 2,000,000 slots/s)\n", - "20:08:50 INFO loader :: Index SPO->OSP: 200,000 slots (Batch: 1,612,903 slots/s / Avg: 1,785,714 slots/s)\n", - "20:08:50 INFO loader :: Index SPO->OSP: 300,000 slots (Batch: 1,515,151 slots/s / Avg: 1,685,393 
slots/s)\n", - "20:08:50 INFO loader :: Index SPO->OSP: 400,000 slots (Batch: 1,470,588 slots/s / Avg: 1,626,016 slots/s)\n", - "20:08:50 INFO loader :: Index SPO->OSP: 500,000 slots (Batch: 1,470,588 slots/s / Avg: 1,592,356 slots/s)\n", - "20:08:50 INFO loader :: Index SPO->OSP: 600,000 slots (Batch: 1,333,333 slots/s / Avg: 1,542,416 slots/s)\n", - "20:08:51 INFO loader :: Index SPO->OSP: 700,000 slots (Batch: 1,265,822 slots/s / Avg: 1,495,726 slots/s)\n", - "20:08:51 INFO loader :: Index SPO->OSP: 800,000 slots (Batch: 1,219,512 slots/s / Avg: 1,454,545 slots/s)\n", - "20:08:51 INFO loader :: Index SPO->OSP: 900,000 slots (Batch: 1,250,000 slots/s / Avg: 1,428,571 slots/s)\n", - "20:08:51 INFO loader :: Index SPO->OSP: 1,000,000 slots (Batch: 1,098,901 slots/s / Avg: 1,386,962 slots/s)\n", - "20:08:51 INFO loader :: Elapsed: 63.54 seconds [2024/03/07 20:08:51 UTC]\n", - "20:08:51 INFO loader :: Index SPO->OSP: 1,100,000 slots (Batch: 1,063,829 slots/s / Avg: 1,349,693 slots/s)\n", - "20:08:51 INFO loader :: Index SPO->OSP: 1,200,000 slots (Batch: 1,123,595 slots/s / Avg: 1,327,433 slots/s)\n", - "20:08:51 INFO loader :: Index SPO->OSP: 1,300,000 slots (Batch: 1,123,595 slots/s / Avg: 1,309,164 slots/s)\n", - "20:08:51 INFO loader :: Index SPO->OSP: 1,400,000 slots (Batch: 1,086,956 slots/s / Avg: 1,290,322 slots/s)\n", - "20:08:51 INFO loader :: Index SPO->OSP: 1,500,000 slots (Batch: 1,041,666 slots/s / Avg: 1,270,110 slots/s)\n", - "20:08:51 INFO loader :: Index SPO->OSP: 1,600,000 slots (Batch: 1,149,425 slots/s / Avg: 1,261,829 slots/s)\n", - "20:08:51 INFO loader :: Index SPO->OSP: 1,700,000 slots (Batch: 1,075,268 slots/s / Avg: 1,249,081 slots/s)\n", - "20:08:52 INFO loader :: Index SPO->OSP: 1,800,000 slots (Batch: 1,000,000 slots/s / Avg: 1,232,032 slots/s)\n", - "20:08:52 INFO loader :: Index SPO->OSP: 1,900,000 slots (Batch: 1,041,666 slots/s / Avg: 1,220,295 slots/s)\n", - "20:08:52 INFO loader :: Index SPO->OSP: 2,000,000 slots (Batch: 1,041,666 
slots/s / Avg: 1,209,921 slots/s)\n", - "20:08:52 INFO loader :: Elapsed: 64.47 seconds [2024/03/07 20:08:52 UTC]\n", - "20:08:52 INFO loader :: Index SPO->OSP: 2,100,000 slots (Batch: 990,099 slots/s / Avg: 1,197,263 slots/s)\n", - "20:08:52 INFO loader :: Index SPO->OSP: 2,200,000 slots (Batch: 952,380 slots/s / Avg: 1,183,431 slots/s)\n", - "20:08:52 INFO loader :: Index SPO->OSP: 2,300,000 slots (Batch: 1,010,101 slots/s / Avg: 1,174,668 slots/s)\n", - "20:08:52 INFO loader :: Index SPO->OSP: 2,400,000 slots (Batch: 961,538 slots/s / Avg: 1,163,918 slots/s)\n", - "20:08:52 INFO loader :: Index SPO->OSP: 2,500,000 slots (Batch: 934,579 slots/s / Avg: 1,152,604 slots/s)\n", - "20:08:52 INFO loader :: Index SPO->OSP: 2,600,000 slots (Batch: 1,030,927 slots/s / Avg: 1,147,396 slots/s)\n", - "20:08:52 INFO loader :: Index SPO->OSP: 2,700,000 slots (Batch: 1,075,268 slots/s / Avg: 1,144,552 slots/s)\n", - "20:08:53 INFO loader :: Index SPO->OSP: 2,800,000 slots (Batch: 970,873 slots/s / Avg: 1,137,286 slots/s)\n", - "20:08:53 INFO loader :: Index SPO->OSP: 2,900,000 slots (Batch: 934,579 slots/s / Avg: 1,128,843 slots/s)\n", - "20:08:53 INFO loader :: Index SPO->OSP: 3,000,000 slots (Batch: 990,099 slots/s / Avg: 1,123,595 slots/s)\n", - "20:08:53 INFO loader :: Elapsed: 65.49 seconds [2024/03/07 20:08:53 UTC]\n", - "20:08:53 INFO loader :: Index SPO->OSP: 3,100,000 slots (Batch: 961,538 slots/s / Avg: 1,117,519 slots/s)\n", - "20:08:53 INFO loader :: Index SPO->OSP: 3,200,000 slots (Batch: 884,955 slots/s / Avg: 1,108,417 slots/s)\n", - "20:08:53 INFO loader :: Index SPO->OSP: 3,300,000 slots (Batch: 892,857 slots/s / Avg: 1,100,366 slots/s)\n", - "20:08:53 INFO loader :: Index SPO->OSP: 3,400,000 slots (Batch: 925,925 slots/s / Avg: 1,094,303 slots/s)\n", - "20:08:53 INFO loader :: Index SPO->OSP: 3,500,000 slots (Batch: 892,857 slots/s / Avg: 1,087,294 slots/s)\n", - "20:08:53 INFO loader :: Index SPO->OSP: 3,600,000 slots (Batch: 961,538 slots/s / Avg: 1,083,358 
slots/s)\n", - "20:08:54 INFO loader :: Index SPO->OSP: 3,700,000 slots (Batch: 763,358 slots/s / Avg: 1,071,221 slots/s)\n", - "20:08:54 INFO loader :: Index SPO->OSP: 3,800,000 slots (Batch: 925,925 slots/s / Avg: 1,066,816 slots/s)\n", - "20:08:54 INFO loader :: Index SPO->OSP: 3,900,000 slots (Batch: 884,955 slots/s / Avg: 1,061,224 slots/s)\n", - "20:08:54 INFO loader :: Index SPO->OSP: 4,000,000 slots (Batch: 884,955 slots/s / Avg: 1,055,966 slots/s)\n", - "20:08:54 INFO loader :: Elapsed: 66.61 seconds [2024/03/07 20:08:54 UTC]\n", - "20:08:54 INFO loader :: Index SPO->OSP: 4,100,000 slots (Batch: 632,911 slots/s / Avg: 1,039,026 slots/s)\n", - "20:08:54 INFO loader :: Index SPO->OSP: 4,200,000 slots (Batch: 909,090 slots/s / Avg: 1,035,502 slots/s)\n", - "20:08:54 INFO loader :: Index SPO->OSP: 4,300,000 slots (Batch: 877,192 slots/s / Avg: 1,031,175 slots/s)\n", - "20:08:54 INFO loader :: Index SPO->OSP: 4,400,000 slots (Batch: 952,380 slots/s / Avg: 1,029,239 slots/s)\n", - "20:08:54 INFO loader :: Index SPO->OSP: 4,500,000 slots (Batch: 943,396 slots/s / Avg: 1,027,162 slots/s)\n", - "20:08:55 INFO loader :: Index SPO->OSP: 4,600,000 slots (Batch: 943,396 slots/s / Avg: 1,025,183 slots/s)\n", - "20:08:55 INFO loader :: Index SPO->OSP: 4,700,000 slots (Batch: 925,925 slots/s / Avg: 1,022,850 slots/s)\n", - "20:08:55 INFO loader :: Index SPO->OSP: 4,800,000 slots (Batch: 884,955 slots/s / Avg: 1,019,541 slots/s)\n", - "20:08:55 INFO loader :: Index SPO->OSP: 4,900,000 slots (Batch: 869,565 slots/s / Avg: 1,015,965 slots/s)\n", - "20:08:55 INFO loader :: Index SPO->OSP: 5,000,000 slots (Batch: 909,090 slots/s / Avg: 1,013,581 slots/s)\n", - "20:08:55 INFO loader :: Elapsed: 67.75 seconds [2024/03/07 20:08:55 UTC]\n", - "20:08:55 INFO loader :: Index SPO->OSP: 5,100,000 slots (Batch: 892,857 slots/s / Avg: 1,010,901 slots/s)\n", - "20:08:55 INFO loader :: Index SPO->OSP: 5,200,000 slots (Batch: 862,068 slots/s / Avg: 1,007,556 slots/s)\n", - "20:08:55 INFO 
loader :: Index SPO->OSP: 5,300,000 slots (Batch: 917,431 slots/s / Avg: 1,005,692 slots/s)\n", - "20:08:55 INFO loader :: Index SPO->OSP: 5,400,000 slots (Batch: 925,925 slots/s / Avg: 1,004,090 slots/s)\n", - "20:08:56 INFO loader :: Index SPO->OSP: 5,500,000 slots (Batch: 925,925 slots/s / Avg: 1,002,551 slots/s)\n", - "20:08:56 INFO loader :: Index SPO->OSP: 5,600,000 slots (Batch: 952,380 slots/s / Avg: 1,001,609 slots/s)\n", - "20:08:56 INFO loader :: Index SPO->OSP: 5,700,000 slots (Batch: 869,565 slots/s / Avg: 998,948 slots/s)\n", - "20:08:56 INFO loader :: Index SPO->OSP: 5,800,000 slots (Batch: 862,068 slots/s / Avg: 996,221 slots/s)\n", - "20:08:56 INFO loader :: Index SPO->OSP: 5,900,000 slots (Batch: 892,857 slots/s / Avg: 994,270 slots/s)\n", - "20:08:56 INFO loader :: Index SPO->OSP: 6,000,000 slots (Batch: 854,700 slots/s / Avg: 991,571 slots/s)\n", - "20:08:56 INFO loader :: Elapsed: 68.87 seconds [2024/03/07 20:08:56 UTC]\n", - "20:08:56 INFO loader :: Index SPO->OSP: 6,100,000 slots (Batch: 892,857 slots/s / Avg: 989,777 slots/s)\n", - "20:08:56 INFO loader :: Index SPO->OSP: 6,200,000 slots (Batch: 943,396 slots/s / Avg: 988,993 slots/s)\n", - "20:08:56 INFO loader :: Index SPO->OSP: 6,300,000 slots (Batch: 925,925 slots/s / Avg: 987,925 slots/s)\n", - "20:08:57 INFO loader :: Index SPO->OSP: 6,400,000 slots (Batch: 952,380 slots/s / Avg: 987,349 slots/s)\n", - "20:08:57 INFO loader :: Index SPO->OSP: 6,500,000 slots (Batch: 934,579 slots/s / Avg: 986,492 slots/s)\n", - "20:08:57 INFO loader :: Index SPO->OSP: 6,600,000 slots (Batch: 862,068 slots/s / Avg: 984,340 slots/s)\n", - "20:08:57 INFO loader :: Index SPO->OSP: 6,700,000 slots (Batch: 869,565 slots/s / Avg: 982,404 slots/s)\n", - "20:08:57 INFO loader :: Index SPO->OSP: 6,800,000 slots (Batch: 847,457 slots/s / Avg: 980,109 slots/s)\n", - "20:08:57 INFO loader :: Index SPO->OSP: 6,900,000 slots (Batch: 862,068 slots/s / Avg: 978,168 slots/s)\n", - "20:08:57 INFO loader :: Index 
SPO->OSP: 7,000,000 slots (Batch: 862,068 slots/s / Avg: 976,290 slots/s)\n", - "20:08:57 INFO loader :: Elapsed: 69.99 seconds [2024/03/07 20:08:57 UTC]\n", - "20:08:57 INFO loader :: ** Index SPO->OSP: 7,017,664 slots indexed in 7.19 seconds [Rate: 976,166.94 per second]\n", - "20:08:57 INFO loader :: -- Finish triples index phase\n", - "20:08:57 INFO loader :: ** 7,017,664 triples indexed in 15.81 seconds [Rate: 443,987.34 per second]\n", - "20:08:57 INFO loader :: -- Finish triples load\n", - "20:08:57 INFO loader :: ** Completed: 7,017,664 triples loaded in 70.01 seconds [Rate: 100,238.02 per second]\n", - "20:08:57 INFO loader :: -- Finish quads load\n" + "18:36:29 INFO loader :: -- Start triples data phase\n", + "18:36:29 INFO loader :: ** Load empty triples table\n", + "18:36:29 INFO loader :: -- Start quads data phase\n", + "18:36:29 INFO loader :: ** Load empty quads table\n", + "18:36:29 INFO loader :: Load: /fuseki-base/nmdc-db.nt.gz -- 2024/03/11 18:36:29 UTC\n", + "18:36:29 WARN riot :: [line: 32152, col: 92] Bad IRI: Not a valid UUID string: uuid:CSF2-CB-T-6d29d97e-b8d7-4844-a8c3-cc181f4c9909\n", + "18:36:30 INFO loader :: Add: 100,000 triples (Batch: 85,178 / Avg: 85,178)\n", + "18:36:30 WARN riot :: [line: 115631, col: 92] Bad IRI: Not a valid UUID string: uuid:WSU2-CA-B-6ad1d2a3-0949-423a-a549-c801913e1d63\n", + "18:36:30 WARN riot :: [line: 147946, col: 92] Bad IRI: Not a valid UUID string: uuid:UNDE-CB-B-f1c58c85-e8dd-40ce-9f2f-0f30f101562c\n", + "18:36:30 WARN riot :: [line: 161322, col: 92] Bad IRI: Not a valid UUID string: uuid:ORNL-CB-T-349b5bdb-2a1b-4e74-aa7f-a5c973c1d12c\n", + "18:36:31 INFO loader :: Add: 200,000 triples (Batch: 144,300 / Avg: 107,123)\n", + "18:36:31 WARN riot :: [line: 256741, col: 92] Bad IRI: Not a valid UUID string: uuid:NWBA-CB-T-9c9b1f6c-5d5a-4c4a-a2bb-ab7c379844ee\n", + "18:36:31 INFO loader :: Add: 300,000 triples (Batch: 139,082 / Avg: 116,009)\n", + "18:36:32 WARN riot :: [line: 355349, col: 92] Bad IRI: Not a 
valid UUID string: uuid:WY01-CB-T-087448a0-5ec3-4d9e-8fea-1f3c06daabfb\n", + "18:36:32 WARN riot :: [line: 387079, col: 92] Bad IRI: Not a valid UUID string: uuid:WY15-CB-B-e7803387-76a7-45de-8ea1-d6c4146b8a81\n", + "18:36:32 INFO loader :: Add: 400,000 triples (Batch: 138,504 / Avg: 120,918)\n", + "18:36:32 WARN riot :: [line: 443369, col: 92] Bad IRI: Not a valid UUID string: uuid:WSU3-CB-B-c248f426-b677-44df-919e-a9bd18e6fa7b\n", + "18:36:33 INFO loader :: Add: 500,000 triples (Batch: 148,809 / Avg: 125,628)\n", + "18:36:33 WARN riot :: [line: 567871, col: 92] Bad IRI: Not a valid UUID string: uuid:SCBI-CB-B-120b7ecd-9603-4d23-9241-642a4ee95ddd\n", + "18:36:33 INFO loader :: Add: 600,000 triples (Batch: 134,952 / Avg: 127,091)\n", + "18:36:34 INFO loader :: Add: 700,000 triples (Batch: 136,054 / Avg: 128,299)\n", + "18:36:34 WARN riot :: [line: 726265, col: 92] Bad IRI: Not a valid UUID string: uuid:TREE-CB-B-f5b3d5de-f4d5-4e09-94e5-cdd343d22787\n", + "18:36:34 WARN riot :: [line: 726317, col: 92] Bad IRI: Not a valid UUID string: uuid:ORNL-CB-B-502e74cf-2cca-4d0c-800c-dc0580d8b54d\n", + "18:36:34 WARN riot :: [line: 740881, col: 92] Bad IRI: Not a valid UUID string: uuid:MLSB-CB-T-2ca22db1-1704-4174-9a45-69cc790ce9a5\n", + "18:36:35 WARN riot :: [line: 775240, col: 92] Bad IRI: Not a valid UUID string: uuid:NWBC-CB-B-06fd881f-fe3c-43f0-8f75-1fda0772ed35\n", + "18:36:35 INFO loader :: Add: 800,000 triples (Batch: 143,472 / Avg: 130,017)\n", + "18:36:35 WARN riot :: [line: 801689, col: 92] Bad IRI: Not a valid UUID string: uuid:NWBB-CB-B-7355ff1a-b15d-4b06-b9d1-79a293e32030\n", + "18:36:35 WARN riot :: [line: 808962, col: 92] Bad IRI: Not a valid UUID string: uuid:NIWO-CB-B-56096a54-0d7a-4838-9a98-0d68d8d73e0e\n", + "18:36:35 WARN riot :: [line: 813615, col: 92] Bad IRI: Not a valid UUID string: uuid:FTA3-CB-T-4116d3f6-5dfe-45c0-9989-6a6489238fbf\n", + "18:36:35 WARN riot :: [line: 828058, col: 92] Bad IRI: Not a valid UUID string: 
uuid:RMNP-CB-T-55f69c6a-3849-4e45-83ba-1cdfbefc3818\n", + "18:36:35 WARN riot :: [line: 875373, col: 92] Bad IRI: Not a valid UUID string: uuid:JORN-CB-B-a2dfa872-c0d4-49fe-a752-f24be601c13a\n", + "18:36:36 INFO loader :: Add: 900,000 triples (Batch: 137,362 / Avg: 130,794)\n", + "18:36:36 WARN riot :: [line: 912595, col: 92] Bad IRI: Not a valid UUID string: uuid:DCFS-CB-T-4ea961ae-3e1d-4e7f-aca8-615d23195a1a\n", + "18:36:36 WARN riot :: [line: 937846, col: 92] Bad IRI: Not a valid UUID string: uuid:CSF2-CB-B-c99abe38-cbf0-40d1-864a-6fb848212e8b\n", + "18:36:36 INFO loader :: Add: 1,000,000 triples (Batch: 141,442 / Avg: 131,787)\n", + "18:36:36 INFO loader :: Elapsed: 7.59 seconds [2024/03/11 18:36:36 UTC]\n", + "18:36:36 WARN riot :: [line: 1005402, col: 92] Bad IRI: Not a valid UUID string: uuid:UT12-CB-B-84465a7e-6d15-4aad-9d64-5345cb857450\n", + "18:36:36 WARN riot :: [line: 1011697, col: 92] Bad IRI: Not a valid UUID string: uuid:TEAK-CB-B-feb28ff2-0e1f-4be9-bd9a-f92521bd9d90\n", + "18:36:37 WARN riot :: [line: 1027988, col: 92] Bad IRI: Not a valid UUID string: uuid:OCTB-CB-B-40166778-406d-4f49-bcad-4555c3e038a8\n", + "18:36:37 WARN riot :: [line: 1072458, col: 92] Bad IRI: Not a valid UUID string: uuid:UT32-CB-B-145460ed-1b7d-4ee1-a90d-76f0b6ef62e8\n", + "18:36:37 INFO loader :: Add: 1,100,000 triples (Batch: 148,809 / Avg: 133,171)\n", + "18:36:37 WARN riot :: [line: 1115568, col: 92] Bad IRI: Not a valid UUID string: uuid:KONZ-CB-T-e70befb2-f3a7-4cd7-afac-ef817885fe44\n", + "18:36:38 WARN riot :: [line: 1192531, col: 92] Bad IRI: Not a valid UUID string: uuid:OCTB-CB-T-3fe4e64c-6555-4cf9-a1a1-267a3efd36c5\n", + "18:36:38 INFO loader :: Add: 1,200,000 triples (Batch: 156,739 / Avg: 134,861)\n", + "18:36:38 WARN riot :: [line: 1244047, col: 92] Bad IRI: Not a valid UUID string: uuid:SJER-CB-B-e7d303b7-60ac-426c-a2ad-4216273b3d4a\n", + "18:36:38 WARN riot :: [line: 1264549, col: 92] Bad IRI: Not a valid UUID string: 
uuid:PETF-CB-T-cef6133d-59d9-44f6-8c22-29664736616d\n", + "18:36:38 INFO loader :: Add: 1,300,000 triples (Batch: 151,057 / Avg: 135,983)\n", + "18:36:39 INFO loader :: Add: 1,400,000 triples (Batch: 152,905 / Avg: 137,066)\n", + "18:36:39 WARN riot :: [line: 1476007, col: 92] Bad IRI: Not a valid UUID string: uuid:WREF-CB-B-36d4dd11-2249-4825-b781-2feb79da2c29\n", + "18:36:40 WARN riot :: [line: 1486998, col: 92] Bad IRI: Not a valid UUID string: uuid:KONA-CB-B-504c5931-f7cf-47aa-a2c0-ce90bcb3a0e3\n", + "18:36:40 WARN riot :: [line: 1488177, col: 92] Bad IRI: Not a valid UUID string: uuid:SERC-CB-T-a591f903-459e-409e-b1e8-f1a213d60bce\n", + "18:36:40 INFO loader :: Add: 1,500,000 triples (Batch: 147,058 / Avg: 137,690)\n", + "18:36:40 WARN riot :: [line: 1554300, col: 92] Bad IRI: Not a valid UUID string: uuid:NWBA-CB-B-1bfcbda1-368e-4963-b502-698284f44440\n", + "18:36:40 INFO loader :: Add: 1,600,000 triples (Batch: 157,977 / Avg: 138,804)\n", + "18:36:40 WARN riot :: [line: 1614508, col: 92] Bad IRI: Not a valid UUID string: uuid:PPRH-CB-B-40a7fdcf-7d5a-4d3d-ae81-42253fc94370\n", + "18:36:41 WARN riot :: [line: 1667631, col: 92] Bad IRI: Not a valid UUID string: uuid:SOAP-CB-T-e0235661-d3be-4195-bab0-105df1d4b9b8\n", + "18:36:41 INFO loader :: Add: 1,700,000 triples (Batch: 147,275 / Avg: 139,275)\n", + "18:36:41 WARN riot :: [line: 1701276, col: 92] Bad IRI: Not a valid UUID string: uuid:PHTU-CB-B-c68c9d04-9501-416d-9150-f647665fe15b\n", + "18:36:41 WARN riot :: [line: 1708816, col: 92] Bad IRI: Not a valid UUID string: uuid:STEI-CB-B-d3aa5d85-ac22-4dfb-9133-8b10ee3aaa09\n", + "18:36:41 WARN riot :: [line: 1714281, col: 92] Bad IRI: Not a valid UUID string: uuid:OAES-CB-B-f1f5c32d-8cfb-4d78-ac69-dd89510c8d03\n", + "18:36:42 WARN riot :: [line: 1784097, col: 92] Bad IRI: Not a valid UUID string: uuid:SOAP-CB-B-3b4186ac-3481-4959-82a6-420961ffb8c3\n", + "18:36:42 INFO loader :: Add: 1,800,000 triples (Batch: 148,809 / Avg: 139,773)\n", + "18:36:42 WARN riot :: 
[line: 1863809, col: 92] Bad IRI: Not a valid UUID string: uuid:KONZ-CB-B-ea6e2ff0-c546-49dd-9f72-c762a6db55cb\n", + "18:36:42 INFO loader :: Add: 1,900,000 triples (Batch: 146,842 / Avg: 140,128)\n", + "18:36:42 WARN riot :: [line: 1915481, col: 92] Bad IRI: Not a valid UUID string: uuid:TEAK-CB-T-0d2245d4-c6da-4723-95be-ca5aefe607de\n", + "18:36:43 WARN riot :: [line: 1974175, col: 92] Bad IRI: Not a valid UUID string: uuid:ISNC-CB-T-4eb8355f-18a1-4efc-8524-506e64f0937d\n", + "18:36:43 INFO loader :: Add: 2,000,000 triples (Batch: 154,320 / Avg: 140,775)\n", + "18:36:43 INFO loader :: Elapsed: 14.21 seconds [2024/03/11 18:36:43 UTC]\n", + "18:36:43 WARN riot :: [line: 2068938, col: 92] Bad IRI: Not a valid UUID string: uuid:MOAB-CB-B-03a9d128-c9b5-4bf8-a6cc-0757a0c5af8a\n", + "18:36:44 INFO loader :: Add: 2,100,000 triples (Batch: 147,710 / Avg: 141,091)\n", + "18:36:44 WARN riot :: [line: 2194904, col: 92] Bad IRI: Not a valid UUID string: uuid:UT19-CB-B-7641dbf9-c478-4252-b2ef-dc6fc4c2baf7\n", + "18:36:44 INFO loader :: Add: 2,200,000 triples (Batch: 148,367 / Avg: 141,406)\n", + "18:36:44 WARN riot :: [line: 2223567, col: 92] Bad IRI: Not a valid UUID string: uuid:ANZA-CB-T-a11a1232-abdc-4192-b792-f10d418ad550\n", + "18:36:45 INFO loader :: Add: 2,300,000 triples (Batch: 148,367 / Avg: 141,695)\n", + "18:36:45 WARN riot :: [line: 2364508, col: 92] Bad IRI: Not a valid UUID string: uuid:OSBS-CB-B-330d6d90-17b2-4452-ba83-6e870e8cad10\n", + "18:36:46 INFO loader :: Add: 2,400,000 triples (Batch: 153,846 / Avg: 142,163)\n", + "18:36:46 INFO loader :: Add: 2,500,000 triples (Batch: 147,710 / Avg: 142,377)\n", + "18:36:47 WARN riot :: [line: 2538674, col: 92] Bad IRI: Not a valid UUID string: uuid:GRSM-CB-T-fe711015-d950-4b51-98a3-5308f92ec8c7\n", + "18:36:47 WARN riot :: [line: 2569882, col: 92] Bad IRI: Not a valid UUID string: uuid:RMNP-CB-B-f0507935-ba93-42db-9f27-9383e7932ac3\n", + "18:36:47 INFO loader :: Add: 2,600,000 triples (Batch: 145,985 / Avg: 
142,512)\n", + "18:36:47 WARN riot :: [line: 2608185, col: 92] Bad IRI: Not a valid UUID string: uuid:WY01-CB-B-1db0d919-0432-46d3-8835-8c2ec8cf4132\n", + "18:36:47 WARN riot :: [line: 2616344, col: 92] Bad IRI: Not a valid UUID string: uuid:JORN-CB-T-ec1b45a6-c27d-4fb2-8d1f-2b1978300df0\n", + "18:36:47 WARN riot :: [line: 2622116, col: 92] Bad IRI: Not a valid UUID string: uuid:WY03-CB-T-2d5a8790-1c24-4059-97e9-668b51bf544d\n", + "18:36:48 WARN riot :: [line: 2688416, col: 92] Bad IRI: Not a valid UUID string: uuid:SERC-CB-B-a13e7129-c37b-449a-8172-bf04166ae49e\n", + "18:36:48 WARN riot :: [line: 2699314, col: 92] Bad IRI: Not a valid UUID string: uuid:GRSM-CB-B-a59d0367-64bc-4f5e-929d-805ff0b1bc0d\n", + "18:36:48 INFO loader :: Add: 2,700,000 triples (Batch: 145,348 / Avg: 142,615)\n", + "18:36:48 WARN riot :: [line: 2738345, col: 92] Bad IRI: Not a valid UUID string: uuid:LENO-CB-T-a3029b48-f488-41b7-901f-8696e7c46046\n", + "18:36:48 WARN riot :: [line: 2745497, col: 92] Bad IRI: Not a valid UUID string: uuid:PSR2-CB-B-a9123b93-226f-4ac9-86f6-da9030cb4603\n", + "18:36:48 INFO loader :: Add: 2,800,000 triples (Batch: 154,320 / Avg: 143,003)\n", + "18:36:48 WARN riot :: [line: 2807264, col: 92] Bad IRI: Not a valid UUID string: uuid:CPER-CB-B-22f145ec-fb3a-4b6e-bc1d-8de45d79b5a1\n", + "18:36:49 WARN riot :: [line: 2825183, col: 92] Bad IRI: Not a valid UUID string: uuid:PSR1-CB-B-60b61105-073e-4d04-b3dc-452479bbae4b\n", + "18:36:49 WARN riot :: [line: 2861733, col: 92] Bad IRI: Not a valid UUID string: uuid:DELA-CB-T-13ba6115-12fc-47cc-8cb0-ebf65e1d23d1\n", + "18:36:49 INFO loader :: Add: 2,900,000 triples (Batch: 144,092 / Avg: 143,040)\n", + "18:36:49 WARN riot :: [line: 2966452, col: 92] Bad IRI: Not a valid UUID string: uuid:SJER-CB-T-3b94b737-8ae6-452a-a916-4bac93f7a750\n", + "18:36:50 WARN riot :: [line: 2980905, col: 92] Bad IRI: Not a valid UUID string: uuid:ONAQ-CB-T-3294fa6a-beb1-4bb2-a4b3-e2e13b50bcc3\n", + "18:36:50 INFO loader :: Add: 3,000,000 
triples (Batch: 144,927 / Avg: 143,102)\n", + "18:36:50 INFO loader :: Elapsed: 20.97 seconds [2024/03/11 18:36:50 UTC]\n", + "18:36:50 WARN riot :: [line: 3002262, col: 92] Bad IRI: Not a valid UUID string: uuid:NIWO-CB-T-8fa5540a-3ecc-43f3-8dea-9c9372d12826\n", + "18:36:50 WARN riot :: [line: 3024503, col: 92] Bad IRI: Not a valid UUID string: uuid:WOOD-CB-B-8e2da002-85ce-474f-a13d-561087b17bd6\n", + "18:36:50 WARN riot :: [line: 3032155, col: 92] Bad IRI: Not a valid UUID string: uuid:BLAN-CB-B-a59c1a9c-e301-4f88-aff4-cf8f072d9cbc\n", + "18:36:50 WARN riot :: [line: 3044531, col: 92] Bad IRI: Not a valid UUID string: uuid:WSU1-CB-B-19e2cefa-49f2-499b-841d-33cc7ed8e6a7\n", + "18:36:50 WARN riot :: [line: 3090258, col: 92] Bad IRI: Not a valid UUID string: uuid:WY10-CB-B-87bbe689-217d-48d9-b6c0-7d4bf8f3540b\n", + "18:36:50 WARN riot :: [line: 3096825, col: 92] Bad IRI: Not a valid UUID string: uuid:NWBC-CB-T-1ab40003-8f10-4a2a-9ba1-911200388f64\n", + "18:36:50 INFO loader :: Add: 3,100,000 triples (Batch: 144,927 / Avg: 143,160)\n", + "18:36:50 WARN riot :: [line: 3111811, col: 92] Bad IRI: Not a valid UUID string: uuid:ISNC-CB-B-41c7672d-1b29-40e8-af49-137b03473cec\n", + "18:36:51 WARN riot :: [line: 3150508, col: 92] Bad IRI: Not a valid UUID string: uuid:TALL-CB-T-b47d748c-de50-4d2b-8bb3-85bcc20a231b\n", + "18:36:51 WARN riot :: [line: 3198278, col: 92] Bad IRI: Not a valid UUID string: uuid:SRER-CB-B-62d593bb-711a-4eb2-8359-403de43c9ae5\n", + "18:36:51 INFO loader :: Add: 3,200,000 triples (Batch: 154,083 / Avg: 143,478)\n", + "18:36:51 WARN riot :: [line: 3235560, col: 92] Bad IRI: Not a valid UUID string: uuid:UT19-CB-T-cc59032d-d8e6-4da3-95eb-df9b2ce3d7e3\n", + "18:36:52 INFO loader :: Add: 3,300,000 triples (Batch: 145,560 / Avg: 143,540)\n", + "18:36:52 WARN riot :: [line: 3342972, col: 92] Bad IRI: Not a valid UUID string: uuid:UKFS-CB-B-18631570-14bc-4401-80e0-85559c5eb037\n", + "18:36:52 WARN riot :: [line: 3358263, col: 92] Bad IRI: Not a valid UUID 
string: uuid:WOOD-CB-T-2db0e51c-90fa-481e-9432-61da1dfb5e15\n", + "18:36:52 WARN riot :: [line: 3382336, col: 92] Bad IRI: Not a valid UUID string: uuid:TREE-CB-T-03bc7e4f-a834-4d78-b733-e3d863ea640f\n", + "18:36:52 INFO loader :: Add: 3,400,000 triples (Batch: 145,137 / Avg: 143,587)\n", + "18:36:53 WARN riot :: [line: 3432388, col: 92] Bad IRI: Not a valid UUID string: uuid:WREF-CB-T-b7c5dd99-510e-403a-a79e-d45e0af78fb3\n", + "18:36:53 INFO loader :: Add: 3,500,000 triples (Batch: 146,842 / Avg: 143,678)\n", + "18:36:53 WARN riot :: [line: 3536909, col: 92] Bad IRI: Not a valid UUID string: uuid:UNDE-CB-T-f15c6148-9ebf-41d1-812b-7ba7c15d9d96\n", + "18:36:54 INFO loader :: Add: 3,600,000 triples (Batch: 145,560 / Avg: 143,729)\n", + "18:36:54 WARN riot :: [line: 3610058, col: 92] Bad IRI: Not a valid UUID string: uuid:UT23-CB-B-e76880f1-deec-4076-9e1a-1377fd94849a\n", + "18:36:54 WARN riot :: [line: 3624992, col: 92] Bad IRI: Not a valid UUID string: uuid:PETF-CB-B-9196012d-042a-47b4-986a-21ead1040e31\n", + "18:36:54 WARN riot :: [line: 3627424, col: 92] Bad IRI: Not a valid UUID string: uuid:CSF1-CB-T-8e2cab37-6db5-4377-8eab-293a58400041\n", + "18:36:54 INFO loader :: Add: 3,700,000 triples (Batch: 143,678 / Avg: 143,728)\n", + "18:36:55 WARN riot :: [line: 3714832, col: 92] Bad IRI: Not a valid UUID string: uuid:SRR1-CB-T-be47ce48-f9bc-4898-8b81-199ace9dc65f\n", + "18:36:55 WARN riot :: [line: 3729599, col: 92] Bad IRI: Not a valid UUID string: uuid:SRR1-CB-B-c7d1d4e9-aa53-4c64-88c7-8f8ed276a613\n", + "18:36:55 WARN riot :: [line: 3752249, col: 92] Bad IRI: Not a valid UUID string: uuid:OCTU-CB-B-c486e8fb-c960-4f2a-97a7-9bb87a1c7147\n", + "18:36:55 INFO loader :: Add: 3,800,000 triples (Batch: 144,300 / Avg: 143,743)\n", + "18:36:56 WARN riot :: [line: 3869379, col: 92] Bad IRI: Not a valid UUID string: uuid:HARV-CB-B-c2af199e-fe66-41fc-93a9-50166b4c6bb5\n", + "18:36:56 WARN riot :: [line: 3886553, col: 92] Bad IRI: Not a valid UUID string: 
uuid:WLLO-CB-B-68043fc5-8096-41a1-9df6-b67edd414c4f\n", + "18:36:56 INFO loader :: Add: 3,900,000 triples (Batch: 153,609 / Avg: 143,980)\n", + "18:36:56 WARN riot :: [line: 3925469, col: 92] Bad IRI: Not a valid UUID string: uuid:SRER-CB-T-534f50bf-ef66-4ed1-90f1-533ef4b52528\n", + "18:36:56 WARN riot :: [line: 3925512, col: 92] Bad IRI: Not a valid UUID string: uuid:DSNY-CB-T-16477917-d489-45ca-a926-5acc7eaca071\n", + "18:36:56 WARN riot :: [line: 3937346, col: 92] Bad IRI: Not a valid UUID string: uuid:UKFS-CB-T-9a091afa-d16e-48de-b98c-82521e3a95fa\n", + "18:36:57 INFO loader :: Add: 4,000,000 triples (Batch: 143,266 / Avg: 143,962)\n", + "18:36:57 INFO loader :: Elapsed: 27.79 seconds [2024/03/11 18:36:57 UTC]\n", + "18:36:57 INFO loader :: Add: 4,100,000 triples (Batch: 145,560 / Avg: 144,001)\n", + "18:36:58 WARN riot :: [line: 4192285, col: 92] Bad IRI: Not a valid UUID string: uuid:PPRH-CB-T-a18a96d5-cf4b-4707-ab31-c605aeddbfdf\n", + "18:36:58 INFO loader :: Add: 4,200,000 triples (Batch: 154,320 / Avg: 144,230)\n", + "18:36:58 WARN riot :: [line: 4221923, col: 92] Bad IRI: Not a valid UUID string: uuid:WSU1-CB-T-90f166a5-20ce-4c9b-b4a4-4a587395ffae\n", + "18:36:58 WARN riot :: [line: 4228711, col: 92] Bad IRI: Not a valid UUID string: uuid:TALL-CB-B-306ac187-5bcc-436a-9078-e0de4df5ad2e\n", + "18:36:58 WARN riot :: [line: 4273110, col: 92] Bad IRI: Not a valid UUID string: uuid:UT12-CB-T-97b626b3-a280-4e40-8212-282db79cbc69\n", + "18:36:59 WARN riot :: [line: 4297981, col: 92] Bad IRI: Not a valid UUID string: uuid:NWBB-CB-T-3af18d1b-014b-4e70-8d96-6920244ddb79\n", + "18:36:59 INFO loader :: Add: 4,300,000 triples (Batch: 144,927 / Avg: 144,246)\n", + "18:36:59 WARN riot :: [line: 4360646, col: 92] Bad IRI: Not a valid UUID string: uuid:NOGP-CB-B-c51d20a5-51d1-4b16-92cc-d885700656c3\n", + "18:36:59 WARN riot :: [line: 4381650, col: 92] Bad IRI: Not a valid UUID string: uuid:UT23-CB-T-c70411c3-0a2f-43ae-a58d-204fb8c48c46\n", + "18:36:59 WARN riot :: [line: 
4393103, col: 92] Bad IRI: Not a valid UUID string: uuid:JERC-CB-B-cfba0afb-7a48-4edc-93f8-1e89b3025c3d\n", + "18:36:59 INFO loader :: Add: 4,400,000 triples (Batch: 145,560 / Avg: 144,276)\n", + "18:36:59 WARN riot :: [line: 4414710, col: 92] Bad IRI: Not a valid UUID string: uuid:PSR2-CB-T-177b015d-e006-4aca-977d-c9c118f6ec69\n", + "18:36:59 WARN riot :: [line: 4419926, col: 92] Bad IRI: Not a valid UUID string: uuid:MLSB-CB-B-2d4aa025-e90e-4ce4-b3b5-fd7ff2bdf80b\n", + "18:37:00 WARN riot :: [line: 4440958, col: 92] Bad IRI: Not a valid UUID string: uuid:UT32-CB-T-38e515a2-ddec-41e0-be29-4fce953451fa\n", + "18:37:00 WARN riot :: [line: 4483914, col: 92] Bad IRI: Not a valid UUID string: uuid:FTA3-CB-B-6c80279c-0ca3-4835-bfea-501e43fdc1a6\n", + "18:37:00 INFO loader :: Add: 4,500,000 triples (Batch: 142,045 / Avg: 144,226)\n", + "18:37:00 WARN riot :: [line: 4513243, col: 92] Bad IRI: Not a valid UUID string: uuid:FTA5-CB-T-6930cad4-38f4-4eab-8b53-45805fbcc40d\n", + "18:37:00 WARN riot :: [line: 4559759, col: 92] Bad IRI: Not a valid UUID string: uuid:MOAB-CB-T-40bbee20-6cee-4430-bc25-2a5f21ce31fb\n", + "18:37:01 INFO loader :: Add: 4,600,000 triples (Batch: 156,250 / Avg: 144,467)\n", + "18:37:01 WARN riot :: [line: 4612252, col: 92] Bad IRI: Not a valid UUID string: uuid:OSBS-CB-T-f2be224e-7356-4cd0-b585-4f408c30b59f\n", + "18:37:01 WARN riot :: [line: 4697154, col: 92] Bad IRI: Not a valid UUID string: uuid:BLAN-CB-T-298f4fa4-a6ae-4cd1-8ac4-ed7b239a6abf\n", + "18:37:01 INFO loader :: Add: 4,700,000 triples (Batch: 148,148 / Avg: 144,544)\n", + "18:37:01 WARN riot :: [line: 4703778, col: 92] Bad IRI: Not a valid UUID string: uuid:STEI-CB-T-73e26647-1f1e-43cc-84b0-2fd658886f7d\n", + "18:37:02 INFO loader :: Add: 4,800,000 triples (Batch: 145,772 / Avg: 144,569)\n", + "18:37:02 WARN riot :: [line: 4863602, col: 92] Bad IRI: Not a valid UUID string: uuid:WSU3-CB-T-4aaf3e2e-30af-4cb3-bfb3-e4cbeae25f62\n", + "18:37:03 INFO loader :: Add: 4,900,000 triples (Batch: 
157,232 / Avg: 144,807)\n", + "18:37:03 WARN riot :: [line: 4945591, col: 92] Bad IRI: Not a valid UUID string: uuid:WY03-CB-B-37aa5072-bb2c-40da-a03f-cbd4acb0f135\n", + "18:37:03 WARN riot :: [line: 4986491, col: 92] Bad IRI: Not a valid UUID string: uuid:CSF1-CB-B-1171add7-ad79-4c39-9e8f-977fdf0fc9e8\n", + "18:37:03 INFO loader :: Add: 5,000,000 triples (Batch: 147,710 / Avg: 144,864)\n", + "18:37:03 INFO loader :: Elapsed: 34.52 seconds [2024/03/11 18:37:03 UTC]\n", + "18:37:03 WARN riot :: [line: 5000989, col: 92] Bad IRI: Not a valid UUID string: uuid:WLLO-CB-T-ddc7635d-24f0-4456-8f9f-3680b08db779\n", + "18:37:03 WARN riot :: [line: 5034335, col: 92] Bad IRI: Not a valid UUID string: uuid:ISCC-CB-T-5c5875b4-abd6-4226-b0ab-fdc9637ebb98\n", + "18:37:04 INFO loader :: Add: 5,100,000 triples (Batch: 145,560 / Avg: 144,878)\n", + "18:37:04 WARN riot :: [line: 5141558, col: 92] Bad IRI: Not a valid UUID string: uuid:WLUP-CB-T-a07898a8-bff9-4bc2-ae4c-3456825b1a81\n", + "18:37:04 WARN riot :: [line: 5166507, col: 92] Bad IRI: Not a valid UUID string: uuid:CPER-CB-T-b8027938-7ff4-46c1-8575-e23584f1e898\n", + "18:37:05 INFO loader :: Add: 5,200,000 triples (Batch: 134,048 / Avg: 144,653)\n", + "18:37:05 WARN riot :: [line: 5206584, col: 92] Bad IRI: Not a valid UUID string: uuid:DELA-CB-B-dc7411ef-453e-4b91-9885-a69fce891f8b\n", + "18:37:05 WARN riot :: [line: 5236916, col: 92] Bad IRI: Not a valid UUID string: uuid:JERC-CB-T-f3a0b7b4-b141-4080-9d42-394e5ba1c35c\n", + "18:37:05 INFO loader :: Add: 5,300,000 triples (Batch: 145,560 / Avg: 144,670)\n", + "18:37:06 WARN riot :: [line: 5328212, col: 92] Bad IRI: Not a valid UUID string: uuid:PHTU-CB-T-3c3d8153-072e-4f66-b7f8-a7c2f9fed5df\n", + "18:37:06 WARN riot :: [line: 5376098, col: 92] Bad IRI: Not a valid UUID string: uuid:CLBJ-CB-T-1f0d1871-90b7-445d-906d-71685a8ccae2\n", + "18:37:06 INFO loader :: Add: 5,400,000 triples (Batch: 150,602 / Avg: 144,775)\n", + "18:37:06 WARN riot :: [line: 5452752, col: 92] Bad IRI: 
Not a valid UUID string: uuid:WY09-CB-B-cfcfdb9b-5d73-41ea-b105-f3fbe1d56c2b\n", + "18:37:06 WARN riot :: [line: 5453537, col: 92] Bad IRI: Not a valid UUID string: uuid:DCFS-CB-B-2357f0eb-9b01-4316-b676-dbc444bbef1d\n", + "18:37:07 WARN riot :: [line: 5478714, col: 92] Bad IRI: Not a valid UUID string: uuid:WSU2-CB-T-f0728f3d-65c8-4075-9d17-47ce2e1de63c\n", + "18:37:07 INFO loader :: Add: 5,500,000 triples (Batch: 144,092 / Avg: 144,763)\n", + "18:37:07 WARN riot :: [line: 5533015, col: 92] Bad IRI: Not a valid UUID string: uuid:ANZA-CB-B-6bc68e7d-7383-4bbe-9789-2f897ab426ca\n", + "18:37:07 WARN riot :: [line: 5549418, col: 92] Bad IRI: Not a valid UUID string: uuid:LENO-CB-B-26b5381f-14b1-401e-a729-4e6d1b421c37\n", + "18:37:07 INFO loader :: Add: 5,600,000 triples (Batch: 158,227 / Avg: 144,983)\n", + "18:37:08 WARN riot :: [line: 5637300, col: 92] Bad IRI: Not a valid UUID string: uuid:DSNY-CB-B-acabb4de-4e97-4820-8d28-07a5b48cc286\n", + "18:37:08 INFO loader :: Add: 5,700,000 triples (Batch: 151,975 / Avg: 145,100)\n", + "18:37:08 WARN riot :: [line: 5752691, col: 92] Bad IRI: Not a valid UUID string: uuid:WY10-CB-T-b6ac5210-79d5-4142-8756-4babab75fca0\n", + "18:37:08 WARN riot :: [line: 5770196, col: 92] Bad IRI: Not a valid UUID string: uuid:FTA5-CB-B-f450aea0-e6fb-4596-9f8b-56dba4635a8b\n", + "18:37:09 INFO loader :: Add: 5,800,000 triples (Batch: 153,846 / Avg: 145,243)\n", + "18:37:09 WARN riot :: [line: 5834345, col: 92] Bad IRI: Not a valid UUID string: uuid:ISCC-CB-B-e95d2dcd-dd4f-42b7-9345-730dbfc2aa53\n", + "18:37:09 INFO loader :: Add: 5,900,000 triples (Batch: 156,739 / Avg: 145,424)\n", + "18:37:10 INFO loader :: Add: 6,000,000 triples (Batch: 153,374 / Avg: 145,549)\n", + "18:37:10 INFO loader :: Elapsed: 41.22 seconds [2024/03/11 18:37:10 UTC]\n", + "18:37:11 INFO loader :: Add: 6,100,000 triples (Batch: 150,375 / Avg: 145,626)\n", + "18:37:11 WARN riot :: [line: 6167981, col: 92] Bad IRI: Not a valid UUID string: 
uuid:KONA-CB-T-326ddf51-8af8-4d02-b946-c561ab741cae\n", + "18:37:11 INFO loader :: Add: 6,200,000 triples (Batch: 151,515 / Avg: 145,717)\n", + "18:37:11 WARN riot :: [line: 6231615, col: 92] Bad IRI: Not a valid UUID string: uuid:NOGP-CB-T-8efdeada-35cc-48e2-a3d2-ea0ac76b142b\n", + "18:37:12 INFO loader :: Add: 6,300,000 triples (Batch: 159,235 / Avg: 145,914)\n", + "18:37:12 WARN riot :: [line: 6305813, col: 92] Bad IRI: Not a valid UUID string: uuid:CLBJ-CB-B-dfc9535a-74f1-44dd-8aad-996b1fa20111\n", + "18:37:12 WARN riot :: [line: 6317298, col: 92] Bad IRI: Not a valid UUID string: uuid:WLUP-CB-B-24cc2fd3-3d49-4d66-9edb-08cc7fce1450\n", + "18:37:13 INFO loader :: Add: 6,400,000 triples (Batch: 137,741 / Avg: 145,779)\n", + "18:37:13 WARN riot :: [line: 6446752, col: 92] Bad IRI: Not a valid UUID string: uuid:WY15-CB-T-44303230-0b7b-4367-b3e3-22c9f70ddbce\n", + "18:37:13 INFO loader :: Add: 6,500,000 triples (Batch: 147,928 / Avg: 145,811)\n", + "18:37:13 WARN riot :: [line: 6516614, col: 92] Bad IRI: Not a valid UUID string: uuid:WY09-CB-T-40d5b98a-b736-4509-9f9f-24d39a3e338b\n", + "18:37:14 WARN riot :: [line: 6544773, col: 92] Bad IRI: Not a valid UUID string: uuid:HARV-CB-T-a1fbe98a-a761-4a91-b8fc-67c4e0d5d292\n", + "18:37:14 WARN riot :: [line: 6546113, col: 92] Bad IRI: Not a valid UUID string: uuid:PSR1-CB-T-b15f1f00-c26a-47d1-969f-91faf5944d58\n", + "18:37:14 WARN riot :: [line: 6546716, col: 92] Bad IRI: Not a valid UUID string: uuid:OCTU-CB-T-9991b6ee-c1d0-4a65-b20e-43bafd3fa050\n", + "18:37:14 WARN riot :: [line: 6595135, col: 92] Bad IRI: Not a valid UUID string: uuid:OAES-CB-T-f7a2632b-871d-4276-92c5-d8e3453d0221\n", + "18:37:14 INFO loader :: Add: 6,600,000 triples (Batch: 139,082 / Avg: 145,705)\n", + "18:37:14 WARN riot :: [line: 6616914, col: 92] Bad IRI: Not a valid UUID string: uuid:SCBI-CB-T-e76a2f85-b6a1-4480-9c67-5ca8390b8399\n", + "18:37:14 WARN riot :: [line: 6629596, col: 92] Bad IRI: Not a valid UUID string: 
uuid:ONAQ-CB-B-d38e0cda-aca4-4bfa-848b-c7f8ad2cc65f\n", + "18:37:15 INFO loader :: Add: 6,700,000 triples (Batch: 134,228 / Avg: 145,519)\n", + "18:37:15 INFO loader :: -- Finish triples data phase\n", + "18:37:15 INFO loader :: ** Data: 6,784,535 triples loaded in 46.68 seconds [Rate: 145,353.83 per second]\n", + "18:37:15 INFO loader :: -- Finish quads data phase\n", + "18:37:15 INFO loader :: -- Start triples index phase\n", + "18:37:15 INFO loader :: Index SPO->POS: 100,000 slots (Batch: 1,408,450 slots/s / Avg: 1,408,450 slots/s)\n", + "18:37:16 INFO loader :: Index SPO->POS: 200,000 slots (Batch: 1,298,701 slots/s / Avg: 1,351,351 slots/s)\n", + "18:37:16 INFO loader :: Index SPO->POS: 300,000 slots (Batch: 934,579 slots/s / Avg: 1,176,470 slots/s)\n", + "18:37:16 INFO loader :: Index SPO->POS: 400,000 slots (Batch: 1,219,512 slots/s / Avg: 1,186,943 slots/s)\n", + "18:37:16 INFO loader :: Index SPO->POS: 500,000 slots (Batch: 1,190,476 slots/s / Avg: 1,187,648 slots/s)\n", + "18:37:16 INFO loader :: Index SPO->POS: 600,000 slots (Batch: 909,090 slots/s / Avg: 1,129,943 slots/s)\n", + "18:37:16 INFO loader :: Index SPO->POS: 700,000 slots (Batch: 1,136,363 slots/s / Avg: 1,130,856 slots/s)\n", + "18:37:16 INFO loader :: Index SPO->POS: 800,000 slots (Batch: 1,162,790 slots/s / Avg: 1,134,751 slots/s)\n", + "18:37:16 INFO loader :: Index SPO->POS: 900,000 slots (Batch: 925,925 slots/s / Avg: 1,107,011 slots/s)\n", + "18:37:16 INFO loader :: Index SPO->POS: 1,000,000 slots (Batch: 1,123,595 slots/s / Avg: 1,108,647 slots/s)\n", + "18:37:16 INFO loader :: Elapsed: 47.58 seconds [2024/03/11 18:37:16 UTC]\n", + "18:37:16 INFO loader :: Index SPO->POS: 1,100,000 slots (Batch: 1,162,790 slots/s / Avg: 1,113,360 slots/s)\n", + "18:37:17 INFO loader :: Index SPO->POS: 1,200,000 slots (Batch: 1,123,595 slots/s / Avg: 1,114,206 slots/s)\n", + "18:37:17 INFO loader :: Index SPO->POS: 1,300,000 slots (Batch: 862,068 slots/s / Avg: 1,089,689 slots/s)\n", + "18:37:17 INFO 
loader :: Index SPO->POS: 1,400,000 slots (Batch: 1,111,111 slots/s / Avg: 1,091,192 slots/s)\n", + "18:37:17 INFO loader :: Index SPO->POS: 1,500,000 slots (Batch: 1,149,425 slots/s / Avg: 1,094,890 slots/s)\n", + "18:37:17 INFO loader :: Index SPO->POS: 1,600,000 slots (Batch: 781,250 slots/s / Avg: 1,068,090 slots/s)\n", + "18:37:17 INFO loader :: Index SPO->POS: 1,700,000 slots (Batch: 884,955 slots/s / Avg: 1,055,245 slots/s)\n", + "18:37:17 INFO loader :: Index SPO->POS: 1,800,000 slots (Batch: 909,090 slots/s / Avg: 1,045,903 slots/s)\n", + "18:37:17 INFO loader :: Index SPO->POS: 1,900,000 slots (Batch: 892,857 slots/s / Avg: 1,036,552 slots/s)\n", + "18:37:17 INFO loader :: Index SPO->POS: 2,000,000 slots (Batch: 746,268 slots/s / Avg: 1,016,776 slots/s)\n", + "18:37:17 INFO loader :: Elapsed: 48.65 seconds [2024/03/11 18:37:17 UTC]\n", + "18:37:18 INFO loader :: Index SPO->POS: 2,100,000 slots (Batch: 869,565 slots/s / Avg: 1,008,645 slots/s)\n", + "18:37:18 INFO loader :: Index SPO->POS: 2,200,000 slots (Batch: 847,457 slots/s / Avg: 1,000,000 slots/s)\n", + "18:37:18 INFO loader :: Index SPO->POS: 2,300,000 slots (Batch: 884,955 slots/s / Avg: 994,379 slots/s)\n", + "18:37:18 INFO loader :: Index SPO->POS: 2,400,000 slots (Batch: 709,219 slots/s / Avg: 977,995 slots/s)\n", + "18:37:18 INFO loader :: Index SPO->POS: 2,500,000 slots (Batch: 862,068 slots/s / Avg: 972,762 slots/s)\n", + "18:37:18 INFO loader :: Index SPO->POS: 2,600,000 slots (Batch: 884,955 slots/s / Avg: 969,064 slots/s)\n", + "18:37:18 INFO loader :: Index SPO->POS: 2,700,000 slots (Batch: 869,565 slots/s / Avg: 964,974 slots/s)\n", + "18:37:18 INFO loader :: Index SPO->POS: 2,800,000 slots (Batch: 724,637 slots/s / Avg: 953,678 slots/s)\n", + "18:37:18 INFO loader :: Index SPO->POS: 2,900,000 slots (Batch: 877,192 slots/s / Avg: 950,819 slots/s)\n", + "18:37:19 INFO loader :: Index SPO->POS: 3,000,000 slots (Batch: 869,565 slots/s / Avg: 947,867 slots/s)\n", + "18:37:19 INFO loader :: 
Elapsed: 49.85 seconds [2024/03/11 18:37:19 UTC]\n", + "18:37:19 INFO loader :: Index SPO->POS: 3,100,000 slots (Batch: 724,637 slots/s / Avg: 938,540 slots/s)\n", + "18:37:19 INFO loader :: Index SPO->POS: 3,200,000 slots (Batch: 854,700 slots/s / Avg: 935,672 slots/s)\n", + "18:37:19 INFO loader :: Index SPO->POS: 3,300,000 slots (Batch: 869,565 slots/s / Avg: 933,521 slots/s)\n", + "18:37:19 INFO loader :: Index SPO->POS: 3,400,000 slots (Batch: 847,457 slots/s / Avg: 930,741 slots/s)\n", + "18:37:19 INFO loader :: Index SPO->POS: 3,500,000 slots (Batch: 729,927 slots/s / Avg: 923,482 slots/s)\n", + "18:37:19 INFO loader :: Index SPO->POS: 3,600,000 slots (Batch: 877,192 slots/s / Avg: 922,131 slots/s)\n", + "18:37:19 INFO loader :: Index SPO->POS: 3,700,000 slots (Batch: 869,565 slots/s / Avg: 920,627 slots/s)\n", + "18:37:20 INFO loader :: Index SPO->POS: 3,800,000 slots (Batch: 847,457 slots/s / Avg: 918,540 slots/s)\n", + "18:37:20 INFO loader :: Index SPO->POS: 3,900,000 slots (Batch: 740,740 slots/s / Avg: 912,921 slots/s)\n", + "18:37:20 INFO loader :: Index SPO->POS: 4,000,000 slots (Batch: 833,333 slots/s / Avg: 910,746 slots/s)\n", + "18:37:20 INFO loader :: Elapsed: 51.07 seconds [2024/03/11 18:37:20 UTC]\n", + "18:37:20 INFO loader :: Index SPO->POS: 4,100,000 slots (Batch: 826,446 slots/s / Avg: 908,486 slots/s)\n", + "18:37:20 INFO loader :: Index SPO->POS: 4,200,000 slots (Batch: 826,446 slots/s / Avg: 906,344 slots/s)\n", + "18:37:20 INFO loader :: Index SPO->POS: 4,300,000 slots (Batch: 740,740 slots/s / Avg: 901,656 slots/s)\n", + "18:37:20 INFO loader :: Index SPO->POS: 4,400,000 slots (Batch: 877,192 slots/s / Avg: 901,085 slots/s)\n", + "18:37:20 INFO loader :: Index SPO->POS: 4,500,000 slots (Batch: 806,451 slots/s / Avg: 898,741 slots/s)\n", + "18:37:21 INFO loader :: Index SPO->POS: 4,600,000 slots (Batch: 800,000 slots/s / Avg: 896,336 slots/s)\n", + "18:37:21 INFO loader :: Index SPO->POS: 4,700,000 slots (Batch: 781,250 slots/s / Avg: 
893,536 slots/s)\n", + "18:37:21 INFO loader :: Index SPO->POS: 4,800,000 slots (Batch: 917,431 slots/s / Avg: 894,021 slots/s)\n", + "18:37:21 INFO loader :: Index SPO->POS: 4,900,000 slots (Batch: 917,431 slots/s / Avg: 894,487 slots/s)\n", + "18:37:21 INFO loader :: Index SPO->POS: 5,000,000 slots (Batch: 869,565 slots/s / Avg: 893,974 slots/s)\n", + "18:37:21 INFO loader :: Elapsed: 52.27 seconds [2024/03/11 18:37:21 UTC]\n", + "18:37:21 INFO loader :: Index SPO->POS: 5,100,000 slots (Batch: 787,401 slots/s / Avg: 891,608 slots/s)\n", + "18:37:21 INFO loader :: Index SPO->POS: 5,200,000 slots (Batch: 917,431 slots/s / Avg: 892,091 slots/s)\n", + "18:37:21 INFO loader :: Index SPO->POS: 5,300,000 slots (Batch: 884,955 slots/s / Avg: 891,955 slots/s)\n", + "18:37:21 INFO loader :: Index SPO->POS: 5,400,000 slots (Batch: 847,457 slots/s / Avg: 891,089 slots/s)\n", + "18:37:22 INFO loader :: Index SPO->POS: 5,500,000 slots (Batch: 793,650 slots/s / Avg: 889,104 slots/s)\n", + "18:37:22 INFO loader :: Index SPO->POS: 5,600,000 slots (Batch: 884,955 slots/s / Avg: 889,030 slots/s)\n", + "18:37:22 INFO loader :: Index SPO->POS: 5,700,000 slots (Batch: 900,900 slots/s / Avg: 889,235 slots/s)\n", + "18:37:22 INFO loader :: Index SPO->POS: 5,800,000 slots (Batch: 769,230 slots/s / Avg: 886,850 slots/s)\n", + "18:37:22 INFO loader :: Index SPO->POS: 5,900,000 slots (Batch: 724,637 slots/s / Avg: 883,498 slots/s)\n", + "18:37:22 INFO loader :: Index SPO->POS: 6,000,000 slots (Batch: 746,268 slots/s / Avg: 880,798 slots/s)\n", + "18:37:22 INFO loader :: Elapsed: 53.49 seconds [2024/03/11 18:37:22 UTC]\n", + "18:37:22 INFO loader :: Index SPO->POS: 6,100,000 slots (Batch: 757,575 slots/s / Avg: 878,456 slots/s)\n", + "18:37:23 INFO loader :: Index SPO->POS: 6,200,000 slots (Batch: 757,575 slots/s / Avg: 876,201 slots/s)\n", + "18:37:23 INFO loader :: Index SPO->POS: 6,300,000 slots (Batch: 746,268 slots/s / Avg: 873,786 slots/s)\n", + "18:37:23 INFO loader :: Index SPO->POS: 
6,400,000 slots (Batch: 735,294 slots/s / Avg: 871,222 slots/s)\n", + "18:37:23 INFO loader :: Index SPO->POS: 6,500,000 slots (Batch: 735,294 slots/s / Avg: 868,751 slots/s)\n", + "18:37:23 INFO loader :: Index SPO->POS: 6,600,000 slots (Batch: 751,879 slots/s / Avg: 866,710 slots/s)\n", + "18:37:23 INFO loader :: Index SPO->POS: 6,700,000 slots (Batch: 769,230 slots/s / Avg: 865,074 slots/s)\n", + "18:37:23 INFO loader :: ** Index SPO->POS: 6,784,535 slots indexed in 7.85 seconds [Rate: 864,051.81 per second]\n", + "18:37:23 INFO loader :: Index SPO->OSP: 100,000 slots (Batch: 1,960,784 slots/s / Avg: 1,960,784 slots/s)\n", + "18:37:23 INFO loader :: Index SPO->OSP: 200,000 slots (Batch: 1,639,344 slots/s / Avg: 1,785,714 slots/s)\n", + "18:37:23 INFO loader :: Index SPO->OSP: 300,000 slots (Batch: 1,587,301 slots/s / Avg: 1,714,285 slots/s)\n", + "18:37:24 INFO loader :: Index SPO->OSP: 400,000 slots (Batch: 1,449,275 slots/s / Avg: 1,639,344 slots/s)\n", + "18:37:24 INFO loader :: Index SPO->OSP: 500,000 slots (Batch: 1,408,450 slots/s / Avg: 1,587,301 slots/s)\n", + "18:37:24 INFO loader :: Index SPO->OSP: 600,000 slots (Batch: 1,388,888 slots/s / Avg: 1,550,387 slots/s)\n", + "18:37:24 INFO loader :: Index SPO->OSP: 700,000 slots (Batch: 1,282,051 slots/s / Avg: 1,505,376 slots/s)\n", + "18:37:24 INFO loader :: Index SPO->OSP: 800,000 slots (Batch: 1,204,819 slots/s / Avg: 1,459,854 slots/s)\n", + "18:37:24 INFO loader :: Index SPO->OSP: 900,000 slots (Batch: 1,123,595 slots/s / Avg: 1,412,872 slots/s)\n", + "18:37:24 INFO loader :: Index SPO->OSP: 1,000,000 slots (Batch: 1,075,268 slots/s / Avg: 1,369,863 slots/s)\n", + "18:37:24 INFO loader :: Elapsed: 55.26 seconds [2024/03/11 18:37:24 UTC]\n", + "18:37:24 INFO loader :: Index SPO->OSP: 1,100,000 slots (Batch: 1,111,111 slots/s / Avg: 1,341,463 slots/s)\n", + "18:37:24 INFO loader :: Index SPO->OSP: 1,200,000 slots (Batch: 1,075,268 slots/s / Avg: 1,314,348 slots/s)\n", + "18:37:24 INFO loader :: Index 
SPO->OSP: 1,300,000 slots (Batch: 1,123,595 slots/s / Avg: 1,297,405 slots/s)\n", + "18:37:24 INFO loader :: Index SPO->OSP: 1,400,000 slots (Batch: 1,020,408 slots/s / Avg: 1,272,727 slots/s)\n", + "18:37:24 INFO loader :: Index SPO->OSP: 1,500,000 slots (Batch: 1,052,631 slots/s / Avg: 1,255,230 slots/s)\n", + "18:37:25 INFO loader :: Index SPO->OSP: 1,600,000 slots (Batch: 1,020,408 slots/s / Avg: 1,237,432 slots/s)\n", + "18:37:25 INFO loader :: Index SPO->OSP: 1,700,000 slots (Batch: 990,099 slots/s / Avg: 1,219,512 slots/s)\n", + "18:37:25 INFO loader :: Index SPO->OSP: 1,800,000 slots (Batch: 1,063,829 slots/s / Avg: 1,209,677 slots/s)\n", + "18:37:25 INFO loader :: Index SPO->OSP: 1,900,000 slots (Batch: 1,041,666 slots/s / Avg: 1,199,494 slots/s)\n", + "18:37:25 INFO loader :: Index SPO->OSP: 2,000,000 slots (Batch: 1,075,268 slots/s / Avg: 1,192,605 slots/s)\n", + "18:37:25 INFO loader :: Elapsed: 56.21 seconds [2024/03/11 18:37:25 UTC]\n", + "18:37:25 INFO loader :: Index SPO->OSP: 2,100,000 slots (Batch: 1,030,927 slots/s / Avg: 1,183,765 slots/s)\n", + "18:37:25 INFO loader :: Index SPO->OSP: 2,200,000 slots (Batch: 1,052,631 slots/s / Avg: 1,177,100 slots/s)\n", + "18:37:25 INFO loader :: Index SPO->OSP: 2,300,000 slots (Batch: 1,030,927 slots/s / Avg: 1,169,888 slots/s)\n", + "18:37:25 INFO loader :: Index SPO->OSP: 2,400,000 slots (Batch: 1,075,268 slots/s / Avg: 1,165,614 slots/s)\n", + "18:37:25 INFO loader :: Index SPO->OSP: 2,500,000 slots (Batch: 980,392 slots/s / Avg: 1,156,871 slots/s)\n", + "18:37:26 INFO loader :: Index SPO->OSP: 2,600,000 slots (Batch: 1,041,666 slots/s / Avg: 1,151,971 slots/s)\n", + "18:37:26 INFO loader :: Index SPO->OSP: 2,700,000 slots (Batch: 970,873 slots/s / Avg: 1,144,067 slots/s)\n", + "18:37:26 INFO loader :: Index SPO->OSP: 2,800,000 slots (Batch: 1,063,829 slots/s / Avg: 1,140,994 slots/s)\n", + "18:37:26 INFO loader :: Index SPO->OSP: 2,900,000 slots (Batch: 1,000,000 slots/s / Avg: 1,135,473 slots/s)\n", + 
"18:37:26 INFO loader :: Index SPO->OSP: 3,000,000 slots (Batch: 952,380 slots/s / Avg: 1,128,243 slots/s)\n", + "18:37:26 INFO loader :: Elapsed: 57.19 seconds [2024/03/11 18:37:26 UTC]\n", + "18:37:26 INFO loader :: Index SPO->OSP: 3,100,000 slots (Batch: 925,925 slots/s / Avg: 1,120,346 slots/s)\n", + "18:37:26 INFO loader :: Index SPO->OSP: 3,200,000 slots (Batch: 917,431 slots/s / Avg: 1,112,656 slots/s)\n", + "18:37:26 INFO loader :: Index SPO->OSP: 3,300,000 slots (Batch: 909,090 slots/s / Avg: 1,105,157 slots/s)\n", + "18:37:26 INFO loader :: Index SPO->OSP: 3,400,000 slots (Batch: 934,579 slots/s / Avg: 1,099,256 slots/s)\n", + "18:37:26 INFO loader :: Index SPO->OSP: 3,500,000 slots (Batch: 952,380 slots/s / Avg: 1,094,434 slots/s)\n", + "18:37:27 INFO loader :: Index SPO->OSP: 3,600,000 slots (Batch: 934,579 slots/s / Avg: 1,089,258 slots/s)\n", + "18:37:27 INFO loader :: Index SPO->OSP: 3,700,000 slots (Batch: 1,041,666 slots/s / Avg: 1,087,915 slots/s)\n", + "18:37:27 INFO loader :: Index SPO->OSP: 3,800,000 slots (Batch: 1,075,268 slots/s / Avg: 1,087,578 slots/s)\n", + "18:37:27 INFO loader :: Index SPO->OSP: 3,900,000 slots (Batch: 1,010,101 slots/s / Avg: 1,085,443 slots/s)\n", + "18:37:27 INFO loader :: Index SPO->OSP: 4,000,000 slots (Batch: 1,020,408 slots/s / Avg: 1,083,717 slots/s)\n", + "18:37:27 INFO loader :: Elapsed: 58.22 seconds [2024/03/11 18:37:27 UTC]\n", + "18:37:27 INFO loader :: Index SPO->OSP: 4,100,000 slots (Batch: 970,873 slots/s / Avg: 1,080,653 slots/s)\n", + "18:37:27 INFO loader :: Index SPO->OSP: 4,200,000 slots (Batch: 1,000,000 slots/s / Avg: 1,078,582 slots/s)\n", + "18:37:27 INFO loader :: Index SPO->OSP: 4,300,000 slots (Batch: 1,052,631 slots/s / Avg: 1,077,964 slots/s)\n", + "18:37:27 INFO loader :: Index SPO->OSP: 4,400,000 slots (Batch: 1,020,408 slots/s / Avg: 1,076,584 slots/s)\n", + "18:37:27 INFO loader :: Index SPO->OSP: 4,500,000 slots (Batch: 1,000,000 slots/s / Avg: 1,074,755 slots/s)\n", + "18:37:28 INFO 
loader :: Index SPO->OSP: 4,600,000 slots (Batch: 1,010,101 slots/s / Avg: 1,073,261 slots/s)\n", + "18:37:28 INFO loader :: Index SPO->OSP: 4,700,000 slots (Batch: 952,380 slots/s / Avg: 1,070,371 slots/s)\n", + "18:37:28 INFO loader :: Index SPO->OSP: 4,800,000 slots (Batch: 1,000,000 slots/s / Avg: 1,068,804 slots/s)\n", + "18:37:28 INFO loader :: Index SPO->OSP: 4,900,000 slots (Batch: 1,020,408 slots/s / Avg: 1,067,770 slots/s)\n", + "18:37:28 INFO loader :: Index SPO->OSP: 5,000,000 slots (Batch: 1,041,666 slots/s / Avg: 1,067,235 slots/s)\n", + "18:37:28 INFO loader :: Elapsed: 59.22 seconds [2024/03/11 18:37:28 UTC]\n", + "18:37:28 INFO loader :: Index SPO->OSP: 5,100,000 slots (Batch: 970,873 slots/s / Avg: 1,065,162 slots/s)\n", + "18:37:28 INFO loader :: Index SPO->OSP: 5,200,000 slots (Batch: 1,010,101 slots/s / Avg: 1,064,047 slots/s)\n", + "18:37:28 INFO loader :: Index SPO->OSP: 5,300,000 slots (Batch: 1,010,101 slots/s / Avg: 1,062,976 slots/s)\n", + "18:37:28 INFO loader :: Index SPO->OSP: 5,400,000 slots (Batch: 1,010,101 slots/s / Avg: 1,061,946 slots/s)\n", + "18:37:28 INFO loader :: Index SPO->OSP: 5,500,000 slots (Batch: 970,873 slots/s / Avg: 1,060,138 slots/s)\n", + "18:37:29 INFO loader :: Index SPO->OSP: 5,600,000 slots (Batch: 970,873 slots/s / Avg: 1,058,401 slots/s)\n", + "18:37:29 INFO loader :: Index SPO->OSP: 5,700,000 slots (Batch: 952,380 slots/s / Avg: 1,056,338 slots/s)\n", + "18:37:29 INFO loader :: Index SPO->OSP: 5,800,000 slots (Batch: 980,392 slots/s / Avg: 1,054,929 slots/s)\n", + "18:37:29 INFO loader :: Index SPO->OSP: 5,900,000 slots (Batch: 869,565 slots/s / Avg: 1,051,131 slots/s)\n", + "18:37:29 INFO loader :: Index SPO->OSP: 6,000,000 slots (Batch: 826,446 slots/s / Avg: 1,046,389 slots/s)\n", + "18:37:29 INFO loader :: Elapsed: 60.27 seconds [2024/03/11 18:37:29 UTC]\n", + "18:37:29 INFO loader :: Index SPO->OSP: 6,100,000 slots (Batch: 854,700 slots/s / Avg: 1,042,556 slots/s)\n", + "18:37:29 INFO loader :: Index 
SPO->OSP: 6,200,000 slots (Batch: 847,457 slots/s / Avg: 1,038,699 slots/s)\n", + "18:37:29 INFO loader :: Index SPO->OSP: 6,300,000 slots (Batch: 892,857 slots/s / Avg: 1,036,013 slots/s)\n", + "18:37:29 INFO loader :: Index SPO->OSP: 6,400,000 slots (Batch: 877,192 slots/s / Avg: 1,033,091 slots/s)\n", + "18:37:30 INFO loader :: Index SPO->OSP: 6,500,000 slots (Batch: 952,380 slots/s / Avg: 1,031,746 slots/s)\n", + "18:37:30 INFO loader :: Index SPO->OSP: 6,600,000 slots (Batch: 934,579 slots/s / Avg: 1,030,123 slots/s)\n", + "18:37:30 INFO loader :: Index SPO->OSP: 6,700,000 slots (Batch: 990,099 slots/s / Avg: 1,029,502 slots/s)\n", + "18:37:30 INFO loader :: ** Index SPO->OSP: 6,784,535 slots indexed in 6.60 seconds [Rate: 1,028,427.31 per second]\n", + "18:37:30 INFO loader :: -- Finish triples index phase\n", + "18:37:30 INFO loader :: ** 6,784,535 triples indexed in 14.45 seconds [Rate: 469,518.00 per second]\n", + "18:37:30 INFO loader :: -- Finish triples load\n", + "18:37:30 INFO loader :: ** Completed: 6,784,535 triples loaded in 61.13 seconds [Rate: 110,985.36 per second]\n", + "18:37:30 INFO loader :: -- Finish quads load\n" ] } ], @@ -2184,7 +2351,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 26, "id": "1a0bfb4b-e694-40b1-88af-4446e3fcc888", "metadata": {}, "outputs": [ @@ -2195,8 +2362,6 @@ "\u001b[1A\u001b[1B\u001b[0G\u001b[?25l[+] Running 1/0\n", " \u001b[32m✔\u001b[0m Container fuseki \u001b[32mCreated\u001b[0m \u001b[34m0.0s \u001b[0m\n", "\u001b[?25h\u001b[1A\u001b[1A\u001b[0G\u001b[?25l\u001b[34m[+] Running 1/1\u001b[0m\n", - " \u001b[32m✔\u001b[0m Container fuseki \u001b[32mCreated\u001b[0m \u001b[34m0.0s \u001b[0m\n", - "\u001b[?25h\u001b[1A\u001b[1A\u001b[0G\u001b[?25l\u001b[34m[+] Running 1/1\u001b[0m\n", " \u001b[32m✔\u001b[0m Container fuseki \u001b[32mStarted\u001b[0m \u001b[34m0.0s \u001b[0m\n", "\u001b[?25h" ] @@ -2213,6 +2378,14 @@ "source": [ "Now go to and SPARQL it up." 
] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8695001d-9722-48a0-98e8-9ac5000551ea", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { From e483f52b1bea21e3a83759909b2cabcfe68e03fa Mon Sep 17 00:00:00 2001 From: Donny Winston Date: Thu, 14 Mar 2024 13:42:21 -0400 Subject: [PATCH 07/18] feat(sparql): add /nmdcschema/associations endpoint for #401 --- .../notebooks/ghissue_401_sparql.ipynb | 2204 ++++------------- nmdc_runtime/api/endpoints/nmdcschema.py | 98 +- nmdc_runtime/api/endpoints/util.py | 1 + nmdc_runtime/api/models/util.py | 16 +- util/mongodump-nmdc.sh | 4 + 5 files changed, 582 insertions(+), 1741 deletions(-) diff --git a/metadata-translation/notebooks/ghissue_401_sparql.ipynb b/metadata-translation/notebooks/ghissue_401_sparql.ipynb index 4c89ee9f..a9df5a25 100644 --- a/metadata-translation/notebooks/ghissue_401_sparql.ipynb +++ b/metadata-translation/notebooks/ghissue_401_sparql.ipynb @@ -50,9 +50,6 @@ } ], "source": [ - "from dotenv import load_dotenv\n", - "\n", - "load_dotenv(\".env.localhost\")\n", "!env | grep MONGO_HOST" ] }, @@ -89,44 +86,14 @@ "execution_count": 4, "id": "3a0dd489-74cc-47c4-b3e0-c97dd88f5b5f", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['biosample_set',\n", - " 'data_object_set',\n", - " 'extraction_set',\n", - " 'field_research_site_set',\n", - " 'library_preparation_set',\n", - " 'mags_activity_set',\n", - " 'metabolomics_analysis_activity_set',\n", - " 'metagenome_annotation_activity_set',\n", - " 'metagenome_assembly_set',\n", - " 'metagenome_sequencing_activity_set',\n", - " 'metaproteomics_analysis_activity_set',\n", - " 'metatranscriptome_activity_set',\n", - " 'nom_analysis_activity_set',\n", - " 'omics_processing_set',\n", - " 'pooling_set',\n", - " 'processed_sample_set',\n", - " 'read_based_taxonomy_analysis_activity_set',\n", - " 'read_qc_analysis_activity_set',\n", - " 'study_set']" - ] - }, - "execution_count": 4, - "metadata": {}, - 
"output_type": "execute_result" - } - ], + "outputs": [], "source": [ "from nmdc_runtime.util import schema_collection_names_with_id_field\n", "\n", "populated_collections = sorted([\n", " name for name in set(schema_collection_names_with_id_field()) & set(mdb.list_collection_names())\n", " if mdb[name].estimated_document_count() > 0\n", - "])\n", - "populated_collections" + "])" ] }, { @@ -144,1185 +111,7 @@ "metadata": { "scrolled": true }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'@vocab': 'https://w3id.org/nmdc/',\n", - " 'CATH': 'https://bioregistry.io/cath:',\n", - " 'CHEBI': {'@prefix': True},\n", - " 'CHEMBL.COMPOUND': 'https://bioregistry.io/chembl.compound:',\n", - " 'CHMO': {'@prefix': True},\n", - " 'COG': 'https://bioregistry.io/cog:',\n", - " 'Contaminant': 'http://example.org/contaminant/',\n", - " 'CreditAssociation': {},\n", - " 'DRUGBANK': 'https://bioregistry.io/drugbank:',\n", - " 'EC': 'https://bioregistry.io/eccode:',\n", - " 'EFO': 'http://www.ebi.ac.uk/efo/',\n", - " 'EGGNOG': 'https://bioregistry.io/eggnog:',\n", - " 'ENVO': {'@prefix': True},\n", - " 'FBcv': {'@prefix': True},\n", - " 'FMA': {'@prefix': True},\n", - " 'GO': {'@prefix': True},\n", - " 'HMDB': 'https://bioregistry.io/hmdb:',\n", - " 'ISA': 'http://example.org/isa/',\n", - " 'KEGG.COMPOUND': 'https://bioregistry.io/kegg.compound:',\n", - " 'KEGG.ORTHOLOGY': 'https://bioregistry.io/kegg.orthology:',\n", - " 'KEGG.REACTION': 'https://bioregistry.io/kegg.reaction:',\n", - " 'KEGG_PATHWAY': 'https://bioregistry.io/kegg.pathway:',\n", - " 'MASSIVE': 'https://bioregistry.io/reference/massive:',\n", - " 'MESH': 'https://bioregistry.io/mesh:',\n", - " 'MIXS': 'https://w3id.org/mixs/',\n", - " 'MIXS_yaml': 'https://raw.githubusercontent.com/microbiomedata/mixs/main/model/schema/',\n", - " 'MS': {'@prefix': True},\n", - " 'MetaCyc': 'https://bioregistry.io/metacyc.compound:',\n", - " 'MetaNetX': 'http://example.org/metanetx/',\n", - " 
'NCBITaxon': {'@prefix': True},\n", - " 'NCIT': {'@prefix': True},\n", - " 'OBI': {'@prefix': True},\n", - " 'ORCID': 'https://orcid.org/',\n", - " 'PANTHER.FAMILY': 'https://bioregistry.io/panther.family:',\n", - " 'PATO': {'@prefix': True},\n", - " 'PFAM': 'https://bioregistry.io/pfam:',\n", - " 'PO': {'@prefix': True},\n", - " 'PR': {'@prefix': True},\n", - " 'PUBCHEM.COMPOUND': 'https://bioregistry.io/pubchem.compound:',\n", - " 'PlannedProcess': {},\n", - " 'RHEA': 'https://bioregistry.io/rhea:',\n", - " 'RO': {'@prefix': True},\n", - " 'RetroRules': 'http://example.org/retrorules/',\n", - " 'SEED': 'https://bioregistry.io/seed:',\n", - " 'SIO': {'@prefix': True},\n", - " 'SUPFAM': 'https://bioregistry.io/supfam:',\n", - " 'TIGRFAM': 'https://bioregistry.io/tigrfam:',\n", - " 'UBERON': {'@prefix': True},\n", - " 'UO': {'@prefix': True},\n", - " 'UniProtKB': 'https://bioregistry.io/uniprot:',\n", - " 'abs_air_humidity': {'@type': '@id'},\n", - " 'activity_set': {'@type': '@id'},\n", - " 'add_recov_method': {'@type': '@id'},\n", - " 'additional_info': {'@type': '@id'},\n", - " 'address': {'@type': '@id'},\n", - " 'adj_room': {'@type': '@id'},\n", - " 'aero_struc': {'@type': '@id'},\n", - " 'agrochem_addition': {'@type': '@id'},\n", - " 'air_PM_concen': {'@type': '@id'},\n", - " 'air_temp': {'@type': '@id'},\n", - " 'air_temp_regm': {'@type': '@id'},\n", - " 'al_sat': {'@type': '@id'},\n", - " 'al_sat_meth': {'@type': '@id'},\n", - " 'alkalinity': {'@type': '@id'},\n", - " 'alkalinity_method': {'@type': '@id'},\n", - " 'alkyl_diethers': {'@type': '@id'},\n", - " 'all_proteins': {'@type': '@id'},\n", - " 'alt': {'@type': '@id'},\n", - " 'alternative_identifiers': {'@type': '@id'},\n", - " 'aminopept_act': {'@type': '@id'},\n", - " 'ammonium': {'@type': '@id'},\n", - " 'ammonium_nitrogen': {'@type': '@id'},\n", - " 'amount_light': {'@type': '@id'},\n", - " 'analysis_identifiers': {'@type': '@id'},\n", - " 'analysis_type': {'@context': {'@vocab': '@null',\n", - " 
'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'ances_data': {'@type': '@id'},\n", - " 'annual_precpt': {'@type': '@id'},\n", - " 'annual_temp': {'@type': '@id'},\n", - " 'antibiotic_regm': {'@type': '@id'},\n", - " 'api': {'@type': '@id'},\n", - " 'applied_roles': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'applies_to_person': {'@type': '@id'},\n", - " 'arch_struc': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'aromatics_pc': {'@type': '@id'},\n", - " 'asm_score': {'@type': 'xsd:float'},\n", - " 'asphaltenes_pc': {'@type': '@id'},\n", - " 'associated_dois': {'@type': '@id'},\n", - " 'atmospheric_data': {'@type': '@id'},\n", - " 'avg_dew_point': {'@type': '@id'},\n", - " 'avg_occup': {'@type': '@id'},\n", - " 'avg_temp': {'@type': '@id'},\n", - " 'bac_prod': {'@type': '@id'},\n", - " 'bac_resp': {'@type': '@id'},\n", - " 'bacteria_carb_prod': {'@type': '@id'},\n", - " 'barometric_press': {'@type': '@id'},\n", - " 'basin': {'@type': '@id'},\n", - " 'bathroom_count': {'@type': '@id'},\n", - " 'bedroom_count': {'@type': '@id'},\n", - " 'benzene': {'@type': '@id'},\n", - " 'best_protein': {'@type': '@id'},\n", - " 'binned_contig_num': {'@type': 'xsd:integer'},\n", - " 'biochem_oxygen_dem': {'@type': '@id'},\n", - " 'biocide': {'@type': '@id'},\n", - " 'biocide_admin_method': {'@type': '@id'},\n", - " 'biol_stat': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'biolink': 'https://w3id.org/biolink/vocab/',\n", - " 'biomass': {'@type': '@id'},\n", - " 'biomaterial_purity': {'@type': '@id'},\n", - " 'bioproject': 'https://identifiers.org/bioproject:',\n", - " 'biosample': 'https://bioregistry.io/biosample:',\n", - " 
'biosample_categories': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'biosample_identifiers': {'@type': '@id'},\n", - " 'biosample_set': {'@type': '@id'},\n", - " 'biotic_regm': {'@type': '@id'},\n", - " 'biotic_relationship': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'bishomohopanol': {'@type': '@id'},\n", - " 'blood_press_diast': {'@type': '@id'},\n", - " 'blood_press_syst': {'@type': '@id'},\n", - " 'bromide': {'@type': '@id'},\n", - " 'build_docs': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'build_occup_type': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'building_setting': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'built_struc_age': {'@type': '@id'},\n", - " 'built_struc_set': {'@type': '@id'},\n", - " 'built_struc_type': {'@type': '@id'},\n", - " 'bulk_elect_conductivity': {'@type': '@id'},\n", - " 'calcium': {'@type': '@id'},\n", - " 'carb_dioxide': {'@type': '@id'},\n", - " 'carb_monoxide': {'@type': '@id'},\n", - " 'carb_nitro_ratio': {'@type': '@id'},\n", - " 'cas': 'https://bioregistry.io/cas:',\n", - " 'ceil_area': {'@type': '@id'},\n", - " 'ceil_cond': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'ceil_finish_mat': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'ceil_struc': {'@type': '@id'},\n", - " 'ceil_texture': {'@context': {'@vocab': '@null',\n", - " 'description': 
'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'ceil_thermal_mass': {'@type': '@id'},\n", - " 'ceil_type': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'ceil_water_mold': {'@type': '@id'},\n", - " 'chem_administration': {'@type': '@id'},\n", - " 'chem_mutagen': {'@type': '@id'},\n", - " 'chem_oxygen_dem': {'@type': '@id'},\n", - " 'chem_treat_method': {},\n", - " 'chem_treatment': {'@type': '@id'},\n", - " 'chemical': {'@type': '@id'},\n", - " 'chimera_check': {'@type': '@id'},\n", - " 'chloride': {'@type': '@id'},\n", - " 'chlorophyll': {'@type': '@id'},\n", - " 'climate_environment': {'@type': '@id'},\n", - " 'collected_from': {'@type': '@id'},\n", - " 'collecting_biosamples_from_site_set': {'@type': '@id'},\n", - " 'collection_date': {'@type': '@id'},\n", - " 'completeness': {'@type': 'xsd:float'},\n", - " 'compound': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'concentration': {'@type': '@id'},\n", - " 'conduc': {'@type': '@id'},\n", - " 'contained_in': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'container_size': {'@type': '@id'},\n", - " 'contamination': {'@type': 'xsd:float'},\n", - " 'contig_bp': {'@type': 'xsd:float'},\n", - " 'contigs': {'@type': 'xsd:float'},\n", - " 'cool_syst_id': {'@type': '@id'},\n", - " 'count': {'@type': 'xsd:integer'},\n", - " 'crop_rotation': {'@type': '@id'},\n", - " 'ctg_l50': {'@type': 'xsd:float'},\n", - " 'ctg_l90': {'@type': 'xsd:float'},\n", - " 'ctg_logsum': {'@type': 'xsd:float'},\n", - " 'ctg_max': {'@type': 'xsd:float'},\n", - " 'ctg_n50': {'@type': 'xsd:float'},\n", - " 'ctg_n90': {'@type': 'xsd:float'},\n", - " 'ctg_powsum': {'@type': 'xsd:float'},\n", - " 'cult_root_med': {'@type': 
'@id'},\n", - " 'cur_land_use': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'cur_vegetation': {'@type': '@id'},\n", - " 'cur_vegetation_meth': {'@type': '@id'},\n", - " 'data_object_set': {'@type': '@id'},\n", - " 'data_object_type': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'date_last_rain': {'@type': '@id'},\n", - " 'dcterms': 'http://purl.org/dc/terms/',\n", - " 'density': {'@type': '@id'},\n", - " 'depos_env': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'depth': {'@type': '@id'},\n", - " 'description': {},\n", - " 'designated_class': {'@type': '@id'},\n", - " 'dew_point': {'@type': '@id'},\n", - " 'diether_lipids': {'@type': '@id'},\n", - " 'display_order': {'@type': 'xsd:integer'},\n", - " 'diss_carb_dioxide': {'@type': '@id'},\n", - " 'diss_hydrogen': {'@type': '@id'},\n", - " 'diss_inorg_carb': {'@type': '@id'},\n", - " 'diss_inorg_nitro': {'@type': '@id'},\n", - " 'diss_inorg_phosp': {'@type': '@id'},\n", - " 'diss_iron': {'@type': '@id'},\n", - " 'diss_org_carb': {'@type': '@id'},\n", - " 'diss_org_nitro': {'@type': '@id'},\n", - " 'diss_oxygen': {'@type': '@id'},\n", - " 'diss_oxygen_fluid': {'@type': '@id'},\n", - " 'dna_absorb1': {'@type': 'xsd:float'},\n", - " 'dna_absorb2': {'@type': 'xsd:float'},\n", - " 'dna_concentration': {'@type': 'xsd:float'},\n", - " 'dna_cont_type': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'dna_dnase': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'dna_sample_format': {'@context': {'@vocab': '@null',\n", - " 'description': 
'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'dna_volume': {'@type': 'xsd:float'},\n", - " 'dnase_rna': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'doi': 'https://bioregistry.io/doi:',\n", - " 'doi_category': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'doi_provider': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'doi_value': {'@type': '@id'},\n", - " 'door_comp_type': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'door_cond': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'door_direct': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'door_loc': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'door_mat': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'door_move': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'door_size': {'@type': '@id'},\n", - " 'door_type': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'door_type_metal': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 
'door_type_wood': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'door_water_mold': {'@type': '@id'},\n", - " 'down_par': {'@type': '@id'},\n", - " 'drainage_class': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'drawings': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'duration': {'@type': '@id'},\n", - " 'edam.data': {'@prefix': True},\n", - " 'efficiency_percent': {'@type': '@id'},\n", - " 'elev': {'@type': '@id'},\n", - " 'elevator': {'@type': '@id'},\n", - " 'email': {},\n", - " 'embargoed': {'@type': 'xsd:boolean'},\n", - " 'emsl': 'http://example.org/emsl_in_mongodb/',\n", - " 'emsl.project': 'https://bioregistry.io/emsl.project:',\n", - " 'emsl_biosample_identifiers': {'@type': '@id'},\n", - " 'emsl_project_identifiers': {'@type': '@id'},\n", - " 'emsl_uuid_like': 'http://example.org/emsl_uuid_like/',\n", - " 'emulsions': {'@type': '@id'},\n", - " 'encodes': {'@type': '@id'},\n", - " 'end': {'@type': 'xsd:integer'},\n", - " 'env_broad_scale': {'@type': '@id'},\n", - " 'env_local_scale': {'@type': '@id'},\n", - " 'env_medium': {'@type': '@id'},\n", - " 'env_package': {'@type': '@id'},\n", - " 'escalator': {'@type': '@id'},\n", - " 'ethylbenzene': {'@type': '@id'},\n", - " 'exp_duct': {'@type': '@id'},\n", - " 'exp_pipe': {'@type': '@id'},\n", - " 'experimental_factor': {'@type': '@id'},\n", - " 'ext_door': {'@type': '@id'},\n", - " 'ext_wall_orient': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'ext_window_orient': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 
'external_database_identifiers': {'@type': '@id'},\n", - " 'extractant': {'@type': '@id'},\n", - " 'extraction_method': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'extraction_set': {'@type': '@id'},\n", - " 'extraction_target': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'extreme_event': {'@type': '@id'},\n", - " 'fao_class': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'fertilizer_regm': {'@type': '@id'},\n", - " 'field': {'@type': '@id'},\n", - " 'field_research_site_set': {'@type': '@id'},\n", - " 'file_size_bytes': {'@type': 'xsd:long'},\n", - " 'filter_pore_size': {'@type': '@id'},\n", - " 'filter_type': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'fire': {'@type': '@id'},\n", - " 'fireplace_type': {'@type': '@id'},\n", - " 'flooding': {'@type': '@id'},\n", - " 'floor_age': {'@type': '@id'},\n", - " 'floor_area': {'@type': '@id'},\n", - " 'floor_cond': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'floor_count': {'@type': '@id'},\n", - " 'floor_finish_mat': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'floor_struc': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'floor_thermal_mass': {'@type': '@id'},\n", - " 'floor_water_mold': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'fluor': 
{'@type': '@id'},\n", - " 'freq_clean': {'@type': '@id'},\n", - " 'freq_cook': {'@type': '@id'},\n", - " 'functional_annotation_agg': {'@type': '@id'},\n", - " 'functional_annotation_set': {'@type': '@id'},\n", - " 'fungicide_regm': {'@type': '@id'},\n", - " 'furniture': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'gap_pct': {'@type': 'xsd:float'},\n", - " 'gaseous_environment': {'@type': '@id'},\n", - " 'gaseous_substances': {'@type': '@id'},\n", - " 'gc_avg': {'@type': 'xsd:float'},\n", - " 'gc_std': {'@type': 'xsd:float'},\n", - " 'gender_restroom': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'gene_count': {'@type': 'xsd:integer'},\n", - " 'gene_function_id': {'@type': '@id'},\n", - " 'generic': 'https://example.org/generic/',\n", - " 'genetic_mod': {'@type': '@id'},\n", - " 'genome_feature_set': {'@type': '@id'},\n", - " 'geo_loc_name': {'@type': '@id'},\n", - " 'gff_coordinate': {'@type': 'xsd:integer'},\n", - " 'glucosidase_act': {'@type': '@id'},\n", - " 'gnps.task': 'https://bioregistry.io/gnps.task:',\n", - " 'gnps_task_identifiers': {'@type': '@id'},\n", - " 'gold': 'https://bioregistry.io/gold:',\n", - " 'gold_analysis_project_identifiers': {'@type': '@id'},\n", - " 'gold_biosample_identifiers': {'@type': '@id'},\n", - " 'gold_sequencing_project_identifiers': {'@type': '@id'},\n", - " 'gold_study_identifiers': {'@type': '@id'},\n", - " 'gravidity': {'@type': '@id'},\n", - " 'gravity': {'@type': '@id'},\n", - " 'growth_facil': {'@type': '@id'},\n", - " 'growth_habit': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'growth_hormone_regm': {'@type': '@id'},\n", - " 'gtpo': 'http://example.org/gtpo/',\n", - " 'hall_count': {'@type': '@id'},\n", - " 'handidness': 
{'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'has_boolean_value': {'@type': 'xsd:boolean'},\n", - " 'has_credit_associations': {'@type': '@id'},\n", - " 'has_failure_categorization': {'@type': '@id'},\n", - " 'has_input': {'@type': '@id'},\n", - " 'has_maximum_numeric_value': {'@type': 'xsd:float'},\n", - " 'has_metabolite_quantifications': {'@type': '@id'},\n", - " 'has_minimum_numeric_value': {'@type': 'xsd:float'},\n", - " 'has_numeric_value': {'@type': 'xsd:float'},\n", - " 'has_output': {'@type': '@id'},\n", - " 'has_part': {'@type': '@id'},\n", - " 'has_peptide_quantifications': {'@type': '@id'},\n", - " 'has_solution_components': {'@type': '@id'},\n", - " 'hc_produced': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'hcr': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'hcr_fw_salinity': {'@type': '@id'},\n", - " 'hcr_geol_age': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'hcr_pressure': {'@type': '@id'},\n", - " 'hcr_temp': {'@type': '@id'},\n", - " 'heat_cool_type': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'heat_deliv_loc': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'heat_sys_deliv_meth': {},\n", - " 'heat_system_id': {'@type': '@id'},\n", - " 'heavy_metals': {'@type': '@id'},\n", - " 'heavy_metals_meth': {'@type': '@id'},\n", - " 'height_carper_fiber': {'@type': '@id'},\n", - " 'herbicide_regm': {'@type': '@id'},\n", - " 'highest_similarity_score': {'@type': 'xsd:float'},\n", 
- " 'horizon_meth': {'@type': '@id'},\n", - " 'host_age': {'@type': '@id'},\n", - " 'host_body_habitat': {'@type': '@id'},\n", - " 'host_body_product': {'@type': '@id'},\n", - " 'host_body_site': {'@type': '@id'},\n", - " 'host_body_temp': {'@type': '@id'},\n", - " 'host_color': {'@type': '@id'},\n", - " 'host_common_name': {'@type': '@id'},\n", - " 'host_diet': {'@type': '@id'},\n", - " 'host_disease_stat': {'@type': '@id'},\n", - " 'host_dry_mass': {'@type': '@id'},\n", - " 'host_family_relation': {},\n", - " 'host_genotype': {'@type': '@id'},\n", - " 'host_growth_cond': {'@type': '@id'},\n", - " 'host_height': {'@type': '@id'},\n", - " 'host_last_meal': {'@type': '@id'},\n", - " 'host_length': {'@type': '@id'},\n", - " 'host_life_stage': {'@type': '@id'},\n", - " 'host_phenotype': {'@type': '@id'},\n", - " 'host_sex': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'host_shape': {'@type': '@id'},\n", - " 'host_subject_id': {'@type': '@id'},\n", - " 'host_subspecf_genlin': {},\n", - " 'host_substrate': {'@type': '@id'},\n", - " 'host_symbiont': {},\n", - " 'host_taxid': {'@type': '@id'},\n", - " 'host_tot_mass': {'@type': '@id'},\n", - " 'host_wet_mass': {'@type': '@id'},\n", - " 'humidity': {'@type': '@id'},\n", - " 'humidity_regm': {'@type': '@id'},\n", - " 'id': '@id',\n", - " 'igsn': 'https://app.geosamples.org/sample/igsn/',\n", - " 'igsn_biosample_identifiers': {'@type': '@id'},\n", - " 'img.taxon': 'https://bioregistry.io/img.taxon:',\n", - " 'img_identifiers': {'@type': '@id'},\n", - " 'indoor_space': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'indoor_surf': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'indust_eff_percent': {'@type': '@id'},\n", - " 'inorg_particles': 
{'@type': '@id'},\n", - " 'input_base_count': {'@type': 'xsd:float'},\n", - " 'input_contig_num': {'@type': 'xsd:integer'},\n", - " 'input_mass': {'@type': '@id'},\n", - " 'input_read_bases': {'@type': 'xsd:float'},\n", - " 'input_read_count': {'@type': 'xsd:float'},\n", - " 'input_volume': {'@type': '@id'},\n", - " 'insdc_analysis_identifiers': {'@type': '@id'},\n", - " 'insdc_bioproject_identifiers': {'@type': '@id'},\n", - " 'insdc_biosample_identifiers': {'@type': '@id'},\n", - " 'insdc_experiment_identifiers': {'@type': '@id'},\n", - " 'insdc_secondary_sample_identifiers': {'@type': '@id'},\n", - " 'insdc_sra_ena_study_identifiers': {'@type': '@id'},\n", - " 'inside_lux': {'@type': '@id'},\n", - " 'int_wall_cond': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'is_balanced': {'@type': 'xsd:boolean'},\n", - " 'is_diastereoselective': {'@type': 'xsd:boolean'},\n", - " 'is_fully_characterized': {'@type': 'xsd:boolean'},\n", - " 'is_pressurized': {'@type': 'xsd:boolean'},\n", - " 'is_stereo': {'@type': 'xsd:boolean'},\n", - " 'is_transport': {'@type': 'xsd:boolean'},\n", - " 'iw_bt_date_well': {'@type': '@id'},\n", - " 'iwf': {'@type': '@id'},\n", - " 'jgi': 'http://example.org/jgi/',\n", - " 'jgi.proposal': 'https://bioregistry.io/jgi.proposal:',\n", - " 'jgi_portal_study_identifiers': {'@type': '@id'},\n", - " 'kegg': 'https://bioregistry.io/kegg:',\n", - " 'language': {'@type': 'xsd:language'},\n", - " 'last_clean': {'@type': '@id'},\n", - " 'lat_lon': {'@type': '@id'},\n", - " 'latitude': {'@type': 'xsd:decimal'},\n", - " 'lbc_thirty': {'@type': '@id'},\n", - " 'lbceq': {'@type': '@id'},\n", - " 'left_participants': {'@type': '@id'},\n", - " 'library_preparation_set': {'@type': '@id'},\n", - " 'library_type': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 
'light_intensity': {'@type': '@id'},\n", - " 'light_regm': {'@type': '@id'},\n", - " 'light_type': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'link_addit_analys': {'@type': '@id'},\n", - " 'link_class_info': {'@type': '@id'},\n", - " 'link_climate_info': {'@type': '@id'},\n", - " 'linkml': 'https://w3id.org/linkml/',\n", - " 'lithology': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'local_class': {'@type': '@id'},\n", - " 'local_class_meth': {'@type': '@id'},\n", - " 'longitude': {'@type': 'xsd:decimal'},\n", - " 'low_depth_contig_num': {'@type': 'xsd:integer'},\n", - " 'magnesium': {'@type': '@id'},\n", - " 'mags_activity_set': {'@type': '@id'},\n", - " 'mags_list': {'@type': '@id'},\n", - " 'manganese': {'@type': '@id'},\n", - " 'mass': {'@type': '@id'},\n", - " 'max_occup': {'@type': '@id'},\n", - " 'mean_frict_vel': {'@type': '@id'},\n", - " 'mean_peak_frict_vel': {'@type': '@id'},\n", - " 'mech_struc': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'mechanical_damage': {'@type': '@id'},\n", - " 'metabolite_quantified': {'@type': '@id'},\n", - " 'metabolomics_analysis_activity_set': {'@type': '@id'},\n", - " 'metagenome_annotation_activity_set': {'@type': '@id'},\n", - " 'metagenome_annotation_id': {'@type': '@id'},\n", - " 'metagenome_assembly_set': {'@type': '@id'},\n", - " 'metagenome_sequencing_activity_set': {'@type': '@id'},\n", - " 'metaproteomics_analysis_activity_set': {'@type': '@id'},\n", - " 'metatranscriptome_activity_set': {'@type': '@id'},\n", - " 'methane': {'@type': '@id'},\n", - " 'mgnify.proj': 'https://bioregistry.io/mgnify.proj:',\n", - " 'mgnify_analysis_identifiers': {'@type': '@id'},\n", - " 'mgnify_project_identifiers': {'@type': '@id'},\n", - " 
'micro_biomass_meth': {},\n", - " 'microbial_biomass': {'@type': '@id'},\n", - " 'min_q_value': {'@type': 'xsd:float'},\n", - " 'mineral_nutr_regm': {'@type': '@id'},\n", - " 'misc_param': {'@type': '@id'},\n", - " 'model': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'my_emsl': 'https://release.my.emsl.pnnl.gov/released_data/',\n", - " 'n_alkanes': {'@type': '@id'},\n", - " 'neon.identifier': 'http://example.org/neon/identifier/',\n", - " 'neon.schema': 'http://example.org/neon/schema/',\n", - " 'neon_biosample_identifiers': {'@type': '@id'},\n", - " 'neon_study_identifiers': {'@type': '@id'},\n", - " 'nitrate': {'@type': '@id'},\n", - " 'nitrate_nitrogen': {'@type': '@id'},\n", - " 'nitrite': {'@type': '@id'},\n", - " 'nitrite_nitrogen': {'@type': '@id'},\n", - " 'nitro': {'@type': '@id'},\n", - " 'nmdc': 'https://w3id.org/nmdc/',\n", - " 'nom_analysis_activity_set': {'@type': '@id'},\n", - " 'non_min_nutr_regm': {},\n", - " 'nucl_acid_amp': {'@type': '@id'},\n", - " 'nucl_acid_ext': {'@type': '@id'},\n", - " 'num_16s': {'@type': 'xsd:integer'},\n", - " 'num_23s': {'@type': 'xsd:integer'},\n", - " 'num_5s': {'@type': 'xsd:integer'},\n", - " 'num_aligned_reads': {'@type': 'xsd:float'},\n", - " 'num_input_reads': {'@type': 'xsd:float'},\n", - " 'num_t_rna': {'@type': 'xsd:integer'},\n", - " 'number_of_contig': {'@type': 'xsd:integer'},\n", - " 'number_pets': {'@type': '@id'},\n", - " 'number_plants': {'@type': '@id'},\n", - " 'number_resident': {'@type': '@id'},\n", - " 'occup_density_samp': {'@type': '@id'},\n", - " 'occup_document': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'occup_samp': {'@type': '@id'},\n", - " 'omics_processing_identifiers': {'@type': '@id'},\n", - " 'omics_processing_set': {'@type': '@id'},\n", - " 'omics_type': {'@type': '@id'},\n", - " 
'ordered_mobile_phases': {'@type': '@id'},\n", - " 'org_carb': {'@type': '@id'},\n", - " 'org_count_qpcr_info': {},\n", - " 'org_matter': {'@type': '@id'},\n", - " 'org_nitro': {'@type': '@id'},\n", - " 'org_particles': {'@type': '@id'},\n", - " 'organism_count': {'@type': '@id'},\n", - " 'output_base_count': {'@type': 'xsd:float'},\n", - " 'output_read_bases': {'@type': 'xsd:float'},\n", - " 'output_read_count': {'@type': 'xsd:float'},\n", - " 'owc_tvdss': {'@type': '@id'},\n", - " 'oxy_stat_samp': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'oxygen': {'@type': '@id'},\n", - " 'part_of': {'@type': '@id'},\n", - " 'part_org_carb': {'@type': '@id'},\n", - " 'part_org_nitro': {'@type': '@id'},\n", - " 'particle_class': {'@type': '@id'},\n", - " 'pcr_cond': {'@type': '@id'},\n", - " 'pcr_cycles': {'@type': 'xsd:integer'},\n", - " 'pcr_primers': {'@type': '@id'},\n", - " 'peptide_sequence_count': {'@type': 'xsd:integer'},\n", - " 'peptide_spectral_count': {'@type': 'xsd:integer'},\n", - " 'peptide_sum_masic_abundance': {'@type': 'xsd:integer'},\n", - " 'permeability': {'@type': '@id'},\n", - " 'perturbation': {'@type': '@id'},\n", - " 'pesticide_regm': {'@type': '@id'},\n", - " 'petroleum_hydrocarb': {'@type': '@id'},\n", - " 'ph': {'@type': 'xsd:double'},\n", - " 'ph_meth': {'@type': '@id'},\n", - " 'ph_regm': {'@type': '@id'},\n", - " 'phaeopigments': {'@type': '@id'},\n", - " 'phase': {'@type': 'xsd:integer'},\n", - " 'phosphate': {'@type': '@id'},\n", - " 'phosplipid_fatt_acid': {'@type': '@id'},\n", - " 'photon_flux': {'@type': '@id'},\n", - " 'planned_process_set': {'@type': '@id'},\n", - " 'plant_growth_med': {'@type': '@id'},\n", - " 'plant_product': {'@type': '@id'},\n", - " 'plant_sex': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'plant_struc': {'@type': 
'@id'},\n", - " 'pollutants': {'@type': '@id'},\n", - " 'pool_dna_extracts': {'@type': '@id'},\n", - " 'pooling_set': {'@type': '@id'},\n", - " 'porosity': {'@type': '@id'},\n", - " 'potassium': {'@type': '@id'},\n", - " 'pour_point': {'@type': '@id'},\n", - " 'pre_treatment': {'@type': '@id'},\n", - " 'pres_animal_insect': {},\n", - " 'pressure': {'@type': '@id'},\n", - " 'prev_land_use_meth': {},\n", - " 'previous_land_use': {'@type': '@id'},\n", - " 'primary_prod': {'@type': '@id'},\n", - " 'primary_treatment': {'@type': '@id'},\n", - " 'principal_investigator': {'@type': '@id'},\n", - " 'processed_sample_set': {'@type': '@id'},\n", - " 'processing_institution': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'prod_rate': {'@type': '@id'},\n", - " 'prod_start_date': {'@type': '@id'},\n", - " 'profile_position': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'protein_spectral_count': {'@type': 'xsd:integer'},\n", - " 'protein_sum_masic_abundance': {'@type': 'xsd:integer'},\n", - " 'protocol_link': {'@type': '@id'},\n", - " 'prov': 'http://www.w3.org/ns/prov#',\n", - " 'qc_failure_what': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'qc_failure_where': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'qc_status': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'quad_pos': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'qud': 'http://qudt.org/1.1/schema/qudt#',\n", - " 'radiation_regm': {'@type': 
'@id'},\n", - " 'rainfall_regm': {'@type': '@id'},\n", - " 'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',\n", - " 'rdfs': 'http://www.w3.org/2000/01/rdf-schema#',\n", - " 'reactor_type': {'@type': '@id'},\n", - " 'read_based_taxonomy_analysis_activity_set': {'@type': '@id'},\n", - " 'read_qc_analysis_activity_set': {'@type': '@id'},\n", - " 'redox_potential': {'@type': '@id'},\n", - " 'rel_air_humidity': {'@type': '@id'},\n", - " 'rel_humidity_out': {'@type': '@id'},\n", - " 'rel_samp_loc': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'reservoir': {'@type': '@id'},\n", - " 'resins_pc': {'@type': '@id'},\n", - " 'right_participants': {'@type': '@id'},\n", - " 'rna_absorb1': {'@type': 'xsd:float'},\n", - " 'rna_absorb2': {'@type': 'xsd:float'},\n", - " 'rna_concentration': {'@type': 'xsd:float'},\n", - " 'rna_cont_type': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'rna_sample_format': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'rna_volume': {'@type': 'xsd:float'},\n", - " 'room_air_exch_rate': {'@type': '@id'},\n", - " 'room_architec_elem': {},\n", - " 'room_condt': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'room_connected': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'room_count': {'@type': '@id'},\n", - " 'room_dim': {'@type': '@id'},\n", - " 'room_door_dist': {'@type': '@id'},\n", - " 'room_door_share': {'@type': '@id'},\n", - " 'room_hallway': {'@type': '@id'},\n", - " 'room_loc': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 
'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'room_moist_dam_hist': {'@type': 'xsd:integer'},\n", - " 'room_net_area': {'@type': '@id'},\n", - " 'room_occup': {'@type': '@id'},\n", - " 'room_samp_pos': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'room_type': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'room_vol': {'@type': '@id'},\n", - " 'room_wall_share': {'@type': '@id'},\n", - " 'room_window_count': {'@type': 'xsd:integer'},\n", - " 'root_cond': {'@type': '@id'},\n", - " 'root_med_carbon': {'@type': '@id'},\n", - " 'root_med_macronutr': {'@type': '@id'},\n", - " 'root_med_micronutr': {'@type': '@id'},\n", - " 'root_med_ph': {'@type': '@id'},\n", - " 'root_med_regl': {'@type': '@id'},\n", - " 'root_med_solid': {'@type': '@id'},\n", - " 'root_med_suppl': {'@type': '@id'},\n", - " 'salinity': {'@type': '@id'},\n", - " 'salinity_meth': {'@type': '@id'},\n", - " 'salt_regm': {'@type': '@id'},\n", - " 'samp_capt_status': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'samp_collec_device': {},\n", - " 'samp_collec_method': {},\n", - " 'samp_collect_point': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'samp_dis_stage': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'samp_floor': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'samp_loc_corr_rate': {'@type': '@id'},\n", - " 'samp_mat_process': {'@type': '@id'},\n", - " 'samp_md': {'@type': '@id'},\n", - " 'samp_name': {},\n", - " 
'samp_preserv': {'@type': '@id'},\n", - " 'samp_room_id': {'@type': '@id'},\n", - " 'samp_size': {'@type': '@id'},\n", - " 'samp_sort_meth': {'@type': '@id'},\n", - " 'samp_store_dur': {'@type': '@id'},\n", - " 'samp_store_loc': {'@type': '@id'},\n", - " 'samp_store_temp': {'@type': '@id'},\n", - " 'samp_subtype': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'samp_taxon_id': {'@type': '@id'},\n", - " 'samp_time_out': {'@type': '@id'},\n", - " 'samp_transport_cond': {'@type': '@id'},\n", - " 'samp_tvdss': {'@type': '@id'},\n", - " 'samp_type': {'@type': '@id'},\n", - " 'samp_vol_we_dna_ext': {'@type': '@id'},\n", - " 'samp_weather': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'samp_well_name': {'@type': '@id'},\n", - " 'sample_collection_day': {'@type': 'xsd:integer'},\n", - " 'sample_collection_hour': {'@type': 'xsd:integer'},\n", - " 'sample_collection_minute': {'@type': 'xsd:integer'},\n", - " 'sample_collection_year': {'@type': 'xsd:integer'},\n", - " 'sample_type': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'saturates_pc': {'@type': '@id'},\n", - " 'scaf_bp': {'@type': 'xsd:float'},\n", - " 'scaf_l50': {'@type': 'xsd:float'},\n", - " 'scaf_l90': {'@type': 'xsd:float'},\n", - " 'scaf_l_gt50k': {'@type': 'xsd:float'},\n", - " 'scaf_logsum': {'@type': 'xsd:float'},\n", - " 'scaf_max': {'@type': 'xsd:float'},\n", - " 'scaf_n50': {'@type': 'xsd:float'},\n", - " 'scaf_n90': {'@type': 'xsd:float'},\n", - " 'scaf_n_gt50k': {'@type': 'xsd:float'},\n", - " 'scaf_pct_gt50k': {'@type': 'xsd:float'},\n", - " 'scaf_powsum': {'@type': 'xsd:float'},\n", - " 'scaffolds': {'@type': 'xsd:float'},\n", - " 'schema': 'http://schema.org/',\n", - " 'season': {'@type': '@id'},\n", - " 
'season_environment': {'@type': '@id'},\n", - " 'season_precpt': {'@type': '@id'},\n", - " 'season_temp': {'@type': '@id'},\n", - " 'season_use': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'secondary_treatment': {'@type': '@id'},\n", - " 'sediment_type': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'separation_method': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'seq_meth': {'@type': '@id'},\n", - " 'seq_quality_check': {'@type': '@id'},\n", - " 'sewage_type': {'@type': '@id'},\n", - " 'shad_dev_water_mold': {},\n", - " 'shading_device_cond': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'shading_device_loc': {'@type': '@id'},\n", - " 'shading_device_mat': {'@type': '@id'},\n", - " 'shading_device_type': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'shex': 'http://www.w3.org/ns/shex#',\n", - " 'sieving': {'@type': '@id'},\n", - " 'silicate': {'@type': '@id'},\n", - " 'size_frac': {'@type': '@id'},\n", - " 'size_frac_low': {'@type': '@id'},\n", - " 'size_frac_up': {'@type': '@id'},\n", - " 'skos': 'http://www.w3.org/2004/02/skos/core#',\n", - " 'slope_aspect': {'@type': '@id'},\n", - " 'slope_gradient': {'@type': '@id'},\n", - " 'sludge_retent_time': {'@type': '@id'},\n", - " 'sodium': {'@type': '@id'},\n", - " 'soil_horizon': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'soil_text_measure': {'@type': '@id'},\n", - " 'soil_texture_meth': {},\n", - " 'soil_type': {'@type': '@id'},\n", - " 
'soil_type_meth': {'@type': '@id'},\n", - " 'solar_irradiance': {'@type': '@id'},\n", - " 'soluble_inorg_mat': {'@type': '@id'},\n", - " 'soluble_org_mat': {'@type': '@id'},\n", - " 'soluble_react_phosp': {'@type': '@id'},\n", - " 'source_mat_id': {'@type': '@id'},\n", - " 'space_typ_state': {'@type': '@id'},\n", - " 'specific': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'specific_humidity': {'@type': '@id'},\n", - " 'sr_dep_env': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'sr_geol_age': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'sr_kerog_type': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'sr_lithology': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'standing_water_regm': {'@type': '@id'},\n", - " 'start': {'@type': 'xsd:integer'},\n", - " 'stationary_phase': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'stoichiometry': {'@type': 'xsd:integer'},\n", - " 'store_cond': {'@type': '@id'},\n", - " 'study_category': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'study_identifiers': {'@type': '@id'},\n", - " 'study_image': {'@type': '@id'},\n", - " 'study_set': {'@type': '@id'},\n", - " 'subject': {'@type': '@id'},\n", - " 'substructure_type': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 
'subsurface_depth': {'@type': '@id'},\n", - " 'sulfate': {'@type': '@id'},\n", - " 'sulfate_fw': {'@type': '@id'},\n", - " 'sulfide': {'@type': '@id'},\n", - " 'surf_air_cont': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'surf_humidity': {'@type': '@id'},\n", - " 'surf_material': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'surf_moisture': {'@type': '@id'},\n", - " 'surf_moisture_ph': {'@type': 'xsd:double'},\n", - " 'surf_temp': {'@type': '@id'},\n", - " 'suspend_part_matter': {'@type': '@id'},\n", - " 'suspend_solids': {'@type': '@id'},\n", - " 'tan': {'@type': '@id'},\n", - " 'target_gene': {'@type': '@id'},\n", - " 'target_subfragment': {'@type': '@id'},\n", - " 'temp': {'@type': '@id'},\n", - " 'temp_out': {'@type': '@id'},\n", - " 'temperature': {'@type': '@id'},\n", - " 'term': {'@type': '@id'},\n", - " 'tertiary_treatment': {'@type': '@id'},\n", - " 'tidal_stage': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'tillage': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'tiss_cult_growth_med': {'@type': '@id'},\n", - " 'toluene': {'@type': '@id'},\n", - " 'too_short_contig_num': {'@type': 'xsd:integer'},\n", - " 'tot_carb': {'@type': '@id'},\n", - " 'tot_depth_water_col': {'@type': '@id'},\n", - " 'tot_diss_nitro': {'@type': '@id'},\n", - " 'tot_inorg_nitro': {'@type': '@id'},\n", - " 'tot_iron': {'@type': '@id'},\n", - " 'tot_nitro': {'@type': '@id'},\n", - " 'tot_nitro_cont_meth': {},\n", - " 'tot_nitro_content': {'@type': '@id'},\n", - " 'tot_org_c_meth': {'@type': '@id'},\n", - " 'tot_org_carb': {'@type': '@id'},\n", - " 'tot_part_carb': {'@type': '@id'},\n", - " 
'tot_phosp': {'@type': '@id'},\n", - " 'tot_phosphate': {'@type': '@id'},\n", - " 'tot_sulfur': {'@type': '@id'},\n", - " 'total_bases': {'@type': 'xsd:integer'},\n", - " 'train_line': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'train_stat_loc': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'train_stop_loc': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'turbidity': {'@type': '@id'},\n", - " 'tvdss_of_hcr_press': {'@type': '@id'},\n", - " 'tvdss_of_hcr_temp': {'@type': '@id'},\n", - " 'typ_occup_density': {'@type': 'xsd:double'},\n", - " 'unbinned_contig_num': {'@type': 'xsd:integer'},\n", - " 'value': {'@type': '@id'},\n", - " 'vendor': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'ventilation_rate': {'@type': '@id'},\n", - " 'ventilation_type': {'@type': '@id'},\n", - " 'vfa': {'@type': '@id'},\n", - " 'vfa_fw': {'@type': '@id'},\n", - " 'vis_media': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'viscosity': {'@type': '@id'},\n", - " 'volatile_org_comp': {'@type': '@id'},\n", - " 'volume': {'@type': '@id'},\n", - " 'wall_area': {'@type': '@id'},\n", - " 'wall_const_type': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'wall_finish_mat': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'wall_height': {'@type': '@id'},\n", - " 'wall_loc': {'@context': {'@vocab': '@null',\n", - " 'description': 
'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'wall_surf_treatment': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'wall_texture': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'wall_thermal_mass': {'@type': '@id'},\n", - " 'wall_water_mold': {'@type': '@id'},\n", - " 'was_generated_by': {'@type': '@id'},\n", - " 'was_informed_by': {'@type': '@id'},\n", - " 'wastewater_type': {'@type': '@id'},\n", - " 'water_cont_soil_meth': {},\n", - " 'water_content': {'@type': '@id'},\n", - " 'water_current': {'@type': '@id'},\n", - " 'water_cut': {'@type': '@id'},\n", - " 'water_feat_size': {'@type': '@id'},\n", - " 'water_feat_type': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'water_prod_rate': {'@type': '@id'},\n", - " 'water_temp_regm': {'@type': '@id'},\n", - " 'watering_regm': {'@type': '@id'},\n", - " 'weekday': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'wgs84': 'http://www.w3.org/2003/01/geo/wgs84_pos#',\n", - " 'wikidata': 'http://www.wikidata.org/entity/',\n", - " 'win': {'@type': '@id'},\n", - " 'wind_direction': {'@type': '@id'},\n", - " 'wind_speed': {'@type': '@id'},\n", - " 'window_cond': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'window_cover': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'window_horiz_pos': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 
'skos:notation'}},\n", - " 'window_loc': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'window_mat': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'window_open_freq': {'@type': '@id'},\n", - " 'window_size': {'@type': '@id'},\n", - " 'window_status': {'@type': '@id'},\n", - " 'window_type': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'window_vert_pos': {'@context': {'@vocab': '@null',\n", - " 'description': 'skos:prefLabel',\n", - " 'meaning': '@id',\n", - " 'text': 'skos:notation'}},\n", - " 'window_water_mold': {'@type': '@id'},\n", - " 'xsd': 'http://www.w3.org/2001/XMLSchema#',\n", - " 'xylene': {'@type': '@id'},\n", - " 'zinc': {'@type': '@id'}}\n" - ] - } - ], + "outputs": [], "source": [ "import json\n", "from pprint import pprint\n", @@ -1335,8 +124,7 @@ "\n", "for k, v in list(context.items()):\n", " if isinstance(v, dict): #and v.get(\"@type\") == \"@id\":\n", - " v.pop(\"@id\", None) # use nmdc uri, not e.g. MIXS uri\n", - "pprint(context)" + " v.pop(\"@id\", None) # use nmdc uri, not e.g. 
MIXS uri" ] }, { @@ -1439,12 +227,12 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "2bc6ca60aa784afa8a3e8c1947be3901", + "model_id": "79f269c4f8d04911988befe5d85b0862", "version_major": 2, "version_minor": 0 }, "text/plain": [ - " 0%| | 0/112 [00:00 Activity : was_informed_by\n", - "Biosample --> FieldResearchSite : collected_from\n", - "NamedThing --> NamedThing : part_of\n", - "FunctionalAnnotationAggMember --> WorkflowExecutionActivity : metagenome_annotation_id\n", - "NamedThing --> Activity : was_generated_by\n", - "NamedThing --> NamedThing : has_input\n", - "NamedThing --> NamedThing : has_output\n", - "\n", - "PlannedProcess <|-- BiosampleProcessing\n", - "NamedThing <|-- Extraction\n", - "Activity <|-- NomAnalysisActivity\n", - "Activity <|-- MagsAnalysisActivity\n", - "NamedThing <|-- ProcessedSample\n", - "NamedThing <|-- Site\n", - "PlannedProcess <|-- Extraction\n", - "Activity <|-- MetagenomeAnnotationActivity\n", - "Activity <|-- MetatranscriptomeActivity\n", - "Activity <|-- MetabolomicsAnalysisActivity\n", - "WorkflowExecutionActivity <|-- MetabolomicsAnalysisActivity\n", - "NamedThing <|-- LibraryPreparation\n", - "Activity <|-- MetagenomeAssembly\n", - "WorkflowExecutionActivity <|-- NomAnalysisActivity\n", - "NamedThing <|-- Pooling\n", - "WorkflowExecutionActivity <|-- ReadQcAnalysisActivity\n", - "NamedThing <|-- Biosample\n", - "NamedThing <|-- FieldResearchSite\n", - "MaterialEntity <|-- FieldResearchSite\n", - "NamedThing <|-- PlannedProcess\n", - "NamedThing <|-- BiosampleProcessing\n", - "Site <|-- FieldResearchSite\n", - "Activity <|-- ReadQcAnalysisActivity\n", - "NamedThing <|-- Study\n", - "PlannedProcess <|-- OmicsProcessing\n", - "Activity <|-- ReadBasedTaxonomyAnalysisActivity\n", - "WorkflowExecutionActivity <|-- MetagenomeAnnotationActivity\n", - "WorkflowExecutionActivity <|-- MagsAnalysisActivity\n", - "NamedThing <|-- MaterialEntity\n", - "WorkflowExecutionActivity <|-- 
MetaproteomicsAnalysisActivity\n", - "BiosampleProcessing <|-- LibraryPreparation\n", - "NamedThing <|-- DataObject\n", - "MaterialEntity <|-- ProcessedSample\n", - "MaterialEntity <|-- Site\n", - "WorkflowExecutionActivity <|-- MetagenomeAssembly\n", - "WorkflowExecutionActivity <|-- ReadBasedTaxonomyAnalysisActivity\n", - "NamedThing <|-- CollectingBiosamplesFromSite\n", - "BiosampleProcessing <|-- Pooling\n", - "PlannedProcess <|-- CollectingBiosamplesFromSite\n", - "Activity <|-- MetaproteomicsAnalysisActivity\n", - "NamedThing <|-- OmicsProcessing\n", - "Activity <|-- MetagenomeSequencingActivity\n", - "MaterialEntity <|-- Biosample\n", - "WorkflowExecutionActivity <|-- MetatranscriptomeActivity\n", - "PlannedProcess <|-- LibraryPreparation\n", - "WorkflowExecutionActivity <|-- MetagenomeSequencingActivity\n", - "PlannedProcess <|-- Pooling\n", - "Activity <|-- WorkflowExecutionActivity\n" - ] - } - ], + "outputs": [], "source": [ - "print(\"classDiagram\\n\")\n", - "for slot_name in toplevel_entity_connectors:\n", - " slot = slots[slot_name]\n", - " domain = slot.domain or \"NamedThing\"\n", - " range = slot.range\n", - " print(f\"{domain} --> {range} : {slot_name}\")\n", + "# print(\"classDiagram\\n\")\n", + "# for slot_name in toplevel_entity_connectors:\n", + "# slot = slots[slot_name]\n", + "# domain = slot.domain or \"NamedThing\"\n", + "# range = slot.range\n", + "# print(f\"{domain} --> {range} : {slot_name}\")\n", "\n", - "print()\n", + "# print()\n", "\n", - "inheritance_links = set()\n", - "for cls in toplevel_classes:\n", - " ancestors = schema_view.class_ancestors(cls)\n", - " for a in ancestors:\n", - " if a != cls:\n", - " inheritance_links.add(f\"{a} <|-- {cls}\")\n", + "# inheritance_links = set()\n", + "# for cls in toplevel_classes:\n", + "# ancestors = schema_view.class_ancestors(cls)\n", + "# for a in ancestors:\n", + "# if a != cls:\n", + "# inheritance_links.add(f\"{a} <|-- {cls}\")\n", "\n", - "for link in inheritance_links:\n", - " 
print(link)" + "# for link in inheritance_links:\n", + "# print(link)" ] }, { @@ -1744,19 +441,19 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 20, "id": "cc830d77-5ac2-482e-a4f9-dc2eed3f2ef9", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "2e6e481cc9a142598b447b765c7a4773", + "model_id": "96a43219b33246d6a74f7add4f057206", "version_major": 2, "version_minor": 0 }, "text/plain": [ - " 0%| | 0/6784535 [00:00POS: 100,000 slots (Batch: 1,408,450 slots/s / Avg: 1,408,450 slots/s)\n", - "18:37:16 INFO loader :: Index SPO->POS: 200,000 slots (Batch: 1,298,701 slots/s / Avg: 1,351,351 slots/s)\n", - "18:37:16 INFO loader :: Index SPO->POS: 300,000 slots (Batch: 934,579 slots/s / Avg: 1,176,470 slots/s)\n", - "18:37:16 INFO loader :: Index SPO->POS: 400,000 slots (Batch: 1,219,512 slots/s / Avg: 1,186,943 slots/s)\n", - "18:37:16 INFO loader :: Index SPO->POS: 500,000 slots (Batch: 1,190,476 slots/s / Avg: 1,187,648 slots/s)\n", - "18:37:16 INFO loader :: Index SPO->POS: 600,000 slots (Batch: 909,090 slots/s / Avg: 1,129,943 slots/s)\n", - "18:37:16 INFO loader :: Index SPO->POS: 700,000 slots (Batch: 1,136,363 slots/s / Avg: 1,130,856 slots/s)\n", - "18:37:16 INFO loader :: Index SPO->POS: 800,000 slots (Batch: 1,162,790 slots/s / Avg: 1,134,751 slots/s)\n", - "18:37:16 INFO loader :: Index SPO->POS: 900,000 slots (Batch: 925,925 slots/s / Avg: 1,107,011 slots/s)\n", - "18:37:16 INFO loader :: Index SPO->POS: 1,000,000 slots (Batch: 1,123,595 slots/s / Avg: 1,108,647 slots/s)\n", - "18:37:16 INFO loader :: Elapsed: 47.58 seconds [2024/03/11 18:37:16 UTC]\n", - "18:37:16 INFO loader :: Index SPO->POS: 1,100,000 slots (Batch: 1,162,790 slots/s / Avg: 1,113,360 slots/s)\n", - "18:37:17 INFO loader :: Index SPO->POS: 1,200,000 slots (Batch: 1,123,595 slots/s / Avg: 1,114,206 slots/s)\n", - "18:37:17 INFO loader :: Index SPO->POS: 1,300,000 slots (Batch: 862,068 slots/s / Avg: 1,089,689 
slots/s)\n", - "18:37:17 INFO loader :: Index SPO->POS: 1,400,000 slots (Batch: 1,111,111 slots/s / Avg: 1,091,192 slots/s)\n", - "18:37:17 INFO loader :: Index SPO->POS: 1,500,000 slots (Batch: 1,149,425 slots/s / Avg: 1,094,890 slots/s)\n", - "18:37:17 INFO loader :: Index SPO->POS: 1,600,000 slots (Batch: 781,250 slots/s / Avg: 1,068,090 slots/s)\n", - "18:37:17 INFO loader :: Index SPO->POS: 1,700,000 slots (Batch: 884,955 slots/s / Avg: 1,055,245 slots/s)\n", - "18:37:17 INFO loader :: Index SPO->POS: 1,800,000 slots (Batch: 909,090 slots/s / Avg: 1,045,903 slots/s)\n", - "18:37:17 INFO loader :: Index SPO->POS: 1,900,000 slots (Batch: 892,857 slots/s / Avg: 1,036,552 slots/s)\n", - "18:37:17 INFO loader :: Index SPO->POS: 2,000,000 slots (Batch: 746,268 slots/s / Avg: 1,016,776 slots/s)\n", - "18:37:17 INFO loader :: Elapsed: 48.65 seconds [2024/03/11 18:37:17 UTC]\n", - "18:37:18 INFO loader :: Index SPO->POS: 2,100,000 slots (Batch: 869,565 slots/s / Avg: 1,008,645 slots/s)\n", - "18:37:18 INFO loader :: Index SPO->POS: 2,200,000 slots (Batch: 847,457 slots/s / Avg: 1,000,000 slots/s)\n", - "18:37:18 INFO loader :: Index SPO->POS: 2,300,000 slots (Batch: 884,955 slots/s / Avg: 994,379 slots/s)\n", - "18:37:18 INFO loader :: Index SPO->POS: 2,400,000 slots (Batch: 709,219 slots/s / Avg: 977,995 slots/s)\n", - "18:37:18 INFO loader :: Index SPO->POS: 2,500,000 slots (Batch: 862,068 slots/s / Avg: 972,762 slots/s)\n", - "18:37:18 INFO loader :: Index SPO->POS: 2,600,000 slots (Batch: 884,955 slots/s / Avg: 969,064 slots/s)\n", - "18:37:18 INFO loader :: Index SPO->POS: 2,700,000 slots (Batch: 869,565 slots/s / Avg: 964,974 slots/s)\n", - "18:37:18 INFO loader :: Index SPO->POS: 2,800,000 slots (Batch: 724,637 slots/s / Avg: 953,678 slots/s)\n", - "18:37:18 INFO loader :: Index SPO->POS: 2,900,000 slots (Batch: 877,192 slots/s / Avg: 950,819 slots/s)\n", - "18:37:19 INFO loader :: Index SPO->POS: 3,000,000 slots (Batch: 869,565 slots/s / Avg: 947,867 
slots/s)\n", - "18:37:19 INFO loader :: Elapsed: 49.85 seconds [2024/03/11 18:37:19 UTC]\n", - "18:37:19 INFO loader :: Index SPO->POS: 3,100,000 slots (Batch: 724,637 slots/s / Avg: 938,540 slots/s)\n", - "18:37:19 INFO loader :: Index SPO->POS: 3,200,000 slots (Batch: 854,700 slots/s / Avg: 935,672 slots/s)\n", - "18:37:19 INFO loader :: Index SPO->POS: 3,300,000 slots (Batch: 869,565 slots/s / Avg: 933,521 slots/s)\n", - "18:37:19 INFO loader :: Index SPO->POS: 3,400,000 slots (Batch: 847,457 slots/s / Avg: 930,741 slots/s)\n", - "18:37:19 INFO loader :: Index SPO->POS: 3,500,000 slots (Batch: 729,927 slots/s / Avg: 923,482 slots/s)\n", - "18:37:19 INFO loader :: Index SPO->POS: 3,600,000 slots (Batch: 877,192 slots/s / Avg: 922,131 slots/s)\n", - "18:37:19 INFO loader :: Index SPO->POS: 3,700,000 slots (Batch: 869,565 slots/s / Avg: 920,627 slots/s)\n", - "18:37:20 INFO loader :: Index SPO->POS: 3,800,000 slots (Batch: 847,457 slots/s / Avg: 918,540 slots/s)\n", - "18:37:20 INFO loader :: Index SPO->POS: 3,900,000 slots (Batch: 740,740 slots/s / Avg: 912,921 slots/s)\n", - "18:37:20 INFO loader :: Index SPO->POS: 4,000,000 slots (Batch: 833,333 slots/s / Avg: 910,746 slots/s)\n", - "18:37:20 INFO loader :: Elapsed: 51.07 seconds [2024/03/11 18:37:20 UTC]\n", - "18:37:20 INFO loader :: Index SPO->POS: 4,100,000 slots (Batch: 826,446 slots/s / Avg: 908,486 slots/s)\n", - "18:37:20 INFO loader :: Index SPO->POS: 4,200,000 slots (Batch: 826,446 slots/s / Avg: 906,344 slots/s)\n", - "18:37:20 INFO loader :: Index SPO->POS: 4,300,000 slots (Batch: 740,740 slots/s / Avg: 901,656 slots/s)\n", - "18:37:20 INFO loader :: Index SPO->POS: 4,400,000 slots (Batch: 877,192 slots/s / Avg: 901,085 slots/s)\n", - "18:37:20 INFO loader :: Index SPO->POS: 4,500,000 slots (Batch: 806,451 slots/s / Avg: 898,741 slots/s)\n", - "18:37:21 INFO loader :: Index SPO->POS: 4,600,000 slots (Batch: 800,000 slots/s / Avg: 896,336 slots/s)\n", - "18:37:21 INFO loader :: Index SPO->POS: 
4,700,000 slots (Batch: 781,250 slots/s / Avg: 893,536 slots/s)\n", - "18:37:21 INFO loader :: Index SPO->POS: 4,800,000 slots (Batch: 917,431 slots/s / Avg: 894,021 slots/s)\n", - "18:37:21 INFO loader :: Index SPO->POS: 4,900,000 slots (Batch: 917,431 slots/s / Avg: 894,487 slots/s)\n", - "18:37:21 INFO loader :: Index SPO->POS: 5,000,000 slots (Batch: 869,565 slots/s / Avg: 893,974 slots/s)\n", - "18:37:21 INFO loader :: Elapsed: 52.27 seconds [2024/03/11 18:37:21 UTC]\n", - "18:37:21 INFO loader :: Index SPO->POS: 5,100,000 slots (Batch: 787,401 slots/s / Avg: 891,608 slots/s)\n", - "18:37:21 INFO loader :: Index SPO->POS: 5,200,000 slots (Batch: 917,431 slots/s / Avg: 892,091 slots/s)\n", - "18:37:21 INFO loader :: Index SPO->POS: 5,300,000 slots (Batch: 884,955 slots/s / Avg: 891,955 slots/s)\n", - "18:37:21 INFO loader :: Index SPO->POS: 5,400,000 slots (Batch: 847,457 slots/s / Avg: 891,089 slots/s)\n", - "18:37:22 INFO loader :: Index SPO->POS: 5,500,000 slots (Batch: 793,650 slots/s / Avg: 889,104 slots/s)\n", - "18:37:22 INFO loader :: Index SPO->POS: 5,600,000 slots (Batch: 884,955 slots/s / Avg: 889,030 slots/s)\n", - "18:37:22 INFO loader :: Index SPO->POS: 5,700,000 slots (Batch: 900,900 slots/s / Avg: 889,235 slots/s)\n", - "18:37:22 INFO loader :: Index SPO->POS: 5,800,000 slots (Batch: 769,230 slots/s / Avg: 886,850 slots/s)\n", - "18:37:22 INFO loader :: Index SPO->POS: 5,900,000 slots (Batch: 724,637 slots/s / Avg: 883,498 slots/s)\n", - "18:37:22 INFO loader :: Index SPO->POS: 6,000,000 slots (Batch: 746,268 slots/s / Avg: 880,798 slots/s)\n", - "18:37:22 INFO loader :: Elapsed: 53.49 seconds [2024/03/11 18:37:22 UTC]\n", - "18:37:22 INFO loader :: Index SPO->POS: 6,100,000 slots (Batch: 757,575 slots/s / Avg: 878,456 slots/s)\n", - "18:37:23 INFO loader :: Index SPO->POS: 6,200,000 slots (Batch: 757,575 slots/s / Avg: 876,201 slots/s)\n", - "18:37:23 INFO loader :: Index SPO->POS: 6,300,000 slots (Batch: 746,268 slots/s / Avg: 873,786 
slots/s)\n", - "18:37:23 INFO loader :: Index SPO->POS: 6,400,000 slots (Batch: 735,294 slots/s / Avg: 871,222 slots/s)\n", - "18:37:23 INFO loader :: Index SPO->POS: 6,500,000 slots (Batch: 735,294 slots/s / Avg: 868,751 slots/s)\n", - "18:37:23 INFO loader :: Index SPO->POS: 6,600,000 slots (Batch: 751,879 slots/s / Avg: 866,710 slots/s)\n", - "18:37:23 INFO loader :: Index SPO->POS: 6,700,000 slots (Batch: 769,230 slots/s / Avg: 865,074 slots/s)\n", - "18:37:23 INFO loader :: ** Index SPO->POS: 6,784,535 slots indexed in 7.85 seconds [Rate: 864,051.81 per second]\n", - "18:37:23 INFO loader :: Index SPO->OSP: 100,000 slots (Batch: 1,960,784 slots/s / Avg: 1,960,784 slots/s)\n", - "18:37:23 INFO loader :: Index SPO->OSP: 200,000 slots (Batch: 1,639,344 slots/s / Avg: 1,785,714 slots/s)\n", - "18:37:23 INFO loader :: Index SPO->OSP: 300,000 slots (Batch: 1,587,301 slots/s / Avg: 1,714,285 slots/s)\n", - "18:37:24 INFO loader :: Index SPO->OSP: 400,000 slots (Batch: 1,449,275 slots/s / Avg: 1,639,344 slots/s)\n", - "18:37:24 INFO loader :: Index SPO->OSP: 500,000 slots (Batch: 1,408,450 slots/s / Avg: 1,587,301 slots/s)\n", - "18:37:24 INFO loader :: Index SPO->OSP: 600,000 slots (Batch: 1,388,888 slots/s / Avg: 1,550,387 slots/s)\n", - "18:37:24 INFO loader :: Index SPO->OSP: 700,000 slots (Batch: 1,282,051 slots/s / Avg: 1,505,376 slots/s)\n", - "18:37:24 INFO loader :: Index SPO->OSP: 800,000 slots (Batch: 1,204,819 slots/s / Avg: 1,459,854 slots/s)\n", - "18:37:24 INFO loader :: Index SPO->OSP: 900,000 slots (Batch: 1,123,595 slots/s / Avg: 1,412,872 slots/s)\n", - "18:37:24 INFO loader :: Index SPO->OSP: 1,000,000 slots (Batch: 1,075,268 slots/s / Avg: 1,369,863 slots/s)\n", - "18:37:24 INFO loader :: Elapsed: 55.26 seconds [2024/03/11 18:37:24 UTC]\n", - "18:37:24 INFO loader :: Index SPO->OSP: 1,100,000 slots (Batch: 1,111,111 slots/s / Avg: 1,341,463 slots/s)\n", - "18:37:24 INFO loader :: Index SPO->OSP: 1,200,000 slots (Batch: 1,075,268 slots/s / Avg: 
1,314,348 slots/s)\n", - "18:37:24 INFO loader :: Index SPO->OSP: 1,300,000 slots (Batch: 1,123,595 slots/s / Avg: 1,297,405 slots/s)\n", - "18:37:24 INFO loader :: Index SPO->OSP: 1,400,000 slots (Batch: 1,020,408 slots/s / Avg: 1,272,727 slots/s)\n", - "18:37:24 INFO loader :: Index SPO->OSP: 1,500,000 slots (Batch: 1,052,631 slots/s / Avg: 1,255,230 slots/s)\n", - "18:37:25 INFO loader :: Index SPO->OSP: 1,600,000 slots (Batch: 1,020,408 slots/s / Avg: 1,237,432 slots/s)\n", - "18:37:25 INFO loader :: Index SPO->OSP: 1,700,000 slots (Batch: 990,099 slots/s / Avg: 1,219,512 slots/s)\n", - "18:37:25 INFO loader :: Index SPO->OSP: 1,800,000 slots (Batch: 1,063,829 slots/s / Avg: 1,209,677 slots/s)\n", - "18:37:25 INFO loader :: Index SPO->OSP: 1,900,000 slots (Batch: 1,041,666 slots/s / Avg: 1,199,494 slots/s)\n", - "18:37:25 INFO loader :: Index SPO->OSP: 2,000,000 slots (Batch: 1,075,268 slots/s / Avg: 1,192,605 slots/s)\n", - "18:37:25 INFO loader :: Elapsed: 56.21 seconds [2024/03/11 18:37:25 UTC]\n", - "18:37:25 INFO loader :: Index SPO->OSP: 2,100,000 slots (Batch: 1,030,927 slots/s / Avg: 1,183,765 slots/s)\n", - "18:37:25 INFO loader :: Index SPO->OSP: 2,200,000 slots (Batch: 1,052,631 slots/s / Avg: 1,177,100 slots/s)\n", - "18:37:25 INFO loader :: Index SPO->OSP: 2,300,000 slots (Batch: 1,030,927 slots/s / Avg: 1,169,888 slots/s)\n", - "18:37:25 INFO loader :: Index SPO->OSP: 2,400,000 slots (Batch: 1,075,268 slots/s / Avg: 1,165,614 slots/s)\n", - "18:37:25 INFO loader :: Index SPO->OSP: 2,500,000 slots (Batch: 980,392 slots/s / Avg: 1,156,871 slots/s)\n", - "18:37:26 INFO loader :: Index SPO->OSP: 2,600,000 slots (Batch: 1,041,666 slots/s / Avg: 1,151,971 slots/s)\n", - "18:37:26 INFO loader :: Index SPO->OSP: 2,700,000 slots (Batch: 970,873 slots/s / Avg: 1,144,067 slots/s)\n", - "18:37:26 INFO loader :: Index SPO->OSP: 2,800,000 slots (Batch: 1,063,829 slots/s / Avg: 1,140,994 slots/s)\n", - "18:37:26 INFO loader :: Index SPO->OSP: 2,900,000 slots 
(Batch: 1,000,000 slots/s / Avg: 1,135,473 slots/s)\n", - "18:37:26 INFO loader :: Index SPO->OSP: 3,000,000 slots (Batch: 952,380 slots/s / Avg: 1,128,243 slots/s)\n", - "18:37:26 INFO loader :: Elapsed: 57.19 seconds [2024/03/11 18:37:26 UTC]\n", - "18:37:26 INFO loader :: Index SPO->OSP: 3,100,000 slots (Batch: 925,925 slots/s / Avg: 1,120,346 slots/s)\n", - "18:37:26 INFO loader :: Index SPO->OSP: 3,200,000 slots (Batch: 917,431 slots/s / Avg: 1,112,656 slots/s)\n", - "18:37:26 INFO loader :: Index SPO->OSP: 3,300,000 slots (Batch: 909,090 slots/s / Avg: 1,105,157 slots/s)\n", - "18:37:26 INFO loader :: Index SPO->OSP: 3,400,000 slots (Batch: 934,579 slots/s / Avg: 1,099,256 slots/s)\n", - "18:37:26 INFO loader :: Index SPO->OSP: 3,500,000 slots (Batch: 952,380 slots/s / Avg: 1,094,434 slots/s)\n", - "18:37:27 INFO loader :: Index SPO->OSP: 3,600,000 slots (Batch: 934,579 slots/s / Avg: 1,089,258 slots/s)\n", - "18:37:27 INFO loader :: Index SPO->OSP: 3,700,000 slots (Batch: 1,041,666 slots/s / Avg: 1,087,915 slots/s)\n", - "18:37:27 INFO loader :: Index SPO->OSP: 3,800,000 slots (Batch: 1,075,268 slots/s / Avg: 1,087,578 slots/s)\n", - "18:37:27 INFO loader :: Index SPO->OSP: 3,900,000 slots (Batch: 1,010,101 slots/s / Avg: 1,085,443 slots/s)\n", - "18:37:27 INFO loader :: Index SPO->OSP: 4,000,000 slots (Batch: 1,020,408 slots/s / Avg: 1,083,717 slots/s)\n", - "18:37:27 INFO loader :: Elapsed: 58.22 seconds [2024/03/11 18:37:27 UTC]\n", - "18:37:27 INFO loader :: Index SPO->OSP: 4,100,000 slots (Batch: 970,873 slots/s / Avg: 1,080,653 slots/s)\n", - "18:37:27 INFO loader :: Index SPO->OSP: 4,200,000 slots (Batch: 1,000,000 slots/s / Avg: 1,078,582 slots/s)\n", - "18:37:27 INFO loader :: Index SPO->OSP: 4,300,000 slots (Batch: 1,052,631 slots/s / Avg: 1,077,964 slots/s)\n", - "18:37:27 INFO loader :: Index SPO->OSP: 4,400,000 slots (Batch: 1,020,408 slots/s / Avg: 1,076,584 slots/s)\n", - "18:37:27 INFO loader :: Index SPO->OSP: 4,500,000 slots (Batch: 
1,000,000 slots/s / Avg: 1,074,755 slots/s)\n", - "18:37:28 INFO loader :: Index SPO->OSP: 4,600,000 slots (Batch: 1,010,101 slots/s / Avg: 1,073,261 slots/s)\n", - "18:37:28 INFO loader :: Index SPO->OSP: 4,700,000 slots (Batch: 952,380 slots/s / Avg: 1,070,371 slots/s)\n", - "18:37:28 INFO loader :: Index SPO->OSP: 4,800,000 slots (Batch: 1,000,000 slots/s / Avg: 1,068,804 slots/s)\n", - "18:37:28 INFO loader :: Index SPO->OSP: 4,900,000 slots (Batch: 1,020,408 slots/s / Avg: 1,067,770 slots/s)\n", - "18:37:28 INFO loader :: Index SPO->OSP: 5,000,000 slots (Batch: 1,041,666 slots/s / Avg: 1,067,235 slots/s)\n", - "18:37:28 INFO loader :: Elapsed: 59.22 seconds [2024/03/11 18:37:28 UTC]\n", - "18:37:28 INFO loader :: Index SPO->OSP: 5,100,000 slots (Batch: 970,873 slots/s / Avg: 1,065,162 slots/s)\n", - "18:37:28 INFO loader :: Index SPO->OSP: 5,200,000 slots (Batch: 1,010,101 slots/s / Avg: 1,064,047 slots/s)\n", - "18:37:28 INFO loader :: Index SPO->OSP: 5,300,000 slots (Batch: 1,010,101 slots/s / Avg: 1,062,976 slots/s)\n", - "18:37:28 INFO loader :: Index SPO->OSP: 5,400,000 slots (Batch: 1,010,101 slots/s / Avg: 1,061,946 slots/s)\n", - "18:37:28 INFO loader :: Index SPO->OSP: 5,500,000 slots (Batch: 970,873 slots/s / Avg: 1,060,138 slots/s)\n", - "18:37:29 INFO loader :: Index SPO->OSP: 5,600,000 slots (Batch: 970,873 slots/s / Avg: 1,058,401 slots/s)\n", - "18:37:29 INFO loader :: Index SPO->OSP: 5,700,000 slots (Batch: 952,380 slots/s / Avg: 1,056,338 slots/s)\n", - "18:37:29 INFO loader :: Index SPO->OSP: 5,800,000 slots (Batch: 980,392 slots/s / Avg: 1,054,929 slots/s)\n", - "18:37:29 INFO loader :: Index SPO->OSP: 5,900,000 slots (Batch: 869,565 slots/s / Avg: 1,051,131 slots/s)\n", - "18:37:29 INFO loader :: Index SPO->OSP: 6,000,000 slots (Batch: 826,446 slots/s / Avg: 1,046,389 slots/s)\n", - "18:37:29 INFO loader :: Elapsed: 60.27 seconds [2024/03/11 18:37:29 UTC]\n", - "18:37:29 INFO loader :: Index SPO->OSP: 6,100,000 slots (Batch: 854,700 slots/s 
/ Avg: 1,042,556 slots/s)\n", - "18:37:29 INFO loader :: Index SPO->OSP: 6,200,000 slots (Batch: 847,457 slots/s / Avg: 1,038,699 slots/s)\n", - "18:37:29 INFO loader :: Index SPO->OSP: 6,300,000 slots (Batch: 892,857 slots/s / Avg: 1,036,013 slots/s)\n", - "18:37:29 INFO loader :: Index SPO->OSP: 6,400,000 slots (Batch: 877,192 slots/s / Avg: 1,033,091 slots/s)\n", - "18:37:30 INFO loader :: Index SPO->OSP: 6,500,000 slots (Batch: 952,380 slots/s / Avg: 1,031,746 slots/s)\n", - "18:37:30 INFO loader :: Index SPO->OSP: 6,600,000 slots (Batch: 934,579 slots/s / Avg: 1,030,123 slots/s)\n", - "18:37:30 INFO loader :: Index SPO->OSP: 6,700,000 slots (Batch: 990,099 slots/s / Avg: 1,029,502 slots/s)\n", - "18:37:30 INFO loader :: ** Index SPO->OSP: 6,784,535 slots indexed in 6.60 seconds [Rate: 1,028,427.31 per second]\n", - "18:37:30 INFO loader :: -- Finish triples index phase\n", - "18:37:30 INFO loader :: ** 6,784,535 triples indexed in 14.45 seconds [Rate: 469,518.00 per second]\n", - "18:37:30 INFO loader :: -- Finish triples load\n", - "18:37:30 INFO loader :: ** Completed: 6,784,535 triples loaded in 61.13 seconds [Rate: 110,985.36 per second]\n", - "18:37:30 INFO loader :: -- Finish quads load\n" + "14:00:20 INFO loader :: -- Start triples data phase\n", + "14:00:20 INFO loader :: ** Load empty triples table\n", + "14:00:20 INFO loader :: -- Start quads data phase\n", + "14:00:20 INFO loader :: ** Load empty quads table\n", + "14:00:20 INFO loader :: Load: /fuseki-base/nmdc-db.nt.gz -- 2024/03/14 14:00:20 UTC\n", + "14:00:21 WARN riot :: [line: 29434, col: 92] Bad IRI: Not a valid UUID string: uuid:DELA-CB-T-13ba6115-12fc-47cc-8cb0-ebf65e1d23d1\n", + "14:00:22 WARN riot :: [line: 86977, col: 92] Bad IRI: Not a valid UUID string: uuid:TEAK-CB-T-0d2245d4-c6da-4723-95be-ca5aefe607de\n", + "14:00:22 INFO loader :: Add: 100,000 triples (Batch: 76,219 / Avg: 76,219)\n", + "14:00:22 WARN riot :: [line: 130251, col: 92] Bad IRI: Not a valid UUID string: 
uuid:CSF2-CB-T-6d29d97e-b8d7-4844-a8c3-cc181f4c9909\n", + "14:00:22 INFO loader :: Add: 200,000 triples (Batch: 130,718 / Avg: 96,292)\n", + "14:00:23 INFO loader :: Add: 300,000 triples (Batch: 149,253 / Avg: 109,210)\n", + "14:00:24 WARN riot :: [line: 379426, col: 92] Bad IRI: Not a valid UUID string: uuid:UT19-CB-B-7641dbf9-c478-4252-b2ef-dc6fc4c2baf7\n", + "14:00:24 INFO loader :: Add: 400,000 triples (Batch: 142,247 / Avg: 115,942)\n", + "14:00:24 WARN riot :: [line: 405624, col: 92] Bad IRI: Not a valid UUID string: uuid:WREF-CB-T-b7c5dd99-510e-403a-a79e-d45e0af78fb3\n", + "14:00:24 WARN riot :: [line: 419433, col: 92] Bad IRI: Not a valid UUID string: uuid:NOGP-CB-T-8efdeada-35cc-48e2-a3d2-ea0ac76b142b\n", + "14:00:24 WARN riot :: [line: 455259, col: 92] Bad IRI: Not a valid UUID string: uuid:JERC-CB-B-cfba0afb-7a48-4edc-93f8-1e89b3025c3d\n", + "14:00:25 INFO loader :: Add: 500,000 triples (Batch: 141,643 / Avg: 120,307)\n", + "14:00:25 WARN riot :: [line: 529239, col: 92] Bad IRI: Not a valid UUID string: uuid:ORNL-CB-T-349b5bdb-2a1b-4e74-aa7f-a5c973c1d12c\n", + "14:00:25 INFO loader :: Add: 600,000 triples (Batch: 136,425 / Avg: 122,724)\n", + "14:00:26 WARN riot :: [line: 634277, col: 92] Bad IRI: Not a valid UUID string: uuid:CSF1-CB-B-1171add7-ad79-4c39-9e8f-977fdf0fc9e8\n", + "14:00:26 INFO loader :: Add: 700,000 triples (Batch: 131,406 / Avg: 123,893)\n", + "14:00:26 WARN riot :: [line: 728514, col: 92] Bad IRI: Not a valid UUID string: uuid:OCTU-CB-T-9991b6ee-c1d0-4a65-b20e-43bafd3fa050\n", + "14:00:27 INFO loader :: Add: 800,000 triples (Batch: 130,208 / Avg: 124,649)\n", + "14:00:27 WARN riot :: [line: 804889, col: 92] Bad IRI: Not a valid UUID string: uuid:BLAN-CB-B-a59c1a9c-e301-4f88-aff4-cf8f072d9cbc\n", + "14:00:27 WARN riot :: [line: 883370, col: 92] Bad IRI: Not a valid UUID string: uuid:UT12-CB-T-97b626b3-a280-4e40-8212-282db79cbc69\n", + "14:00:28 INFO loader :: Add: 900,000 triples (Batch: 138,312 / Avg: 126,032)\n", + "14:00:28 WARN riot 
:: [line: 910406, col: 92] Bad IRI: Not a valid UUID string: uuid:TALL-CB-T-b47d748c-de50-4d2b-8bb3-85bcc20a231b\n", + "14:00:28 WARN riot :: [line: 949977, col: 92] Bad IRI: Not a valid UUID string: uuid:UT23-CB-T-c70411c3-0a2f-43ae-a58d-204fb8c48c46\n", + "14:00:28 INFO loader :: Add: 1,000,000 triples (Batch: 146,412 / Avg: 127,811)\n", + "14:00:28 INFO loader :: Elapsed: 7.83 seconds [2024/03/14 14:00:28 UTC]\n", + "14:00:28 WARN riot :: [line: 1007405, col: 92] Bad IRI: Not a valid UUID string: uuid:PHTU-CB-T-3c3d8153-072e-4f66-b7f8-a7c2f9fed5df\n", + "14:00:28 WARN riot :: [line: 1011304, col: 92] Bad IRI: Not a valid UUID string: uuid:NWBB-CB-T-3af18d1b-014b-4e70-8d96-6920244ddb79\n", + "14:00:29 WARN riot :: [line: 1089771, col: 92] Bad IRI: Not a valid UUID string: uuid:WSU2-CB-T-f0728f3d-65c8-4075-9d17-47ce2e1de63c\n", + "14:00:29 INFO loader :: Add: 1,100,000 triples (Batch: 138,504 / Avg: 128,715)\n", + "14:00:29 WARN riot :: [line: 1124604, col: 92] Bad IRI: Not a valid UUID string: uuid:SRR1-CB-B-c7d1d4e9-aa53-4c64-88c7-8f8ed276a613\n", + "14:00:30 INFO loader :: Add: 1,200,000 triples (Batch: 142,450 / Avg: 129,757)\n", + "14:00:30 WARN riot :: [line: 1231739, col: 92] Bad IRI: Not a valid UUID string: uuid:OCTU-CB-B-c486e8fb-c960-4f2a-97a7-9bb87a1c7147\n", + "14:00:30 INFO loader :: Add: 1,300,000 triples (Batch: 137,551 / Avg: 130,325)\n", + "14:00:31 INFO loader :: Add: 1,400,000 triples (Batch: 140,056 / Avg: 130,975)\n", + "14:00:32 WARN riot :: [line: 1482192, col: 92] Bad IRI: Not a valid UUID string: uuid:JERC-CB-T-f3a0b7b4-b141-4080-9d42-394e5ba1c35c\n", + "14:00:32 INFO loader :: Add: 1,500,000 triples (Batch: 136,986 / Avg: 131,360)\n", + "14:00:32 WARN riot :: [line: 1517367, col: 92] Bad IRI: Not a valid UUID string: uuid:WSU1-CB-T-90f166a5-20ce-4c9b-b4a4-4a587395ffae\n", + "14:00:33 WARN riot :: [line: 1596102, col: 92] Bad IRI: Not a valid UUID string: uuid:KONA-CB-B-504c5931-f7cf-47aa-a2c0-ce90bcb3a0e3\n", + "14:00:33 INFO loader :: 
Add: 1,600,000 triples (Batch: 140,252 / Avg: 131,882)\n", + "14:00:33 WARN riot :: [line: 1679348, col: 92] Bad IRI: Not a valid UUID string: uuid:NOGP-CB-B-c51d20a5-51d1-4b16-92cc-d885700656c3\n", + "14:00:33 INFO loader :: Add: 1,700,000 triples (Batch: 131,406 / Avg: 131,854)\n", + "14:00:33 WARN riot :: [line: 1716151, col: 92] Bad IRI: Not a valid UUID string: uuid:WY09-CB-T-40d5b98a-b736-4509-9f9f-24d39a3e338b\n", + "14:00:34 INFO loader :: Add: 1,800,000 triples (Batch: 131,406 / Avg: 131,829)\n", + "14:00:35 WARN riot :: [line: 1881675, col: 92] Bad IRI: Not a valid UUID string: uuid:KONZ-CB-T-e70befb2-f3a7-4cd7-afac-ef817885fe44\n", + "14:00:35 INFO loader :: Add: 1,900,000 triples (Batch: 131,578 / Avg: 131,816)\n", + "14:00:35 WARN riot :: [line: 1902966, col: 92] Bad IRI: Not a valid UUID string: uuid:STEI-CB-T-73e26647-1f1e-43cc-84b0-2fd658886f7d\n", + "14:00:35 WARN riot :: [line: 1924282, col: 92] Bad IRI: Not a valid UUID string: uuid:JORN-CB-T-ec1b45a6-c27d-4fb2-8d1f-2b1978300df0\n", + "14:00:35 WARN riot :: [line: 1939427, col: 92] Bad IRI: Not a valid UUID string: uuid:DSNY-CB-T-16477917-d489-45ca-a926-5acc7eaca071\n", + "14:00:35 WARN riot :: [line: 1962768, col: 92] Bad IRI: Not a valid UUID string: uuid:WY10-CB-T-b6ac5210-79d5-4142-8756-4babab75fca0\n", + "14:00:36 INFO loader :: Add: 2,000,000 triples (Batch: 144,717 / Avg: 132,406)\n", + "14:00:36 INFO loader :: Elapsed: 15.11 seconds [2024/03/14 14:00:36 UTC]\n", + "14:00:36 WARN riot :: [line: 2057907, col: 92] Bad IRI: Not a valid UUID string: uuid:WLUP-CB-B-24cc2fd3-3d49-4d66-9edb-08cc7fce1450\n", + "14:00:36 WARN riot :: [line: 2060786, col: 92] Bad IRI: Not a valid UUID string: uuid:KONZ-CB-B-ea6e2ff0-c546-49dd-9f72-c762a6db55cb\n", + "14:00:36 INFO loader :: Add: 2,100,000 triples (Batch: 135,317 / Avg: 132,542)\n", + "14:00:36 WARN riot :: [line: 2123325, col: 92] Bad IRI: Not a valid UUID string: uuid:ISNC-CB-T-4eb8355f-18a1-4efc-8524-506e64f0937d\n", + "14:00:36 WARN riot :: 
[line: 2127985, col: 92] Bad IRI: Not a valid UUID string: uuid:PSR2-CB-T-177b015d-e006-4aca-977d-c9c118f6ec69\n", + "14:00:37 WARN riot :: [line: 2162561, col: 92] Bad IRI: Not a valid UUID string: uuid:OKPF-CB-B-d7417aa2-8ef2-4dfe-a655-4c535ec5a5b3\n", + "14:00:37 WARN riot :: [line: 2190360, col: 92] Bad IRI: Not a valid UUID string: uuid:HARV-CB-T-a1fbe98a-a761-4a91-b8fc-67c4e0d5d292\n", + "14:00:37 INFO loader :: Add: 2,200,000 triples (Batch: 130,890 / Avg: 132,466)\n", + "14:00:38 INFO loader :: Add: 2,300,000 triples (Batch: 130,208 / Avg: 132,366)\n", + "14:00:38 WARN riot :: [line: 2330079, col: 92] Bad IRI: Not a valid UUID string: uuid:WY01-CB-B-1db0d919-0432-46d3-8835-8c2ec8cf4132\n", + "14:00:39 INFO loader :: Add: 2,400,000 triples (Batch: 136,425 / Avg: 132,530)\n", + "14:00:39 WARN riot :: [line: 2402638, col: 92] Bad IRI: Not a valid UUID string: uuid:OCTB-CB-B-40166778-406d-4f49-bcad-4555c3e038a8\n", + "14:00:39 INFO loader :: Add: 2,500,000 triples (Batch: 128,040 / Avg: 132,345)\n", + "14:00:40 WARN riot :: [line: 2582320, col: 92] Bad IRI: Not a valid UUID string: uuid:JORN-CB-B-a2dfa872-c0d4-49fe-a752-f24be601c13a\n", + "14:00:40 INFO loader :: Add: 2,600,000 triples (Batch: 128,205 / Avg: 132,180)\n", + "14:00:40 WARN riot :: [line: 2647477, col: 92] Bad IRI: Not a valid UUID string: uuid:WSU3-CB-B-c248f426-b677-44df-919e-a9bd18e6fa7b\n", + "14:00:41 WARN riot :: [line: 2691875, col: 92] Bad IRI: Not a valid UUID string: uuid:PSR1-CB-B-60b61105-073e-4d04-b3dc-452479bbae4b\n", + "14:00:41 INFO loader :: Add: 2,700,000 triples (Batch: 136,612 / Avg: 132,339)\n", + "14:00:41 WARN riot :: [line: 2725698, col: 92] Bad IRI: Not a valid UUID string: uuid:SJER-CB-B-e7d303b7-60ac-426c-a2ad-4216273b3d4a\n", + "14:00:42 INFO loader :: Add: 2,800,000 triples (Batch: 128,205 / Avg: 132,187)\n", + "14:00:42 INFO loader :: Add: 2,900,000 triples (Batch: 130,718 / Avg: 132,136)\n", + "14:00:43 WARN riot :: [line: 2976196, col: 92] Bad IRI: Not a valid UUID 
string: uuid:CPER-CB-B-22f145ec-fb3a-4b6e-bc1d-8de45d79b5a1\n", + "14:00:43 WARN riot :: [line: 2983952, col: 92] Bad IRI: Not a valid UUID string: uuid:NWBA-CB-B-1bfcbda1-368e-4963-b502-698284f44440\n", + "14:00:43 WARN riot :: [line: 2986515, col: 92] Bad IRI: Not a valid UUID string: uuid:STEI-CB-B-d3aa5d85-ac22-4dfb-9133-8b10ee3aaa09\n", + "14:00:43 INFO loader :: Add: 3,000,000 triples (Batch: 142,247 / Avg: 132,450)\n", + "14:00:43 INFO loader :: Elapsed: 22.65 seconds [2024/03/14 14:00:43 UTC]\n", + "14:00:43 WARN riot :: [line: 3027910, col: 92] Bad IRI: Not a valid UUID string: uuid:PSR1-CB-T-b15f1f00-c26a-47d1-969f-91faf5944d58\n", + "14:00:44 INFO loader :: Add: 3,100,000 triples (Batch: 148,588 / Avg: 132,916)\n", + "14:00:44 WARN riot :: [line: 3133348, col: 92] Bad IRI: Not a valid UUID string: uuid:RMNP-CB-B-f0507935-ba93-42db-9f27-9383e7932ac3\n", + "14:00:44 WARN riot :: [line: 3145906, col: 92] Bad IRI: Not a valid UUID string: uuid:UT32-CB-T-38e515a2-ddec-41e0-be29-4fce953451fa\n", + "14:00:44 INFO loader :: Add: 3,200,000 triples (Batch: 140,449 / Avg: 133,139)\n", + "14:00:45 WARN riot :: [line: 3259737, col: 92] Bad IRI: Not a valid UUID string: uuid:TREE-CB-B-f5b3d5de-f4d5-4e09-94e5-cdd343d22787\n", + "14:00:45 WARN riot :: [line: 3280917, col: 92] Bad IRI: Not a valid UUID string: uuid:UT23-CB-B-e76880f1-deec-4076-9e1a-1377fd94849a\n", + "14:00:45 WARN riot :: [line: 3298996, col: 92] Bad IRI: Not a valid UUID string: uuid:OKPF-CB-T-08bb4fcb-2668-4d6f-a7eb-8f76a862abaf\n", + "14:00:45 INFO loader :: Add: 3,300,000 triples (Batch: 140,056 / Avg: 133,338)\n", + "14:00:45 WARN riot :: [line: 3307274, col: 92] Bad IRI: Not a valid UUID string: uuid:WREF-CB-B-36d4dd11-2249-4825-b781-2feb79da2c29\n", + "14:00:45 WARN riot :: [line: 3338884, col: 92] Bad IRI: Not a valid UUID string: uuid:ONAQ-CB-B-d38e0cda-aca4-4bfa-848b-c7f8ad2cc65f\n", + "14:00:45 WARN riot :: [line: 3348085, col: 92] Bad IRI: Not a valid UUID string: 
uuid:CLBJ-CB-B-dfc9535a-74f1-44dd-8aad-996b1fa20111\n", + "14:00:46 WARN riot :: [line: 3385267, col: 92] Bad IRI: Not a valid UUID string: uuid:NWBC-CB-B-06fd881f-fe3c-43f0-8f75-1fda0772ed35\n", + "14:00:46 INFO loader :: Add: 3,400,000 triples (Batch: 145,348 / Avg: 133,663)\n", + "14:00:46 WARN riot :: [line: 3409205, col: 92] Bad IRI: Not a valid UUID string: uuid:ISCC-CB-B-e95d2dcd-dd4f-42b7-9345-730dbfc2aa53\n", + "14:00:46 WARN riot :: [line: 3413117, col: 92] Bad IRI: Not a valid UUID string: uuid:NIWO-CB-B-56096a54-0d7a-4838-9a98-0d68d8d73e0e\n", + "14:00:46 WARN riot :: [line: 3428116, col: 92] Bad IRI: Not a valid UUID string: uuid:SRR1-CB-T-be47ce48-f9bc-4898-8b81-199ace9dc65f\n", + "14:00:46 WARN riot :: [line: 3431580, col: 92] Bad IRI: Not a valid UUID string: uuid:WSU1-CB-B-19e2cefa-49f2-499b-841d-33cc7ed8e6a7\n", + "14:00:46 WARN riot :: [line: 3487475, col: 92] Bad IRI: Not a valid UUID string: uuid:WY10-CB-B-87bbe689-217d-48d9-b6c0-7d4bf8f3540b\n", + "14:00:47 WARN riot :: [line: 3494953, col: 92] Bad IRI: Not a valid UUID string: uuid:SJER-CB-T-3b94b737-8ae6-452a-a916-4bac93f7a750\n", + "14:00:47 INFO loader :: Add: 3,500,000 triples (Batch: 138,121 / Avg: 133,786)\n", + "14:00:47 INFO loader :: Add: 3,600,000 triples (Batch: 139,082 / Avg: 133,928)\n", + "14:00:48 INFO loader :: Add: 3,700,000 triples (Batch: 138,121 / Avg: 134,038)\n", + "14:00:48 WARN riot :: [line: 3710348, col: 92] Bad IRI: Not a valid UUID string: uuid:KONA-CB-T-326ddf51-8af8-4d02-b946-c561ab741cae\n", + "14:00:48 WARN riot :: [line: 3718476, col: 92] Bad IRI: Not a valid UUID string: uuid:NWBB-CB-B-7355ff1a-b15d-4b06-b9d1-79a293e32030\n", + "14:00:48 WARN riot :: [line: 3746609, col: 92] Bad IRI: Not a valid UUID string: uuid:WY15-CB-T-44303230-0b7b-4367-b3e3-22c9f70ddbce\n", + "14:00:49 INFO loader :: Add: 3,800,000 triples (Batch: 145,985 / Avg: 134,327)\n", + "14:00:49 INFO loader :: Add: 3,900,000 triples (Batch: 129,198 / Avg: 134,191)\n", + "14:00:49 WARN riot :: 
[line: 3900622, col: 92] Bad IRI: Not a valid UUID string: uuid:WY03-CB-B-37aa5072-bb2c-40da-a03f-cbd4acb0f135\n", + "14:00:50 WARN riot :: [line: 3911717, col: 92] Bad IRI: Not a valid UUID string: uuid:DELA-CB-B-dc7411ef-453e-4b91-9885-a69fce891f8b\n", + "14:00:50 WARN riot :: [line: 3920547, col: 92] Bad IRI: Not a valid UUID string: uuid:ORNL-CB-B-502e74cf-2cca-4d0c-800c-dc0580d8b54d\n", + "14:00:50 WARN riot :: [line: 3983119, col: 92] Bad IRI: Not a valid UUID string: uuid:HARV-CB-B-c2af199e-fe66-41fc-93a9-50166b4c6bb5\n", + "14:00:50 INFO loader :: Add: 4,000,000 triples (Batch: 125,156 / Avg: 133,949)\n", + "14:00:50 INFO loader :: Elapsed: 29.86 seconds [2024/03/14 14:00:50 UTC]\n", + "14:00:51 WARN riot :: [line: 4035717, col: 92] Bad IRI: Not a valid UUID string: uuid:RMNP-CB-T-55f69c6a-3849-4e45-83ba-1cdfbefc3818\n", + "14:00:51 WARN riot :: [line: 4091475, col: 92] Bad IRI: Not a valid UUID string: uuid:WLLO-CB-B-68043fc5-8096-41a1-9df6-b67edd414c4f\n", + "14:00:51 WARN riot :: [line: 4093310, col: 92] Bad IRI: Not a valid UUID string: uuid:BLAN-CB-T-298f4fa4-a6ae-4cd1-8ac4-ed7b239a6abf\n", + "14:00:51 INFO loader :: Add: 4,100,000 triples (Batch: 133,511 / Avg: 133,938)\n", + "14:00:51 WARN riot :: [line: 4111505, col: 92] Bad IRI: Not a valid UUID string: uuid:WSU2-CA-B-6ad1d2a3-0949-423a-a549-c801913e1d63\n", + "14:00:51 WARN riot :: [line: 4119100, col: 92] Bad IRI: Not a valid UUID string: uuid:SRER-CB-B-62d593bb-711a-4eb2-8359-403de43c9ae5\n", + "14:00:51 WARN riot :: [line: 4138222, col: 92] Bad IRI: Not a valid UUID string: uuid:OAES-CB-B-f1f5c32d-8cfb-4d78-ac69-dd89510c8d03\n", + "14:00:52 INFO loader :: Add: 4,200,000 triples (Batch: 127,877 / Avg: 133,787)\n", + "14:00:52 WARN riot :: [line: 4214618, col: 92] Bad IRI: Not a valid UUID string: uuid:CSF1-CB-T-8e2cab37-6db5-4377-8eab-293a58400041\n", + "14:00:53 INFO loader :: Add: 4,300,000 triples (Batch: 127,226 / Avg: 133,627)\n", + "14:00:53 WARN riot :: [line: 4347839, col: 92] Bad IRI: 
Not a valid UUID string: uuid:SCBI-CB-T-e76a2f85-b6a1-4480-9c67-5ca8390b8399\n", + "14:00:53 INFO loader :: Add: 4,400,000 triples (Batch: 138,696 / Avg: 133,738)\n", + "14:00:53 WARN riot :: [line: 4414644, col: 92] Bad IRI: Not a valid UUID string: uuid:OSBS-CB-B-330d6d90-17b2-4452-ba83-6e870e8cad10\n", + "14:00:54 WARN riot :: [line: 4461514, col: 92] Bad IRI: Not a valid UUID string: uuid:PPRH-CB-B-40a7fdcf-7d5a-4d3d-ae81-42253fc94370\n", + "14:00:54 WARN riot :: [line: 4463847, col: 92] Bad IRI: Not a valid UUID string: uuid:WY01-CB-T-087448a0-5ec3-4d9e-8fea-1f3c06daabfb\n", + "14:00:54 WARN riot :: [line: 4479696, col: 92] Bad IRI: Not a valid UUID string: uuid:WLUP-CB-T-a07898a8-bff9-4bc2-ae4c-3456825b1a81\n", + "14:00:54 INFO loader :: Add: 4,500,000 triples (Batch: 140,252 / Avg: 133,876)\n", + "14:00:55 WARN riot :: [line: 4568525, col: 92] Bad IRI: Not a valid UUID string: uuid:UNDE-CB-B-f1c58c85-e8dd-40ce-9f2f-0f30f101562c\n", + "14:00:55 INFO loader :: Add: 4,600,000 triples (Batch: 141,043 / Avg: 134,024)\n", + "14:00:55 WARN riot :: [line: 4604641, col: 92] Bad IRI: Not a valid UUID string: uuid:UKFS-CB-T-9a091afa-d16e-48de-b98c-82521e3a95fa\n", + "14:00:55 WARN riot :: [line: 4651438, col: 92] Bad IRI: Not a valid UUID string: uuid:SRER-CB-T-534f50bf-ef66-4ed1-90f1-533ef4b52528\n", + "14:00:55 WARN riot :: [line: 4656728, col: 92] Bad IRI: Not a valid UUID string: uuid:WSU3-CB-T-4aaf3e2e-30af-4cb3-bfb3-e4cbeae25f62\n", + "14:00:55 WARN riot :: [line: 4659575, col: 92] Bad IRI: Not a valid UUID string: uuid:MLSB-CB-B-2d4aa025-e90e-4ce4-b3b5-fd7ff2bdf80b\n", + "14:00:55 WARN riot :: [line: 4663441, col: 92] Bad IRI: Not a valid UUID string: uuid:FTA3-CB-B-6c80279c-0ca3-4835-bfea-501e43fdc1a6\n", + "14:00:55 WARN riot :: [line: 4689541, col: 92] Bad IRI: Not a valid UUID string: uuid:ANZA-CB-B-6bc68e7d-7383-4bbe-9789-2f897ab426ca\n", + "14:00:55 WARN riot :: [line: 4698464, col: 92] Bad IRI: Not a valid UUID string: 
uuid:FTA5-CB-T-6930cad4-38f4-4eab-8b53-45805fbcc40d\n", + "14:00:55 INFO loader :: Add: 4,700,000 triples (Batch: 140,449 / Avg: 134,155)\n", + "14:00:56 WARN riot :: [line: 4719901, col: 92] Bad IRI: Not a valid UUID string: uuid:PETF-CB-T-cef6133d-59d9-44f6-8c22-29664736616d\n", + "14:00:56 WARN riot :: [line: 4728546, col: 92] Bad IRI: Not a valid UUID string: uuid:SERC-CB-B-a13e7129-c37b-449a-8172-bf04166ae49e\n", + "14:00:56 WARN riot :: [line: 4755762, col: 92] Bad IRI: Not a valid UUID string: uuid:SCBI-CB-B-120b7ecd-9603-4d23-9241-642a4ee95ddd\n", + "14:00:56 WARN riot :: [line: 4758278, col: 92] Bad IRI: Not a valid UUID string: uuid:TALL-CB-B-306ac187-5bcc-436a-9078-e0de4df5ad2e\n", + "14:00:56 WARN riot :: [line: 4787185, col: 92] Bad IRI: Not a valid UUID string: uuid:DSNY-CB-B-acabb4de-4e97-4820-8d28-07a5b48cc286\n", + "14:00:56 WARN riot :: [line: 4788794, col: 92] Bad IRI: Not a valid UUID string: uuid:OCTB-CB-T-3fe4e64c-6555-4cf9-a1a1-267a3efd36c5\n", + "14:00:56 INFO loader :: Add: 4,800,000 triples (Batch: 124,378 / Avg: 133,936)\n", + "14:00:56 WARN riot :: [line: 4830350, col: 92] Bad IRI: Not a valid UUID string: uuid:WY09-CB-B-cfcfdb9b-5d73-41ea-b105-f3fbe1d56c2b\n", + "14:00:57 WARN riot :: [line: 4866773, col: 92] Bad IRI: Not a valid UUID string: uuid:GRSM-CB-T-fe711015-d950-4b51-98a3-5308f92ec8c7\n", + "14:00:57 INFO loader :: Add: 4,900,000 triples (Batch: 123,304 / Avg: 133,700)\n", + "14:00:57 WARN riot :: [line: 4934891, col: 92] Bad IRI: Not a valid UUID string: uuid:CLBJ-CB-T-1f0d1871-90b7-445d-906d-71685a8ccae2\n", + "14:00:57 WARN riot :: [line: 4938825, col: 92] Bad IRI: Not a valid UUID string: uuid:MOAB-CB-T-40bbee20-6cee-4430-bc25-2a5f21ce31fb\n", + "14:00:58 INFO loader :: Add: 5,000,000 triples (Batch: 134,952 / Avg: 133,725)\n", + "14:00:58 INFO loader :: Elapsed: 37.39 seconds [2024/03/14 14:00:58 UTC]\n", + "14:00:58 WARN riot :: [line: 5064745, col: 92] Bad IRI: Not a valid UUID string: 
uuid:MOAB-CB-B-03a9d128-c9b5-4bf8-a6cc-0757a0c5af8a\n", + "14:00:59 INFO loader :: Add: 5,100,000 triples (Batch: 139,082 / Avg: 133,826)\n", + "14:00:59 WARN riot :: [line: 5175541, col: 92] Bad IRI: Not a valid UUID string: uuid:OSBS-CB-T-f2be224e-7356-4cd0-b585-4f408c30b59f\n", + "14:00:59 INFO loader :: Add: 5,200,000 triples (Batch: 130,718 / Avg: 133,765)\n", + "14:01:00 WARN riot :: [line: 5241487, col: 92] Bad IRI: Not a valid UUID string: uuid:WLLO-CB-T-ddc7635d-24f0-4456-8f9f-3680b08db779\n", + "14:01:00 INFO loader :: Add: 5,300,000 triples (Batch: 132,625 / Avg: 133,743)\n", + "14:01:01 WARN riot :: [line: 5363679, col: 92] Bad IRI: Not a valid UUID string: uuid:DCFS-CB-B-2357f0eb-9b01-4316-b676-dbc444bbef1d\n", + "14:01:01 INFO loader :: Add: 5,400,000 triples (Batch: 140,252 / Avg: 133,858)\n", + "14:01:01 WARN riot :: [line: 5405978, col: 92] Bad IRI: Not a valid UUID string: uuid:WOOD-CB-T-2db0e51c-90fa-481e-9432-61da1dfb5e15\n", + "14:01:01 WARN riot :: [line: 5408316, col: 92] Bad IRI: Not a valid UUID string: uuid:LENO-CB-T-a3029b48-f488-41b7-901f-8696e7c46046\n", + "14:01:01 WARN riot :: [line: 5494153, col: 92] Bad IRI: Not a valid UUID string: uuid:PSR2-CB-B-a9123b93-226f-4ac9-86f6-da9030cb4603\n", + "14:01:01 INFO loader :: Add: 5,500,000 triples (Batch: 132,802 / Avg: 133,839)\n", + "14:01:02 WARN riot :: [line: 5595482, col: 92] Bad IRI: Not a valid UUID string: uuid:ISCC-CB-T-5c5875b4-abd6-4226-b0ab-fdc9637ebb98\n", + "14:01:02 INFO loader :: Add: 5,600,000 triples (Batch: 137,362 / Avg: 133,900)\n", + "14:01:03 WARN riot :: [line: 5691357, col: 92] Bad IRI: Not a valid UUID string: uuid:DCFS-CB-T-4ea961ae-3e1d-4e7f-aca8-615d23195a1a\n", + "14:01:03 INFO loader :: Add: 5,700,000 triples (Batch: 141,242 / Avg: 134,023)\n", + "14:01:03 WARN riot :: [line: 5726539, col: 92] Bad IRI: Not a valid UUID string: uuid:ONAQ-CB-T-3294fa6a-beb1-4bb2-a4b3-e2e13b50bcc3\n", + "14:01:03 WARN riot :: [line: 5741650, col: 92] Bad IRI: Not a valid UUID 
string: uuid:WY03-CB-T-2d5a8790-1c24-4059-97e9-668b51bf544d\n", + "14:01:03 WARN riot :: [line: 5770933, col: 92] Bad IRI: Not a valid UUID string: uuid:NWBC-CB-T-1ab40003-8f10-4a2a-9ba1-911200388f64\n", + "14:01:04 WARN riot :: [line: 5789020, col: 92] Bad IRI: Not a valid UUID string: uuid:SOAP-CB-B-3b4186ac-3481-4959-82a6-420961ffb8c3\n", + "14:01:04 INFO loader :: Add: 5,800,000 triples (Batch: 132,625 / Avg: 133,998)\n", + "14:01:04 WARN riot :: [line: 5832039, col: 92] Bad IRI: Not a valid UUID string: uuid:ISNC-CB-B-41c7672d-1b29-40e8-af49-137b03473cec\n", + "14:01:04 WARN riot :: [line: 5853245, col: 92] Bad IRI: Not a valid UUID string: uuid:ANZA-CB-T-a11a1232-abdc-4192-b792-f10d418ad550\n", + "14:01:04 WARN riot :: [line: 5892676, col: 92] Bad IRI: Not a valid UUID string: uuid:UKFS-CB-B-18631570-14bc-4401-80e0-85559c5eb037\n", + "14:01:04 INFO loader :: Add: 5,900,000 triples (Batch: 131,752 / Avg: 133,959)\n", + "14:01:05 WARN riot :: [line: 5913742, col: 92] Bad IRI: Not a valid UUID string: uuid:WY15-CB-B-e7803387-76a7-45de-8ea1-d6c4146b8a81\n", + "14:01:05 WARN riot :: [line: 5966676, col: 92] Bad IRI: Not a valid UUID string: uuid:PHTU-CB-B-c68c9d04-9501-416d-9150-f647665fe15b\n", + "14:01:05 WARN riot :: [line: 5972247, col: 92] Bad IRI: Not a valid UUID string: uuid:FTA3-CB-T-4116d3f6-5dfe-45c0-9989-6a6489238fbf\n", + "14:01:05 INFO loader :: Add: 6,000,000 triples (Batch: 133,155 / Avg: 133,946)\n", + "14:01:05 INFO loader :: Elapsed: 44.80 seconds [2024/03/14 14:01:05 UTC]\n", + "14:01:05 WARN riot :: [line: 6032950, col: 92] Bad IRI: Not a valid UUID string: uuid:PETF-CB-B-9196012d-042a-47b4-986a-21ead1040e31\n", + "14:01:06 WARN riot :: [line: 6042969, col: 92] Bad IRI: Not a valid UUID string: uuid:CPER-CB-T-b8027938-7ff4-46c1-8575-e23584f1e898\n", + "14:01:06 WARN riot :: [line: 6067813, col: 92] Bad IRI: Not a valid UUID string: uuid:NIWO-CB-T-8fa5540a-3ecc-43f3-8dea-9c9372d12826\n", + "14:01:06 WARN riot :: [line: 6076212, col: 92] Bad 
IRI: Not a valid UUID string: uuid:NWBA-CB-T-9c9b1f6c-5d5a-4c4a-a2bb-ab7c379844ee\n", + "14:01:06 INFO loader :: Add: 6,100,000 triples (Batch: 134,589 / Avg: 133,957)\n", + "14:01:07 WARN riot :: [line: 6176858, col: 92] Bad IRI: Not a valid UUID string: uuid:SERC-CB-T-a591f903-459e-409e-b1e8-f1a213d60bce\n", + "14:01:07 INFO loader :: Add: 6,200,000 triples (Batch: 131,926 / Avg: 133,923)\n", + "14:01:07 WARN riot :: [line: 6209858, col: 92] Bad IRI: Not a valid UUID string: uuid:WOOD-CB-B-8e2da002-85ce-474f-a13d-561087b17bd6\n", + "14:01:07 WARN riot :: [line: 6215167, col: 92] Bad IRI: Not a valid UUID string: uuid:UT12-CB-B-84465a7e-6d15-4aad-9d64-5345cb857450\n", + "14:01:07 WARN riot :: [line: 6261067, col: 92] Bad IRI: Not a valid UUID string: uuid:UNDE-CB-T-f15c6148-9ebf-41d1-812b-7ba7c15d9d96\n", + "14:01:07 WARN riot :: [line: 6284399, col: 92] Bad IRI: Not a valid UUID string: uuid:UT19-CB-T-cc59032d-d8e6-4da3-95eb-df9b2ce3d7e3\n", + "14:01:07 WARN riot :: [line: 6297155, col: 92] Bad IRI: Not a valid UUID string: uuid:SOAP-CB-T-e0235661-d3be-4195-bab0-105df1d4b9b8\n", + "14:01:07 INFO loader :: Add: 6,300,000 triples (Batch: 137,551 / Avg: 133,979)\n", + "14:01:08 INFO loader :: Add: 6,400,000 triples (Batch: 144,927 / Avg: 134,138)\n", + "14:01:08 WARN riot :: [line: 6415452, col: 92] Bad IRI: Not a valid UUID string: uuid:TREE-CB-T-03bc7e4f-a834-4d78-b733-e3d863ea640f\n", + "14:01:08 WARN riot :: [line: 6438845, col: 92] Bad IRI: Not a valid UUID string: uuid:LENO-CB-B-26b5381f-14b1-401e-a729-4e6d1b421c37\n", + "14:01:09 WARN riot :: [line: 6476158, col: 92] Bad IRI: Not a valid UUID string: uuid:CSF2-CB-B-c99abe38-cbf0-40d1-864a-6fb848212e8b\n", + "14:01:09 INFO loader :: Add: 6,500,000 triples (Batch: 130,039 / Avg: 134,073)\n", + "14:01:09 WARN riot :: [line: 6518173, col: 92] Bad IRI: Not a valid UUID string: uuid:GRSM-CB-B-a59d0367-64bc-4f5e-929d-805ff0b1bc0d\n", + "14:01:09 WARN riot :: [line: 6538477, col: 92] Bad IRI: Not a valid UUID string: 
uuid:FTA5-CB-B-f450aea0-e6fb-4596-9f8b-56dba4635a8b\n", + "14:01:09 WARN riot :: [line: 6538705, col: 92] Bad IRI: Not a valid UUID string: uuid:UT32-CB-B-145460ed-1b7d-4ee1-a90d-76f0b6ef62e8\n", + "14:01:09 WARN riot :: [line: 6566925, col: 92] Bad IRI: Not a valid UUID string: uuid:OAES-CB-T-f7a2632b-871d-4276-92c5-d8e3453d0221\n", + "14:01:10 INFO loader :: Add: 6,600,000 triples (Batch: 142,045 / Avg: 134,187)\n", + "14:01:10 WARN riot :: [line: 6643819, col: 92] Bad IRI: Not a valid UUID string: uuid:PPRH-CB-T-a18a96d5-cf4b-4707-ab31-c605aeddbfdf\n", + "14:01:10 INFO loader :: Add: 6,700,000 triples (Batch: 147,058 / Avg: 134,362)\n", + "14:01:11 WARN riot :: [line: 6776492, col: 92] Bad IRI: Not a valid UUID string: uuid:MLSB-CB-T-2ca22db1-1704-4174-9a45-69cc790ce9a5\n", + "14:01:11 INFO loader :: Add: 6,800,000 triples (Batch: 138,312 / Avg: 134,419)\n", + "14:01:11 WARN riot :: [line: 6801150, col: 92] Bad IRI: Not a valid UUID string: uuid:TEAK-CB-B-feb28ff2-0e1f-4be9-bd9a-f92521bd9d90\n", + "14:01:12 INFO loader :: Add: 6,900,000 triples (Batch: 139,082 / Avg: 134,484)\n", + "14:01:12 INFO loader :: -- Finish triples data phase\n", + "14:01:12 INFO loader :: ** Data: 6,991,623 triples loaded in 51.93 seconds [Rate: 134,635.53 per second]\n", + "14:01:12 INFO loader :: -- Finish quads data phase\n", + "14:01:12 INFO loader :: -- Start triples index phase\n", + "14:01:12 INFO loader :: Index SPO->POS: 100,000 slots (Batch: 1,000,000 slots/s / Avg: 1,000,000 slots/s)\n", + "14:01:13 INFO loader :: Index SPO->POS: 200,000 slots (Batch: 1,250,000 slots/s / Avg: 1,111,111 slots/s)\n", + "14:01:13 INFO loader :: Index SPO->POS: 300,000 slots (Batch: 1,219,512 slots/s / Avg: 1,145,038 slots/s)\n", + "14:01:13 INFO loader :: Index SPO->POS: 400,000 slots (Batch: 934,579 slots/s / Avg: 1,084,010 slots/s)\n", + "14:01:13 INFO loader :: Index SPO->POS: 500,000 slots (Batch: 1,111,111 slots/s / Avg: 1,089,324 slots/s)\n", + "14:01:13 INFO loader :: Index SPO->POS: 
600,000 slots (Batch: 1,176,470 slots/s / Avg: 1,102,941 slots/s)\n", + "14:01:13 INFO loader :: Index SPO->POS: 700,000 slots (Batch: 909,090 slots/s / Avg: 1,070,336 slots/s)\n", + "14:01:13 INFO loader :: Index SPO->POS: 800,000 slots (Batch: 1,190,476 slots/s / Avg: 1,084,010 slots/s)\n", + "14:01:13 INFO loader :: Index SPO->POS: 900,000 slots (Batch: 1,162,790 slots/s / Avg: 1,092,233 slots/s)\n", + "14:01:13 INFO loader :: Index SPO->POS: 1,000,000 slots (Batch: 1,123,595 slots/s / Avg: 1,095,290 slots/s)\n", + "14:01:13 INFO loader :: Elapsed: 52.85 seconds [2024/03/14 14:01:13 UTC]\n", + "14:01:13 INFO loader :: Index SPO->POS: 1,100,000 slots (Batch: 877,192 slots/s / Avg: 1,071,080 slots/s)\n", + "14:01:13 INFO loader :: Index SPO->POS: 1,200,000 slots (Batch: 1,123,595 slots/s / Avg: 1,075,268 slots/s)\n", + "14:01:14 INFO loader :: Index SPO->POS: 1,300,000 slots (Batch: 1,111,111 slots/s / Avg: 1,077,943 slots/s)\n", + "14:01:14 INFO loader :: Index SPO->POS: 1,400,000 slots (Batch: 884,955 slots/s / Avg: 1,061,410 slots/s)\n", + "14:01:14 INFO loader :: Index SPO->POS: 1,500,000 slots (Batch: 1,041,666 slots/s / Avg: 1,060,070 slots/s)\n", + "14:01:14 INFO loader :: Index SPO->POS: 1,600,000 slots (Batch: 1,075,268 slots/s / Avg: 1,061,007 slots/s)\n", + "14:01:14 INFO loader :: Index SPO->POS: 1,700,000 slots (Batch: 970,873 slots/s / Avg: 1,055,245 slots/s)\n", + "14:01:14 INFO loader :: Index SPO->POS: 1,800,000 slots (Batch: 840,336 slots/s / Avg: 1,040,462 slots/s)\n", + "14:01:14 INFO loader :: Index SPO->POS: 1,900,000 slots (Batch: 1,030,927 slots/s / Avg: 1,039,956 slots/s)\n", + "14:01:14 INFO loader :: Index SPO->POS: 2,000,000 slots (Batch: 1,030,927 slots/s / Avg: 1,039,501 slots/s)\n", + "14:01:14 INFO loader :: Elapsed: 53.86 seconds [2024/03/14 14:01:14 UTC]\n", + "14:01:14 INFO loader :: Index SPO->POS: 2,100,000 slots (Batch: 781,250 slots/s / Avg: 1,023,391 slots/s)\n", + "14:01:14 INFO loader :: Index SPO->POS: 2,200,000 slots 
(Batch: 925,925 slots/s / Avg: 1,018,518 slots/s)\n", + "14:01:15 INFO loader :: Index SPO->POS: 2,300,000 slots (Batch: 990,099 slots/s / Avg: 1,017,249 slots/s)\n", + "14:01:15 INFO loader :: Index SPO->POS: 2,400,000 slots (Batch: 925,925 slots/s / Avg: 1,013,085 slots/s)\n", + "14:01:15 INFO loader :: Index SPO->POS: 2,500,000 slots (Batch: 787,401 slots/s / Avg: 1,001,602 slots/s)\n", + "14:01:15 INFO loader :: Index SPO->POS: 2,600,000 slots (Batch: 892,857 slots/s / Avg: 996,932 slots/s)\n", + "14:01:15 INFO loader :: Index SPO->POS: 2,700,000 slots (Batch: 943,396 slots/s / Avg: 994,841 slots/s)\n", + "14:01:15 INFO loader :: Index SPO->POS: 2,800,000 slots (Batch: 980,392 slots/s / Avg: 994,318 slots/s)\n", + "14:01:15 INFO loader :: Index SPO->POS: 2,900,000 slots (Batch: 793,650 slots/s / Avg: 985,723 slots/s)\n", + "14:01:15 INFO loader :: Index SPO->POS: 3,000,000 slots (Batch: 884,955 slots/s / Avg: 981,996 slots/s)\n", + "14:01:15 INFO loader :: Elapsed: 54.99 seconds [2024/03/14 14:01:15 UTC]\n", + "14:01:16 INFO loader :: Index SPO->POS: 3,100,000 slots (Batch: 884,955 slots/s / Avg: 978,535 slots/s)\n", + "14:01:16 INFO loader :: Index SPO->POS: 3,200,000 slots (Batch: 943,396 slots/s / Avg: 977,397 slots/s)\n", + "14:01:16 INFO loader :: Index SPO->POS: 3,300,000 slots (Batch: 729,927 slots/s / Avg: 967,458 slots/s)\n", + "14:01:16 INFO loader :: Index SPO->POS: 3,400,000 slots (Batch: 869,565 slots/s / Avg: 964,265 slots/s)\n", + "14:01:16 INFO loader :: Index SPO->POS: 3,500,000 slots (Batch: 847,457 slots/s / Avg: 960,482 slots/s)\n", + "14:01:16 INFO loader :: Index SPO->POS: 3,600,000 slots (Batch: 917,431 slots/s / Avg: 959,232 slots/s)\n", + "14:01:16 INFO loader :: Index SPO->POS: 3,700,000 slots (Batch: 775,193 slots/s / Avg: 953,116 slots/s)\n", + "14:01:16 INFO loader :: Index SPO->POS: 3,800,000 slots (Batch: 934,579 slots/s / Avg: 952,619 slots/s)\n", + "14:01:16 INFO loader :: Index SPO->POS: 3,900,000 slots (Batch: 909,090 slots/s 
/ Avg: 951,451 slots/s)\n", + "14:01:17 INFO loader :: Index SPO->POS: 4,000,000 slots (Batch: 746,268 slots/s / Avg: 944,956 slots/s)\n", + "14:01:17 INFO loader :: Elapsed: 56.17 seconds [2024/03/14 14:01:17 UTC]\n", + "14:01:17 INFO loader :: Index SPO->POS: 4,100,000 slots (Batch: 900,900 slots/s / Avg: 943,830 slots/s)\n", + "14:01:17 INFO loader :: Index SPO->POS: 4,200,000 slots (Batch: 900,900 slots/s / Avg: 942,760 slots/s)\n", + "14:01:17 INFO loader :: Index SPO->POS: 4,300,000 slots (Batch: 892,857 slots/s / Avg: 941,537 slots/s)\n", + "14:01:17 INFO loader :: Index SPO->POS: 4,400,000 slots (Batch: 793,650 slots/s / Avg: 937,566 slots/s)\n", + "14:01:17 INFO loader :: Index SPO->POS: 4,500,000 slots (Batch: 884,955 slots/s / Avg: 936,329 slots/s)\n", + "14:01:17 INFO loader :: Index SPO->POS: 4,600,000 slots (Batch: 900,900 slots/s / Avg: 935,529 slots/s)\n", + "14:01:17 INFO loader :: Index SPO->POS: 4,700,000 slots (Batch: 892,857 slots/s / Avg: 934,579 slots/s)\n", + "14:01:17 INFO loader :: Index SPO->POS: 4,800,000 slots (Batch: 787,401 slots/s / Avg: 930,954 slots/s)\n", + "14:01:18 INFO loader :: Index SPO->POS: 4,900,000 slots (Batch: 892,857 slots/s / Avg: 930,144 slots/s)\n", + "14:01:18 INFO loader :: Index SPO->POS: 5,000,000 slots (Batch: 862,068 slots/s / Avg: 928,677 slots/s)\n", + "14:01:18 INFO loader :: Elapsed: 57.32 seconds [2024/03/14 14:01:18 UTC]\n", + "14:01:18 INFO loader :: Index SPO->POS: 5,100,000 slots (Batch: 833,333 slots/s / Avg: 926,598 slots/s)\n", + "14:01:18 INFO loader :: Index SPO->POS: 5,200,000 slots (Batch: 787,401 slots/s / Avg: 923,459 slots/s)\n", + "14:01:18 INFO loader :: Index SPO->POS: 5,300,000 slots (Batch: 884,955 slots/s / Avg: 922,701 slots/s)\n", + "14:01:18 INFO loader :: Index SPO->POS: 5,400,000 slots (Batch: 877,192 slots/s / Avg: 921,816 slots/s)\n", + "14:01:18 INFO loader :: Index SPO->POS: 5,500,000 slots (Batch: 862,068 slots/s / Avg: 920,656 slots/s)\n", + "14:01:18 INFO loader :: Index 
SPO->POS: 5,600,000 slots (Batch: 781,250 slots/s / Avg: 917,731 slots/s)\n", + "14:01:19 INFO loader :: Index SPO->POS: 5,700,000 slots (Batch: 806,451 slots/s / Avg: 915,515 slots/s)\n", + "14:01:19 INFO loader :: Index SPO->POS: 5,800,000 slots (Batch: 847,457 slots/s / Avg: 914,249 slots/s)\n", + "14:01:19 INFO loader :: Index SPO->POS: 5,900,000 slots (Batch: 806,451 slots/s / Avg: 912,183 slots/s)\n", + "14:01:19 INFO loader :: Index SPO->POS: 6,000,000 slots (Batch: 787,401 slots/s / Avg: 909,780 slots/s)\n", + "14:01:19 INFO loader :: Elapsed: 58.53 seconds [2024/03/14 14:01:19 UTC]\n", + "14:01:19 INFO loader :: Index SPO->POS: 6,100,000 slots (Batch: 800,000 slots/s / Avg: 907,738 slots/s)\n", + "14:01:19 INFO loader :: Index SPO->POS: 6,200,000 slots (Batch: 833,333 slots/s / Avg: 906,432 slots/s)\n", + "14:01:19 INFO loader :: Index SPO->POS: 6,300,000 slots (Batch: 826,446 slots/s / Avg: 905,042 slots/s)\n", + "14:01:19 INFO loader :: Index SPO->POS: 6,400,000 slots (Batch: 833,333 slots/s / Avg: 903,827 slots/s)\n", + "14:01:20 INFO loader :: Index SPO->POS: 6,500,000 slots (Batch: 826,446 slots/s / Avg: 902,527 slots/s)\n", + "14:01:20 INFO loader :: Index SPO->POS: 6,600,000 slots (Batch: 813,008 slots/s / Avg: 901,023 slots/s)\n", + "14:01:20 INFO loader :: Index SPO->POS: 6,700,000 slots (Batch: 769,230 slots/s / Avg: 898,725 slots/s)\n", + "14:01:20 INFO loader :: Index SPO->POS: 6,800,000 slots (Batch: 769,230 slots/s / Avg: 896,506 slots/s)\n", + "14:01:20 INFO loader :: Index SPO->POS: 6,900,000 slots (Batch: 793,650 slots/s / Avg: 894,825 slots/s)\n", + "14:01:20 INFO loader :: ** Index SPO->POS: 6,991,623 slots indexed in 7.82 seconds [Rate: 894,641.50 per second]\n", + "14:01:20 INFO loader :: Index SPO->OSP: 100,000 slots (Batch: 1,923,076 slots/s / Avg: 1,923,076 slots/s)\n", + "14:01:20 INFO loader :: Index SPO->OSP: 200,000 slots (Batch: 1,562,500 slots/s / Avg: 1,724,137 slots/s)\n", + "14:01:20 INFO loader :: Index SPO->OSP: 300,000 
slots (Batch: 1,562,500 slots/s / Avg: 1,666,666 slots/s)\n", + "14:01:20 INFO loader :: Index SPO->OSP: 400,000 slots (Batch: 1,470,588 slots/s / Avg: 1,612,903 slots/s)\n", + "14:01:20 INFO loader :: Index SPO->OSP: 500,000 slots (Batch: 1,298,701 slots/s / Avg: 1,538,461 slots/s)\n", + "14:01:21 INFO loader :: Index SPO->OSP: 600,000 slots (Batch: 1,470,588 slots/s / Avg: 1,526,717 slots/s)\n", + "14:01:21 INFO loader :: Index SPO->OSP: 700,000 slots (Batch: 1,408,450 slots/s / Avg: 1,508,620 slots/s)\n", + "14:01:21 INFO loader :: Index SPO->OSP: 800,000 slots (Batch: 1,250,000 slots/s / Avg: 1,470,588 slots/s)\n", + "14:01:21 INFO loader :: Index SPO->OSP: 900,000 slots (Batch: 1,250,000 slots/s / Avg: 1,442,307 slots/s)\n", + "14:01:21 INFO loader :: Index SPO->OSP: 1,000,000 slots (Batch: 1,250,000 slots/s / Avg: 1,420,454 slots/s)\n", + "14:01:21 INFO loader :: Elapsed: 60.46 seconds [2024/03/14 14:01:21 UTC]\n", + "14:01:21 INFO loader :: Index SPO->OSP: 1,100,000 slots (Batch: 1,111,111 slots/s / Avg: 1,385,390 slots/s)\n", + "14:01:21 INFO loader :: Index SPO->OSP: 1,200,000 slots (Batch: 1,176,470 slots/s / Avg: 1,365,187 slots/s)\n", + "14:01:21 INFO loader :: Index SPO->OSP: 1,300,000 slots (Batch: 1,234,567 slots/s / Avg: 1,354,166 slots/s)\n", + "14:01:21 INFO loader :: Index SPO->OSP: 1,400,000 slots (Batch: 1,234,567 slots/s / Avg: 1,344,860 slots/s)\n", + "14:01:21 INFO loader :: Index SPO->OSP: 1,500,000 slots (Batch: 1,190,476 slots/s / Avg: 1,333,333 slots/s)\n", + "14:01:21 INFO loader :: Index SPO->OSP: 1,600,000 slots (Batch: 1,149,425 slots/s / Avg: 1,320,132 slots/s)\n", + "14:01:21 INFO loader :: Index SPO->OSP: 1,700,000 slots (Batch: 1,136,363 slots/s / Avg: 1,307,692 slots/s)\n", + "14:01:22 INFO loader :: Index SPO->OSP: 1,800,000 slots (Batch: 1,136,363 slots/s / Avg: 1,296,829 slots/s)\n", + "14:01:22 INFO loader :: Index SPO->OSP: 1,900,000 slots (Batch: 1,136,363 slots/s / Avg: 1,287,262 slots/s)\n", + "14:01:22 INFO loader :: 
Index SPO->OSP: 2,000,000 slots (Batch: 1,149,425 slots/s / Avg: 1,279,590 slots/s)\n", + "14:01:22 INFO loader :: Elapsed: 61.31 seconds [2024/03/14 14:01:22 UTC]\n", + "14:01:22 INFO loader :: Index SPO->OSP: 2,100,000 slots (Batch: 1,041,666 slots/s / Avg: 1,265,822 slots/s)\n", + "14:01:22 INFO loader :: Index SPO->OSP: 2,200,000 slots (Batch: 1,063,829 slots/s / Avg: 1,254,991 slots/s)\n", + "14:01:22 INFO loader :: Index SPO->OSP: 2,300,000 slots (Batch: 1,086,956 slots/s / Avg: 1,246,612 slots/s)\n", + "14:01:22 INFO loader :: Index SPO->OSP: 2,400,000 slots (Batch: 1,136,363 slots/s / Avg: 1,241,593 slots/s)\n", + "14:01:22 INFO loader :: Index SPO->OSP: 2,500,000 slots (Batch: 1,098,901 slots/s / Avg: 1,235,177 slots/s)\n", + "14:01:22 INFO loader :: Index SPO->OSP: 2,600,000 slots (Batch: 1,123,595 slots/s / Avg: 1,230,477 slots/s)\n", + "14:01:22 INFO loader :: Index SPO->OSP: 2,700,000 slots (Batch: 1,075,268 slots/s / Avg: 1,223,934 slots/s)\n", + "14:01:22 INFO loader :: Index SPO->OSP: 2,800,000 slots (Batch: 1,075,268 slots/s / Avg: 1,217,920 slots/s)\n", + "14:01:23 INFO loader :: Index SPO->OSP: 2,900,000 slots (Batch: 1,086,956 slots/s / Avg: 1,212,881 slots/s)\n", + "14:01:23 INFO loader :: Index SPO->OSP: 3,000,000 slots (Batch: 1,075,268 slots/s / Avg: 1,207,729 slots/s)\n", + "14:01:23 INFO loader :: Elapsed: 62.24 seconds [2024/03/14 14:01:23 UTC]\n", + "14:01:23 INFO loader :: Index SPO->OSP: 3,100,000 slots (Batch: 1,063,829 slots/s / Avg: 1,202,482 slots/s)\n", + "14:01:23 INFO loader :: Index SPO->OSP: 3,200,000 slots (Batch: 990,099 slots/s / Avg: 1,194,475 slots/s)\n", + "14:01:23 INFO loader :: Index SPO->OSP: 3,300,000 slots (Batch: 1,000,000 slots/s / Avg: 1,187,477 slots/s)\n", + "14:01:23 INFO loader :: Index SPO->OSP: 3,400,000 slots (Batch: 1,030,927 slots/s / Avg: 1,182,197 slots/s)\n", + "14:01:23 INFO loader :: Index SPO->OSP: 3,500,000 slots (Batch: 961,538 slots/s / Avg: 1,174,496 slots/s)\n", + "14:01:23 INFO loader :: 
Index SPO->OSP: 3,600,000 slots (Batch: 1,041,666 slots/s / Avg: 1,170,351 slots/s)\n", + "14:01:23 INFO loader :: Index SPO->OSP: 3,700,000 slots (Batch: 1,030,927 slots/s / Avg: 1,166,088 slots/s)\n", + "14:01:23 INFO loader :: Index SPO->OSP: 3,800,000 slots (Batch: 1,030,927 slots/s / Avg: 1,162,079 slots/s)\n", + "14:01:24 INFO loader :: Index SPO->OSP: 3,900,000 slots (Batch: 934,579 slots/s / Avg: 1,154,871 slots/s)\n", + "14:01:24 INFO loader :: Index SPO->OSP: 4,000,000 slots (Batch: 952,380 slots/s / Avg: 1,148,765 slots/s)\n", + "14:01:24 INFO loader :: Elapsed: 63.23 seconds [2024/03/14 14:01:24 UTC]\n", + "14:01:24 INFO loader :: Index SPO->OSP: 4,100,000 slots (Batch: 1,000,000 slots/s / Avg: 1,144,611 slots/s)\n", + "14:01:24 INFO loader :: Index SPO->OSP: 4,200,000 slots (Batch: 1,010,101 slots/s / Avg: 1,140,994 slots/s)\n", + "14:01:24 INFO loader :: Index SPO->OSP: 4,300,000 slots (Batch: 952,380 slots/s / Avg: 1,135,763 slots/s)\n", + "14:01:24 INFO loader :: Index SPO->OSP: 4,400,000 slots (Batch: 970,873 slots/s / Avg: 1,131,396 slots/s)\n", + "14:01:24 INFO loader :: Index SPO->OSP: 4,500,000 slots (Batch: 1,010,101 slots/s / Avg: 1,128,385 slots/s)\n", + "14:01:24 INFO loader :: Index SPO->OSP: 4,600,000 slots (Batch: 1,010,101 slots/s / Avg: 1,125,519 slots/s)\n", + "14:01:24 INFO loader :: Index SPO->OSP: 4,700,000 slots (Batch: 925,925 slots/s / Avg: 1,120,381 slots/s)\n", + "14:01:24 INFO loader :: Index SPO->OSP: 4,800,000 slots (Batch: 925,925 slots/s / Avg: 1,115,500 slots/s)\n", + "14:01:25 INFO loader :: Index SPO->OSP: 4,900,000 slots (Batch: 970,873 slots/s / Avg: 1,112,119 slots/s)\n", + "14:01:25 INFO loader :: Index SPO->OSP: 5,000,000 slots (Batch: 961,538 slots/s / Avg: 1,108,647 slots/s)\n", + "14:01:25 INFO loader :: Elapsed: 64.26 seconds [2024/03/14 14:01:25 UTC]\n", + "14:01:25 INFO loader :: Index SPO->OSP: 5,100,000 slots (Batch: 980,392 slots/s / Avg: 1,105,810 slots/s)\n", + "14:01:25 INFO loader :: Index SPO->OSP: 
5,200,000 slots (Batch: 925,925 slots/s / Avg: 1,101,694 slots/s)\n", + "14:01:25 INFO loader :: Index SPO->OSP: 5,300,000 slots (Batch: 934,579 slots/s / Avg: 1,097,990 slots/s)\n", + "14:01:25 INFO loader :: Index SPO->OSP: 5,400,000 slots (Batch: 970,873 slots/s / Avg: 1,095,334 slots/s)\n", + "14:01:25 INFO loader :: Index SPO->OSP: 5,500,000 slots (Batch: 980,392 slots/s / Avg: 1,093,004 slots/s)\n", + "14:01:25 INFO loader :: Index SPO->OSP: 5,600,000 slots (Batch: 970,873 slots/s / Avg: 1,090,555 slots/s)\n", + "14:01:25 INFO loader :: Index SPO->OSP: 5,700,000 slots (Batch: 980,392 slots/s / Avg: 1,088,409 slots/s)\n", + "14:01:26 INFO loader :: Index SPO->OSP: 5,800,000 slots (Batch: 892,857 slots/s / Avg: 1,084,314 slots/s)\n", + "14:01:26 INFO loader :: Index SPO->OSP: 5,900,000 slots (Batch: 934,579 slots/s / Avg: 1,081,378 slots/s)\n", + "14:01:26 INFO loader :: Index SPO->OSP: 6,000,000 slots (Batch: 970,873 slots/s / Avg: 1,079,330 slots/s)\n", + "14:01:26 INFO loader :: Elapsed: 65.31 seconds [2024/03/14 14:01:26 UTC]\n", + "14:01:26 INFO loader :: Index SPO->OSP: 6,100,000 slots (Batch: 900,900 slots/s / Avg: 1,075,837 slots/s)\n", + "14:01:26 INFO loader :: Index SPO->OSP: 6,200,000 slots (Batch: 909,090 slots/s / Avg: 1,072,664 slots/s)\n", + "14:01:26 INFO loader :: Index SPO->OSP: 6,300,000 slots (Batch: 900,900 slots/s / Avg: 1,069,427 slots/s)\n", + "14:01:26 INFO loader :: Index SPO->OSP: 6,400,000 slots (Batch: 961,538 slots/s / Avg: 1,067,556 slots/s)\n", + "14:01:26 INFO loader :: Index SPO->OSP: 6,500,000 slots (Batch: 970,873 slots/s / Avg: 1,065,923 slots/s)\n", + "14:01:26 INFO loader :: Index SPO->OSP: 6,600,000 slots (Batch: 970,873 slots/s / Avg: 1,064,344 slots/s)\n", + "14:01:26 INFO loader :: Index SPO->OSP: 6,700,000 slots (Batch: 884,955 slots/s / Avg: 1,061,133 slots/s)\n", + "14:01:27 INFO loader :: Index SPO->OSP: 6,800,000 slots (Batch: 925,925 slots/s / Avg: 1,058,860 slots/s)\n", + "14:01:27 INFO loader :: Index 
SPO->OSP: 6,900,000 slots (Batch: 961,538 slots/s / Avg: 1,057,309 slots/s)\n", + "14:01:27 INFO loader :: ** Index SPO->OSP: 6,991,623 slots indexed in 6.62 seconds [Rate: 1,056,136.38 per second]\n", + "14:01:27 INFO loader :: -- Finish triples index phase\n", + "14:01:27 INFO loader :: ** 6,991,623 triples indexed in 14.44 seconds [Rate: 484,217.97 per second]\n", + "14:01:27 INFO loader :: -- Finish triples load\n", + "14:01:27 INFO loader :: ** Completed: 6,991,623 triples loaded in 66.37 seconds [Rate: 105,339.95 per second]\n", + "14:01:27 INFO loader :: -- Finish quads load\n" ] } ], @@ -2351,7 +1079,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 42, "id": "1a0bfb4b-e694-40b1-88af-4446e3fcc888", "metadata": {}, "outputs": [ @@ -2381,11 +1109,13 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 43, "id": "8695001d-9722-48a0-98e8-9ac5000551ea", "metadata": {}, "outputs": [], - "source": [] + "source": [ + "# 2024-03-14T09:40 : took <4min to run all the above." 
+ ] } ], "metadata": { diff --git a/nmdc_runtime/api/endpoints/nmdcschema.py b/nmdc_runtime/api/endpoints/nmdcschema.py index 4b2dd90c..ac0cdf7a 100644 --- a/nmdc_runtime/api/endpoints/nmdcschema.py +++ b/nmdc_runtime/api/endpoints/nmdcschema.py @@ -1,10 +1,15 @@ from importlib.metadata import version import pymongo +from SPARQLWrapper import SPARQLWrapper, JSON as SPARQL_JSON +from bson import json_util from fastapi import APIRouter, Depends, HTTPException from nmdc_runtime.minter.config import typecodes -from nmdc_runtime.util import nmdc_database_collection_names +from nmdc_runtime.util import ( + nmdc_database_collection_names, + collection_name_to_class_names, +) from pymongo.database import Database as MongoDatabase from starlette import status from toolz import dissoc @@ -12,9 +17,14 @@ from nmdc_runtime.api.core.metadata import map_id_to_collection, get_collection_for_id from nmdc_runtime.api.core.util import raise404_if_none from nmdc_runtime.api.db.mongo import get_mongo_db, nmdc_schema_collection_names -from nmdc_runtime.api.endpoints.util import list_resources +from nmdc_runtime.api.endpoints.util import list_resources, check_filter, FUSEKI_HOST from nmdc_runtime.api.models.metadata import Doc -from nmdc_runtime.api.models.util import ListRequest, ListResponse +from nmdc_runtime.api.models.util import ( + ListRequest, + ListResponse, + AssociationsRequest, + AssociationDirectionEnum, +) router = APIRouter() @@ -88,6 +98,88 @@ def get_nmdc_database_collection_stats( return stats +@router.get("/nmdcschema/associations") +def get_nmdc_schema_associations( + req: AssociationsRequest = Depends(), + mdb: MongoDatabase = Depends(get_mongo_db), +): + """ + For a given focus node of type nmdc:`start_type` that is found via `start_query`, + find target nodes of type nmdc:`target_type`. + + The `downstream` direction flows from studies to data objects, whereas `upstream` is the reverse, + traversing along the direction of dependency. 
+ + `start_query` uses [MongoDB-like language querying](https://www.mongodb.com/docs/manual/tutorial/query-documents/). + + You should not use the Swagger UI for values of `limit` much larger than `1000`. + Set `limit` to `0` (zero) for no limit. + """ + start_type_collection_name, target_type_collection_name = None, None + for k, v in collection_name_to_class_names.items(): + if req.start_type in v: + start_type_collection_name = k + if req.target_type in v: + target_type_collection_name = k + if start_type_collection_name is None: + raise HTTPException( + status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, + detail=f'start_type "{req.start_type}" is not a known nmdc-schema class', + ) + if target_type_collection_name is None: + raise HTTPException( + status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, + detail=f'target_type "{req.target_type}" is not a known nmdc-schema class', + ) + + filter_ = json_util.loads(check_filter(req.start_query)) + if mdb[start_type_collection_name].count_documents(filter_) > 1: + raise HTTPException( + status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, + detail=f'start_query "{req.start_query}" yields more than one entity.', + ) + focus_node_ids = ( + [d["id"] for d in mdb[start_type_collection_name].find(filter_, ["id"])] + if filter_ + else None + ) + + values_stmt = ( + f"VALUES ?focus_node {{ {' '.join(focus_node_ids)} }}" if focus_node_ids else "" + ) + start_pattern = f"?focus_node nmdc:type nmdc:{req.start_type} ." + target_pattern = f"?o nmdc:type nmdc:{req.target_type} ." + downstream_pattern = "?o nmdc:depends_on+ ?focus_node ." + upstream_pattern = "?focus_node nmdc:depends_on+ ?o ." 
+ upstream_where = ( + f"""{values_stmt} {start_pattern} {target_pattern} {upstream_pattern}""" + ) + downstream_where = ( + f"""{values_stmt} {start_pattern} {target_pattern} {downstream_pattern}""" + ) + limit = f"LIMIT {req.limit}" if req.limit != 0 else "" + query = f""" + PREFIX nmdc: <https://w3id.org/nmdc/> + SELECT DISTINCT ?o WHERE {{ + {downstream_where if req.direction == AssociationDirectionEnum.downstream else upstream_where} + }} {limit}""" + + sparql = SPARQLWrapper(f"{FUSEKI_HOST}/nmdc") + sparql.setReturnFormat(SPARQL_JSON) + sparql.setQuery(query) + try: + ret = sparql.queryAndConvert() + return [ + b["o"]["value"].replace("https://w3id.org/nmdc/", "nmdc:") + for b in ret["results"]["bindings"] + ] + except Exception as e: + raise HTTPException( + status_code=status.HTTP_502_BAD_GATEWAY, + detail=str(e), + ) + + @router.get( "/nmdcschema/{collection_name}", response_model=ListResponse[Doc], diff --git a/nmdc_runtime/api/endpoints/util.py b/nmdc_runtime/api/endpoints/util.py index b6cda5da..18361e5e 100644 --- a/nmdc_runtime/api/endpoints/util.py +++ b/nmdc_runtime/api/endpoints/util.py @@ -58,6 +58,7 @@ BASE_URL_INTERNAL = os.getenv("API_HOST") BASE_URL_EXTERNAL = os.getenv("API_HOST_EXTERNAL") HOSTNAME_EXTERNAL = BASE_URL_EXTERNAL.split("://", 1)[-1] +FUSEKI_HOST = os.getenv("FUSEKI_HOST") def check_filter(filter_: str): diff --git a/nmdc_runtime/api/models/util.py b/nmdc_runtime/api/models/util.py index c298560e..6b366b20 100644 --- a/nmdc_runtime/api/models/util.py +++ b/nmdc_runtime/api/models/util.py @@ -1,8 +1,9 @@ +from enum import Enum from typing import TypeVar, List, Optional, Generic, Annotated from fastapi import Query -from pydantic import model_validator, Field, BaseModel +from pydantic import model_validator, Field, BaseModel, NonNegativeInt from typing_extensions import Annotated ResultT = TypeVar("ResultT") @@ -39,6 +40,19 @@ class ListRequest(BaseModel): PerPageRange = Annotated[int, Field(gt=0, le=2_000)] +class AssociationDirectionEnum(str, Enum): +
upstream = "upstream" + downstream = "downstream" + + +class AssociationsRequest(BaseModel): + start_type: str + start_query: str + target_type: str + direction: AssociationDirectionEnum = AssociationDirectionEnum.downstream + limit: NonNegativeInt = 5 + + class FindRequest(BaseModel): filter: Optional[str] = None search: Optional[str] = None diff --git a/util/mongodump-nmdc.sh b/util/mongodump-nmdc.sh index 928f7b60..c214ee62 100755 --- a/util/mongodump-nmdc.sh +++ b/util/mongodump-nmdc.sh @@ -2,6 +2,10 @@ # Execute from repo root dir: # $ export $(grep -v '^#' .env.localhost.prod | xargs) # $ ./util/mongodump-nmdc.sh +# +# Note: consider getting a known backup of the production database, e.g. +# $ scp -r dtn01.nersc.gov:/global/cfs/cdirs/m3408/nmdc-mongodumps/dump_nmdc-prod_2024-03-11_20-12-02 . +# mongodump -h $MONGO_HOST -u $MONGO_USERNAME -p $MONGO_PASSWORD --authenticationDatabase=admin \ -d $MONGO_DBNAME \ --gzip -o $HOME/nmdcdb-mongodump/nmdcdb/$(date +"%Y-%m-%dT%H")/ \ From e9476552fccc7ba21b4d9767286be9bf312b8d6c Mon Sep 17 00:00:00 2001 From: Donny Winston Date: Thu, 14 Mar 2024 13:52:09 -0400 Subject: [PATCH 08/18] fix(sparql): add optional auth --- nmdc_runtime/api/endpoints/nmdcschema.py | 10 +++++++++- nmdc_runtime/api/endpoints/util.py | 2 ++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/nmdc_runtime/api/endpoints/nmdcschema.py b/nmdc_runtime/api/endpoints/nmdcschema.py index ac0cdf7a..6726324e 100644 --- a/nmdc_runtime/api/endpoints/nmdcschema.py +++ b/nmdc_runtime/api/endpoints/nmdcschema.py @@ -17,7 +17,13 @@ from nmdc_runtime.api.core.metadata import map_id_to_collection, get_collection_for_id from nmdc_runtime.api.core.util import raise404_if_none from nmdc_runtime.api.db.mongo import get_mongo_db, nmdc_schema_collection_names -from nmdc_runtime.api.endpoints.util import list_resources, check_filter, FUSEKI_HOST +from nmdc_runtime.api.endpoints.util import ( + list_resources, + check_filter, + FUSEKI_HOST, + FUSEKI_USER, + 
FUSEKI_PASSWD, +) from nmdc_runtime.api.models.metadata import Doc from nmdc_runtime.api.models.util import ( ListRequest, @@ -165,6 +171,8 @@ def get_nmdc_schema_associations( }} {limit}""" sparql = SPARQLWrapper(f"{FUSEKI_HOST}/nmdc") + sparql.user = FUSEKI_USER + sparql.passwd = FUSEKI_PASSWD sparql.setReturnFormat(SPARQL_JSON) sparql.setQuery(query) try: diff --git a/nmdc_runtime/api/endpoints/util.py b/nmdc_runtime/api/endpoints/util.py index 18361e5e..c843b401 100644 --- a/nmdc_runtime/api/endpoints/util.py +++ b/nmdc_runtime/api/endpoints/util.py @@ -59,6 +59,8 @@ BASE_URL_EXTERNAL = os.getenv("API_HOST_EXTERNAL") HOSTNAME_EXTERNAL = BASE_URL_EXTERNAL.split("://", 1)[-1] FUSEKI_HOST = os.getenv("FUSEKI_HOST") +FUSEKI_USER = os.getenv("FUSEKI_USER") +FUSEKI_PASSWD = os.getenv("FUSEKI_PASSWD") def check_filter(filter_: str): From dd1c234eb779d3b46557b7cbd7ce55a0f9593b26 Mon Sep 17 00:00:00 2001 From: Patrick Kalita Date: Mon, 11 Mar 2024 13:08:13 -0700 Subject: [PATCH 09/18] Upgrade to nmdc-schema 10.1.4 --- requirements/main.in | 2 +- requirements/main.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements/main.in b/requirements/main.in index f6579797..f195a543 100644 --- a/requirements/main.in +++ b/requirements/main.in @@ -24,7 +24,7 @@ mkdocs-jupyter mkdocs-material mkdocs-mermaid2-plugin motor -nmdc-schema==10.1.2 +nmdc-schema==10.1.4 openpyxl pandas passlib[bcrypt] diff --git a/requirements/main.txt b/requirements/main.txt index 1ba94ea0..72cd2d0d 100644 --- a/requirements/main.txt +++ b/requirements/main.txt @@ -460,7 +460,7 @@ nbformat==5.9.2 # nbconvert nest-asyncio==1.6.0 # via ipykernel -nmdc-schema==10.1.2 +nmdc-schema==10.1.4 # via -r requirements/main.in notebook==7.1.1 # via jupyter From 3d4b2a0de06be5dcdb644d24f84d034920bfbe6f Mon Sep 17 00:00:00 2001 From: eecavanna Date: Mon, 11 Mar 2024 14:17:10 -0700 Subject: [PATCH 10/18] Create notebook that migrates database from schema `10.0.0` to `10.1.4` --- 
.../notebooks/migrate_10_0_0_to_10_1_4.ipynb | 575 ++++++++++++++++++ 1 file changed, 575 insertions(+) create mode 100644 demo/metadata_migration/notebooks/migrate_10_0_0_to_10_1_4.ipynb diff --git a/demo/metadata_migration/notebooks/migrate_10_0_0_to_10_1_4.ipynb b/demo/metadata_migration/notebooks/migrate_10_0_0_to_10_1_4.ipynb new file mode 100644 index 00000000..7310077e --- /dev/null +++ b/demo/metadata_migration/notebooks/migrate_10_0_0_to_10_1_4.ipynb @@ -0,0 +1,575 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "initial_id", + "metadata": { + "collapsed": true + }, + "source": [ + "# Migrate MongoDB database from `nmdc-schema` `v10.0.0` to `v10.1.4`\n", + "\n", + "- TODO: Disable read/write access to the origin database during the migration process." + ] + }, + { + "cell_type": "markdown", + "id": "f65ad4ab", + "metadata": {}, + "source": [ + "## Prerequisites" + ] + }, + { + "cell_type": "markdown", + "id": "37d358ba", + "metadata": {}, + "source": [ + "### 1. Determine MongoDB collections involved.\n", + "\n", + "Here, you'll determine which MongoDB collections will be used as part of this migration.\n", + "\n", + "1. In the [`nmdc-schema` repo](https://github.com/microbiomedata/nmdc-schema/tree/main/nmdc_schema/migrators), go to the `nmdc_schema/migrators` directory and open the Python module whose name contains the two schema versions involved with this migration. For example, if migrating from schema version `A.B.C` to `X.Y.Z`, open the module named `migrator_from_A_B_C_to_X_Y_Z.py`.\n", + "2. Determine the collections that are accessed—whether for reading or for writing—by that module. **This is currently a manual process.**\n", + "3. Add their names to the `COLLECTION_NAMES` Python list below." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "09966b0d", + "metadata": { + "ExecuteTime": { + "end_time": "2024-03-11T20:22:46.752014Z", + "start_time": "2024-03-11T20:22:46.747606Z" + } + }, + "outputs": [], + "source": [ + "COLLECTION_NAMES: list[str] = [\n", + " \"data_object_set\"\n", + "]" + ] + }, + { + "cell_type": "markdown", + "id": "17f351e8", + "metadata": {}, + "source": [ + "### 2. Coordinate with stakeholders.\n", + "\n", + "Identify the people that read/write to those collections, or that maintain software that reads/writes to those collections. You can view a list of stakeholders in `./stakeholders.md`. \n", + "\n", + "Once you have identified those people, coordinate with them to agree on a time window for the migration. You can contact them via Slack, for example." + ] + }, + { + "cell_type": "markdown", + "id": "233a35c3", + "metadata": {}, + "source": [ + "### 3. Set up environment.\n", + "\n", + "Here, you'll prepare an environment for running this notebook.\n", + "\n", + "1. Start a **MongoDB server** on your local machine (and ensure it does **not** already contain a database named `nmdc`).\n", + " 1. You can start a [Docker](https://hub.docker.com/_/mongo)-based MongoDB server at `localhost:27055` by running this command (this MongoDB server will be accessible without a username or password).\n", + " ```shell\n", + " docker run --rm --detach --name mongo-migration-transformer -p 27055:27017 mongo:6.0.4\n", + " ```\n", + "2. Create and populate a **notebook configuration file** named `.notebook.env`.\n", + " 1. You can use `.notebook.env.example` as a template:\n", + " ```shell\n", + " $ cp .notebook.env.example .notebook.env\n", + " ```\n", + "3. Create and populate the two **MongoDB configuration files** that this notebook will use to connect to the \"origin\" and \"transformer\" MongoDB servers. 
The \"origin\" MongoDB server is the one that contains the database you want to migrate; and the \"transformer\" MongoDB server is the one you want to use to perform the data transformations. In practice, the \"origin\" MongoDB server is typically a remote server, and the \"transformer\" MongoDB server is typically a local server.\n", + " 1. You can use `.mongo.yaml.example` as a template:\n", + " ```shell\n", + " $ cp .mongo.yaml.example .mongo.origin.yaml\n", + " $ cp .mongo.yaml.example .mongo.transformer.yaml\n", + " ```\n", + " > When populating the file for the origin MongoDB server, use credentials that have **both read and write access** to the `nmdc` database." + ] + }, + { + "cell_type": "markdown", + "id": "69937b18", + "metadata": {}, + "source": [ + "## Procedure" + ] + }, + { + "cell_type": "markdown", + "id": "fe81196a", + "metadata": {}, + "source": [ + "### Install Python dependencies\n", + "\n", + "In this step, you'll [install](https://saturncloud.io/blog/what-is-the-difference-between-and-in-jupyter-notebooks/) the Python packages upon which this notebook depends.\n", + "\n", + "> Note: If the output of this cell says \"Note: you may need to restart the kernel to use updated packages\", restart the kernel (not the notebook) now." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e25a0af308c3185b", + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "%pip install -r requirements.txt\n", + "%pip install nmdc-schema==10.1.4" + ] + }, + { + "cell_type": "markdown", + "id": "a407c354", + "metadata": {}, + "source": [ + "### Import Python dependencies\n", + "\n", + "Import the Python objects upon which this notebook depends.\n", + "\n", + "> Note: One of the `import` statements is specific to this migration." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dbecd561", + "metadata": { + "ExecuteTime": { + "end_time": "2024-03-05T00:46:18.764498Z", + "start_time": "2024-03-05T00:46:18.202997Z" + } + }, + "outputs": [], + "source": [ + "# Stdlib packages:\n", + "from copy import deepcopy\n", + "\n", + "# Third-party packages:\n", + "import pymongo\n", + "from jsonschema import Draft7Validator, ValidationError\n", + "from nmdc_schema.nmdc_data import get_nmdc_jsonschema_dict\n", + "from nmdc_schema.migrators.adapters.mongo_adapter import MongoAdapter\n", + "\n", + "from nmdc_schema.migrators.migrator_from_10_0_0_to_10_1_2 import Migrator # note: the migrator to 10.1.2 was introduced in schema version 10.1.4\n", + "\n", + "# First-party packages:\n", + "from helpers import Config\n", + "from bookkeeper import Bookkeeper, MigrationEvent" + ] + }, + { + "cell_type": "markdown", + "id": "99b20ff4", + "metadata": {}, + "source": [ + "### Parse configuration files\n", + "\n", + "Parse the notebook and Mongo configuration files." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1eac645a", + "metadata": {}, + "outputs": [], + "source": [ + "cfg = Config()\n", + "\n", + "# Define some aliases we can use to make the shell commands in this notebook easier to read.\n", + "mongodump = cfg.mongodump_path\n", + "mongorestore = cfg.mongorestore_path\n", + "\n", + "# Perform a sanity test of the application paths.\n", + "!{mongodump} --version\n", + "!{mongorestore} --version" + ] + }, + { + "cell_type": "markdown", + "id": "68245d2b", + "metadata": {}, + "source": [ + "### Create MongoDB clients\n", + "\n", + "Create MongoDB clients you can use to access the \"origin\" and \"transformer\" MongoDB servers." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8e95f559", + "metadata": {}, + "outputs": [], + "source": [ + "# Mongo client for \"origin\" MongoDB server.\n", + "origin_mongo_client = pymongo.MongoClient(host=cfg.origin_mongo_server_uri, directConnection=True)\n", + "\n", + "# Mongo client for \"transformer\" MongoDB server.\n", + "transformer_mongo_client = pymongo.MongoClient(host=cfg.transformer_mongo_server_uri)\n", + "\n", + "# Perform sanity tests of those MongoDB clients' abilities to access their respective MongoDB servers.\n", + "with pymongo.timeout(3):\n", + " # Display the MongoDB server version (running on the \"origin\" Mongo server).\n", + " print(\"Origin Mongo server version: \" + origin_mongo_client.server_info()[\"version\"])\n", + "\n", + " # Sanity test: Ensure the origin database exists.\n", + " assert \"nmdc\" in origin_mongo_client.list_database_names(), \"Origin database does not exist.\"\n", + "\n", + " # Display the MongoDB server version (running on the \"transformer\" Mongo server).\n", + " print(\"Transformer Mongo server version: \" + transformer_mongo_client.server_info()[\"version\"])\n", + "\n", + " # Sanity test: Ensure the transformation database does not exist.\n", + " assert \"nmdc\" not in transformer_mongo_client.list_database_names(), \"Transformation database already exists.\"" + ] + }, + { + "cell_type": "markdown", + "id": "bc387abc62686091", + "metadata": { + "collapsed": false + }, + "source": [ + "### Create a bookkeeper\n", + "\n", + "Create a `Bookkeeper` that can be used to document migration events in the \"origin\" server." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5c982eb0c04e606d", + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "bookkeeper = Bookkeeper(mongo_client=origin_mongo_client)" + ] + }, + { + "cell_type": "markdown", + "id": "3975ac24", + "metadata": {}, + "source": [ + "### Create JSON Schema validator\n", + "\n", + "In this step, you'll create a JSON Schema validator for the NMDC Schema." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9e2dbb92", + "metadata": {}, + "outputs": [], + "source": [ + "def remove_id_pattern_constraints(nmdc_schema: dict) -> dict:\n", + " r\"\"\"\n", + " Returns a variant of the schema having no `$defs[*].properties.id.pattern` properties.\n", + "\n", + " Note: This algorithm was copied from the `without_id_patterns` function in `nmdc_runtime/util.py`.\n", + " \"\"\"\n", + " custom_schema = deepcopy(nmdc_schema)\n", + " for _, spec in custom_schema[\"$defs\"].items():\n", + " if \"properties\" in spec and \"id\" in spec[\"properties\"] and \"pattern\" in spec[\"properties\"][\"id\"]:\n", + " del spec[\"properties\"][\"id\"][\"pattern\"]\n", + " return custom_schema\n", + "\n", + "\n", + "# Make a version of the NMDC Schema that accepts so-called \"legacy IDs\".\n", + "nmdc_jsonschema: dict = remove_id_pattern_constraints(get_nmdc_jsonschema_dict())\n", + "nmdc_jsonschema_validator = Draft7Validator(nmdc_jsonschema)\n", + "\n", + "# Perform sanity tests of the NMDC Schema dictionary and the JSON Schema validator.\n", + "# Reference: https://python-jsonschema.readthedocs.io/en/latest/api/jsonschema/protocols/#jsonschema.protocols.Validator.check_schema\n", + "print(\"NMDC Schema title: \" + nmdc_jsonschema[\"title\"])\n", + "print(\"NMDC Schema version: \" + nmdc_jsonschema[\"version\"])\n", + "\n", + "nmdc_jsonschema_validator.check_schema(nmdc_jsonschema) # raises exception if schema is invalid" + ] + }, + { + "cell_type": "markdown", + "id": "fd4994a0", + 
"metadata": {}, + "source": [ + "### Dump collections from the \"origin\" MongoDB server\n", + "\n", + "Use `mongodump` to dump the collections involved in this migration **from** the \"origin\" MongoDB server **into** a local directory.\n", + "\n", + "> Since `mongodump` doesn't provide a CLI option we can use to specify the collections we _want_ the dump to include, we use multiple occurrences of the `--excludeCollection` CLI option to exclude each collection we do _not_ want the dump to include. The end result is the same—there's just that extra step involved." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cf8fa1ca", + "metadata": {}, + "outputs": [], + "source": [ + "# Build a string containing zero or more `--excludeCollection=\"...\"` options, which can be included in a `mongodump` command.\n", + "all_collection_names: list[str] = origin_mongo_client[\"nmdc\"].list_collection_names()\n", + "non_agenda_collection_names = [name for name in all_collection_names if name not in COLLECTION_NAMES]\n", + "exclusion_options = [f\"--excludeCollection='{name}'\" for name in non_agenda_collection_names]\n", + "exclusion_options_str = \" \".join(exclusion_options) # separates each option with a space\n", + "print(exclusion_options_str)\n", + "\n", + "# Dump the not-excluded collections from the \"origin\" database.\n", + "!{mongodump} \\\n", + " --config=\"{cfg.origin_mongo_config_file_path}\" \\\n", + " --db=\"nmdc\" \\\n", + " --gzip \\\n", + " --out=\"{cfg.origin_dump_folder_path}\" \\\n", + " {exclusion_options_str}" + ] + }, + { + "cell_type": "markdown", + "id": "c3e3c9c4", + "metadata": {}, + "source": [ + "### Load the dumped collections into the \"transformer\" MongoDB server\n", + "\n", + "Use `mongorestore` to load the dumped collections **from** the local directory **into** the \"transformer\" MongoDB server.\n", + "\n", + "> Since it's possible that the dump included extra collections (due to someone having created a collection 
between the time you generated the `--excludeCollection` CLI options and the time you ran `mongodump` above), we will use the `--nsInclude` CLI option to indicate which specific collections—from the dump—we want to load into the \"transformer\" database." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "418571c5", + "metadata": {}, + "outputs": [], + "source": [ + "# Build a string containing zero or more `--nsInclude=\"...\"` options, which can be included in a `mongorestore` command.\n", + "inclusion_options = [f\"--nsInclude='nmdc.{name}'\" for name in COLLECTION_NAMES]\n", + "inclusion_options_str = \" \".join(inclusion_options) # separates each option with a space\n", + "print(inclusion_options_str)\n", + "\n", + "# Restore the dumped collections to the \"transformer\" MongoDB server.\n", + "!{mongorestore} \\\n", + " --config=\"{cfg.transformer_mongo_config_file_path}\" \\\n", + " --gzip \\\n", + " --drop \\\n", + " --preserveUUID \\\n", + " --dir=\"{cfg.origin_dump_folder_path}\" \\\n", + " {inclusion_options_str}" + ] + }, + { + "cell_type": "markdown", + "id": "4c090068", + "metadata": {}, + "source": [ + "### Transform the collections within the \"transformer\" MongoDB server\n", + "\n", + "Use the migrator to transform the collections in the \"transformer\" database.\n", + "\n", + "> Reminder: The database transformation functions are defined in the `nmdc-schema` Python package installed earlier.\n", + "\n", + "> Reminder: The \"origin\" database is **not** affected by this step." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "05869340", + "metadata": {}, + "outputs": [], + "source": [ + "# Instantiate a MongoAdapter bound to the \"transformer\" database.\n", + "adapter = MongoAdapter(\n", + " database=transformer_mongo_client[\"nmdc\"],\n", + " # Note: These callbacks aren't supported yet, as of nmdc-schema 10.1.4.\n", + " # on_collection_created=lambda name: print(f'Created collection \"{name}\"'),\n", + " # on_collection_renamed=lambda old_name, name: print(f'Renamed collection \"{old_name}\" to \"{name}\"'),\n", + " # on_collection_deleted=lambda name: print(f'Deleted collection \"{name}\"'),\n", + ")\n", + "\n", + "# Instantiate a Migrator bound to that adapter.\n", + "migrator = Migrator(adapter=adapter)\n", + "\n", + "# Execute the Migrator's `upgrade` method to perform the migration.\n", + "migrator.upgrade()" + ] + }, + { + "cell_type": "markdown", + "id": "3edf77c7", + "metadata": {}, + "source": [ + "### Validate the transformed documents\n", + "\n", + "Now that we have transformed the database, validate each document in each collection in the \"transformer\" MongoDB server.\n", + "\n", + "> Reference: https://github.com/microbiomedata/nmdc-runtime/blob/main/metadata-translation/src/bin/validate_json.py" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "db6e432d", + "metadata": {}, + "outputs": [], + "source": [ + "for collection_name in COLLECTION_NAMES:\n", + " collection = transformer_mongo_client[\"nmdc\"][collection_name]\n", + " for document in collection.find():\n", + " # Validate the transformed document.\n", + " #\n", + " # Reference: https://github.com/microbiomedata/nmdc-schema/blob/main/src/docs/schema-validation.md\n", + " #\n", + " # Note: Dictionaries originating as Mongo documents include a Mongo-generated key named `_id`.
However,\n", + " # the NMDC Schema does not describe that key and, indeed, data validators consider dictionaries\n", + " # containing that key to be invalid with respect to the NMDC Schema. So, here, we validate a\n", + " # copy (i.e. a shallow copy) of the document that lacks that specific key.\n", + " #\n", + " # Note: `root_to_validate` is a dictionary having the shape: { \"some_collection_name\": [ some_document ] }\n", + " # Reference: https://docs.python.org/3/library/stdtypes.html#dict (see the \"type constructor\" section)\n", + " #\n", + " document_without_underscore_id_key = {key: value for key, value in document.items() if key != \"_id\"}\n", + " root_to_validate = dict([(collection_name, [document_without_underscore_id_key])])\n", + " try:\n", + " nmdc_jsonschema_validator.validate(root_to_validate) # raises exception if invalid\n", + " except ValidationError as err:\n", + " # Print the offending document (to facilitate debugging) before propagating the exception.\n", + " print(document)\n", + " raise err" + ] + }, + { + "cell_type": "markdown", + "id": "997fcb281d9d3222", + "metadata": { + "collapsed": false + }, + "source": [ + "### Indicate that the migration is underway\n", + "\n", + "Add an entry to the migration log collection to indicate that this migration has started." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fcafd862e1becb98", + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "bookkeeper.record_migration_event(migrator=migrator, event=MigrationEvent.MIGRATION_STARTED)" + ] + }, + { + "cell_type": "markdown", + "id": "1e0c8891", + "metadata": {}, + "source": [ + "### Dump the collections from the \"transformer\" MongoDB server\n", + "\n", + "Now that the collections have been transformed and validated, dump them **from** the \"transformer\" MongoDB server **into** a local directory."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ca49f61a", + "metadata": {}, + "outputs": [], + "source": [ + "# Dump the database from the \"transformer\" MongoDB server.\n", + "!{mongodump} \\\n", + " --config=\"{cfg.transformer_mongo_config_file_path}\" \\\n", + " --db=\"nmdc\" \\\n", + " --gzip \\\n", + " --out=\"{cfg.transformer_dump_folder_path}\" \\\n", + " {exclusion_options_str}" + ] + }, + { + "cell_type": "markdown", + "id": "d84bdc11", + "metadata": {}, + "source": [ + "### Load the collections into the \"origin\" MongoDB server\n", + "\n", + "Load the transformed collections into the \"origin\" MongoDB server, **replacing** the collections there that have the same names.\n", + "\n", + "> Note: If the migration involved renaming or deleting a collection, the collection having the original name will continue to exist in the \"origin\" database until someone deletes it manually." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1dfbcf0a", + "metadata": {}, + "outputs": [], + "source": [ + "# Replace the same-named collection(s) on the origin server, with the transformed one(s).\n", + "!{mongorestore} \\\n", + " --config=\"{cfg.origin_mongo_config_file_path}\" \\\n", + " --gzip \\\n", + " --verbose \\\n", + " --dir=\"{cfg.transformer_dump_folder_path}\" \\\n", + " --drop \\\n", + " --preserveUUID \\\n", + " {inclusion_options_str}" + ] + }, + { + "cell_type": "markdown", + "id": "ca5ee89a79148499", + "metadata": { + "collapsed": false + }, + "source": [ + "### Indicate that the migration is complete\n", + "\n", + "Add an entry to the migration log collection to indicate that this migration is complete." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d1eaa6c92789c4f3", + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "bookkeeper.record_migration_event(migrator=migrator, event=MigrationEvent.MIGRATION_COMPLETED)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From ff1c7fbfa20893d90342946ee4fefb75e6b59696 Mon Sep 17 00:00:00 2001 From: eecavanna Date: Mon, 11 Mar 2024 14:51:50 -0700 Subject: [PATCH 11/18] Temporarily patch known-invalid fields before validating documents --- .../notebooks/migrate_10_0_0_to_10_1_4.ipynb | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/demo/metadata_migration/notebooks/migrate_10_0_0_to_10_1_4.ipynb b/demo/metadata_migration/notebooks/migrate_10_0_0_to_10_1_4.ipynb index 7310077e..84c98241 100644 --- a/demo/metadata_migration/notebooks/migrate_10_0_0_to_10_1_4.ipynb +++ b/demo/metadata_migration/notebooks/migrate_10_0_0_to_10_1_4.ipynb @@ -437,6 +437,12 @@ " # Reference: https://docs.python.org/3/library/stdtypes.html#dict (see the \"type constructor\" section)\n", " #\n", " document_without_underscore_id_key = {key: value for key, value in document.items() if key != \"_id\"}\n", + " \n", + " # Patch documents that contain invalid data, while the team is figuring out how we'll handle these long term.\n", + " for field_name in [\"compression_type\", \"was_generated_by\"]:\n", + " if field_name in document_without_underscore_id_key and document_without_underscore_id_key[field_name] is None:\n", + " document_without_underscore_id_key[field_name] = \"\"\n", + "\n", " root_to_validate = 
dict([(collection_name, [document_without_underscore_id_key])])\n", " try:\n", " nmdc_jsonschema_validator.validate(root_to_validate) # raises exception if invalid\n", From 53a17945254d8f68a04678c0905b3e0dca19856b Mon Sep 17 00:00:00 2001 From: eecavanna Date: Mon, 11 Mar 2024 16:56:34 -0700 Subject: [PATCH 12/18] Refrain from patching known-invalid fields (will fix source database instead) This commit also includes a minor simplification to the `remove_id_pattern_constraints` function. --- .../notebooks/migrate_10_0_0_to_10_1_4.ipynb | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/demo/metadata_migration/notebooks/migrate_10_0_0_to_10_1_4.ipynb b/demo/metadata_migration/notebooks/migrate_10_0_0_to_10_1_4.ipynb index 84c98241..c57b7f18 100644 --- a/demo/metadata_migration/notebooks/migrate_10_0_0_to_10_1_4.ipynb +++ b/demo/metadata_migration/notebooks/migrate_10_0_0_to_10_1_4.ipynb @@ -278,7 +278,7 @@ " Note: This algorithm was copied from the `without_id_patterns` function in `nmdc_runtime/util.py`.\n", " \"\"\"\n", " custom_schema = deepcopy(nmdc_schema)\n", - " for _, spec in custom_schema[\"$defs\"].items():\n", + " for spec in custom_schema[\"$defs\"].values():\n", " if \"properties\" in spec and \"id\" in spec[\"properties\"] and \"pattern\" in spec[\"properties\"][\"id\"]:\n", " del spec[\"properties\"][\"id\"][\"pattern\"]\n", " return custom_schema\n", @@ -437,12 +437,6 @@ " # Reference: https://docs.python.org/3/library/stdtypes.html#dict (see the \"type constructor\" section)\n", " #\n", " document_without_underscore_id_key = {key: value for key, value in document.items() if key != \"_id\"}\n", - " \n", - " # Patch documents that contain invalid data, while the team is figuring out how we'll handle these long term.\n", - " for field_name in [\"compression_type\", \"was_generated_by\"]:\n", - " if field_name in document_without_underscore_id_key and document_without_underscore_id_key[field_name] is None:\n", - " 
document_without_underscore_id_key[field_name] = \"\"\n", - "\n", " root_to_validate = dict([(collection_name, [document_without_underscore_id_key])])\n", " try:\n", " nmdc_jsonschema_validator.validate(root_to_validate) # raises exception if invalid\n", From 3fbac747f2d8da8ae454595a325910200d4d69d3 Mon Sep 17 00:00:00 2001 From: Donny Winston Date: Fri, 15 Mar 2024 13:49:25 -0400 Subject: [PATCH 13/18] fix(validation): allow any valid nmdc:Database to [/v1]/workflows/activities (#497) Documents other than activities may be generated and submittable at the same time. closes #462 --- nmdc_runtime/api/endpoints/workflows.py | 9 +++------ nmdc_runtime/api/v1/workflows/activities.py | 4 ---- 2 files changed, 3 insertions(+), 10 deletions(-) diff --git a/nmdc_runtime/api/endpoints/workflows.py b/nmdc_runtime/api/endpoints/workflows.py index 445153d0..8d9029eb 100644 --- a/nmdc_runtime/api/endpoints/workflows.py +++ b/nmdc_runtime/api/endpoints/workflows.py @@ -69,7 +69,9 @@ async def post_activity( Parameters ------- activity_set: dict[str,Any] - Set of activities for specific workflows. + Set of activities for specific workflows, in the form of a nmdc:Database. + Other collections (such as data_object_set) are allowed, as they may be associated + with the activities submitted. 
Returns ------- @@ -78,10 +80,6 @@ async def post_activity( """ _ = site # must be authenticated try: - # verify activities in activity_set are nmdc-schema compliant - for collection_name in activity_set: - if collection_name not in activity_collection_names(mdb): - raise ValueError("keys must be nmdc-schema activity collection names`") # validate request JSON rv = validate_json(activity_set, mdb) if rv["result"] == "errors": @@ -97,7 +95,6 @@ async def post_activity( password=os.getenv("MONGO_PASSWORD"), ) mongo_resource.add_docs(activity_set, validate=False, replace=True) - # TODO: Update return value to List[Activity] return {"message": "jobs accepted"} except BulkWriteError as e: raise HTTPException(status_code=409, detail=str(e)) diff --git a/nmdc_runtime/api/v1/workflows/activities.py b/nmdc_runtime/api/v1/workflows/activities.py index 1112948d..4c490a14 100644 --- a/nmdc_runtime/api/v1/workflows/activities.py +++ b/nmdc_runtime/api/v1/workflows/activities.py @@ -46,10 +46,6 @@ async def post_activity( """ _ = site # must be authenticated try: - # verify activities in activity_set are nmdc-schema compliant - for collection_name in activity_set: - if collection_name not in activity_collection_names(mdb): - raise ValueError("keys must be nmdc-schema activity collection names`") # validate request JSON rv = validate_json(activity_set, mdb) if rv["result"] == "errors": From 2d42ddc0ac3340b2af308d784157d90e4f94ea13 Mon Sep 17 00:00:00 2001 From: Jing Cao Date: Wed, 20 Mar 2024 13:11:20 -0400 Subject: [PATCH 14/18] update docs for changesheet validator to include `remove` action (#499) * update changesheet docs to include `remove` action * Update author-changesheets.md * Update author-changesheets.md --------- Co-authored-by: Jing Co-authored-by: Donny Winston --- docs/howto-guides/author-changesheets.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/howto-guides/author-changesheets.md b/docs/howto-guides/author-changesheets.md index 
e4e9d2ef..6c7886f6 100644 --- a/docs/howto-guides/author-changesheets.md +++ b/docs/howto-guides/author-changesheets.md @@ -31,7 +31,8 @@ flowchart LR; * `id`: The *id* value corresponding to the *id* of the JSON document in the database. Specifying this will tell the changesheet what record in the database needs to be modified. There are no restrictions on the ids that can be modified. For example, it can be a Biosample *id* (with typecode *bsm*), or a Study *id* (with typecode *sty*), or another class of *id*. * `action`: The action to be performed on the database. It may be one of the following: * `insert` / `insert item` / `insert items`: Add new values to a multivalued field, i.e., a field/key on a document which captures a list of values instead of single values. - * `remove item` / `remove items`: Remove attributes/keys on a document. + * `remove`: Drop a key/value pair for a single-value slot. Leave `value` field empty on changesheet. + * `remove item` / `remove items`: Remove item(s) from a list/set of values for a multivalued slot. * `update` / `set` / `replace` / `replace items`: Update the value of a particular field/key on a document and replace it with a new value. * `attribute`: the name of the field/key in the NMDC JSON document that is to be modified. * `value`: New value, which may be added (if it wasn't present already) to a multi-valued field for an `insert` action. For an `update` action, it will overwrite any current value. 
From 0b1d441786d60fe881efe3717f5a257ce154be21 Mon Sep 17 00:00:00 2001 From: Donny Winston Date: Tue, 26 Mar 2024 14:01:35 -0400 Subject: [PATCH 15/18] fix: remove unused file (#500) thanks @eecavanna --- nmdc_runtime/minter/nmdc.schema.json | 5573 -------------------------- 1 file changed, 5573 deletions(-) delete mode 100644 nmdc_runtime/minter/nmdc.schema.json diff --git a/nmdc_runtime/minter/nmdc.schema.json b/nmdc_runtime/minter/nmdc.schema.json deleted file mode 100644 index 71442f7a..00000000 --- a/nmdc_runtime/minter/nmdc.schema.json +++ /dev/null @@ -1,5573 +0,0 @@ -{ - "$defs": { - "Activity": { - "additionalProperties": false, - "description": "a provence-generating activity", - "properties": { - "ended_at_time": { - "format": "date-time", - "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$", - "type": "string" - }, - "id": { - "description": "A unique identifier for a thing. 
Must be either a CURIE shorthand for a URI or a complete URI", - "pattern": "^(nmdc):act-([0-9][a-z]{0,6}[0-9])-([A-Za-z0-9]{1,})(\\.[A-Za-z0-9]{1,})*(_[A-Za-z0-9_\\.-]+)?$", - "type": "string" - }, - "name": { - "description": "A human readable label for an entity", - "type": "string" - }, - "started_at_time": { - "format": "date-time", - "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$", - "type": "string" - }, - "used": { - "type": "string" - }, - "was_associated_with": { - "$ref": "#/$defs/Agent" - }, - "was_informed_by": { - "type": "string" - } - }, - "required": [ - "id" - ], - "title": "Activity", - "type": "object" - }, - "Agent": { - "additionalProperties": false, - "description": "a provence-generating agent", - "properties": { - "acted_on_behalf_of": { - "$ref": "#/$defs/Agent" - }, - "was_informed_by": { - "type": "string" - } - }, - "title": "Agent", - "type": "object" - }, - "AnalysisTypeEnum": { - "description": "", - "enum": [ - "metabolomics", - "metagenomics", - "metaproteomics", - "metatranscriptomics", - "natural organic matter" - ], - "title": "AnalysisTypeEnum", - "type": "string" - }, - "AnalyticalSample": { - "additionalProperties": false, - "description": "", - "properties": { - "alternative_identifiers": { - "description": "A list of alternative identifiers for the entity.", - "items": { - "type": "string" - }, - "type": "array" - }, - "description": { - "description": "a human-readable description of a thing", - "type": "string" - }, - "id": { - "description": "A unique identifier for a thing. 
Must be either a CURIE shorthand for a URI or a complete URI", - "pattern": "^(nmdc):ansm-([0-9][a-z]{0,6}[0-9])-([A-Za-z0-9]{1,})(\\.[A-Za-z0-9]{1,})*(_[A-Za-z0-9_\\.-]+)?$", - "type": "string" - }, - "name": { - "description": "A human readable label for an entity", - "type": "string" - } - }, - "required": [ - "id" - ], - "title": "AnalyticalSample", - "type": "object" - }, - "ArchStrucEnum": { - "description": "", - "enum": [ - "building", - "shed", - "home" - ], - "title": "ArchStrucEnum", - "type": "string" - }, - "AttributeValue": { - "additionalProperties": false, - "description": "The value for any value of a attribute for a sample. This object can hold both the un-normalized atomic value and the structured value", - "properties": { - "has_raw_value": { - "description": "The value that was specified for an annotation in raw form, i.e. a string. E.g. \"2 cm\" or \"2-4 cm\"", - "type": "string" - }, - "was_generated_by": { - "type": "string" - } - }, - "title": "AttributeValue", - "type": "object" - }, - "BiolStatEnum": { - "description": "", - "enum": [ - "wild", - "natural", - "semi-natural", - "inbred line", - "breeder's line", - "hybrid", - "clonal selection", - "mutant" - ], - "title": "BiolStatEnum", - "type": "string" - }, - "Biosample": { - "additionalProperties": false, - "description": "Biological source material which can be characterized by an experiment.", - "properties": { - "add_date": { - "description": "The date on which the information was added to the database.", - "type": "string" - }, - "agrochem_addition": { - "$ref": "#/$defs/QuantityValue", - "description": "Addition of fertilizers, pesticides, etc. 
- amount and time of applications" - }, - "air_temp_regm": { - "$ref": "#/$defs/QuantityValue", - "description": "Information about treatment involving an exposure to varying temperatures; should include the temperature, treatment regimen including how many times the treatment was repeated, how long each treatment lasted, and the start and end time of the entire treatment; can include different temperature regimens" - }, - "al_sat": { - "$ref": "#/$defs/QuantityValue", - "description": "Aluminum saturation (esp. For tropical soils)" - }, - "al_sat_meth": { - "$ref": "#/$defs/TextValue", - "description": "Reference or method used in determining Al saturation" - }, - "alkalinity": { - "$ref": "#/$defs/QuantityValue", - "description": "Alkalinity, the ability of a solution to neutralize acids to the equivalence point of carbonate or bicarbonate" - }, - "alkalinity_method": { - "$ref": "#/$defs/TextValue", - "description": "Method used for alkalinity measurement" - }, - "alkyl_diethers": { - "$ref": "#/$defs/QuantityValue", - "description": "Concentration of alkyl diethers" - }, - "alt": { - "$ref": "#/$defs/QuantityValue", - "description": "Altitude is a term used to identify heights of objects such as airplanes, space shuttles, rockets, atmospheric balloons and heights of places such as atmospheric layers and clouds. It is used to measure the height of an object which is above the earth's surface. In this context, the altitude measurement is the vertical distance between the earth's surface above sea level and the sampled position in the air" - }, - "alternative_identifiers": { - "description": "Unique identifier for a biosample submitted to additional resources. 
Matches the entity that has been submitted to NMDC", - "items": { - "type": "string" - }, - "type": "array" - }, - "aminopept_act": { - "$ref": "#/$defs/QuantityValue", - "description": "Measurement of aminopeptidase activity" - }, - "ammonium": { - "$ref": "#/$defs/QuantityValue", - "description": "Concentration of ammonium in the sample" - }, - "ammonium_nitrogen": { - "$ref": "#/$defs/QuantityValue", - "description": "Concentration of ammonium nitrogen in the sample" - }, - "analysis_type": { - "description": "Select all the data types associated or available for this biosample", - "items": { - "$ref": "#/$defs/AnalysisTypeEnum" - }, - "type": "array" - }, - "annual_precpt": { - "$ref": "#/$defs/QuantityValue", - "description": "The average of all annual precipitation values known, or an estimated equivalent value derived by such methods as regional indexes or Isohyetal maps." - }, - "annual_temp": { - "$ref": "#/$defs/QuantityValue", - "description": "Mean annual temperature" - }, - "bacteria_carb_prod": { - "$ref": "#/$defs/QuantityValue", - "description": "Measurement of bacterial carbon production" - }, - "biosample_categories": { - "items": { - "$ref": "#/$defs/BiosampleCategoryEnum" - }, - "type": "array" - }, - "biotic_regm": { - "$ref": "#/$defs/TextValue", - "description": "Information about treatment(s) involving use of biotic factors, such as bacteria, viruses or fungi." - }, - "biotic_relationship": { - "$ref": "#/$defs/TextValue", - "description": "Description of relationship(s) between the subject organism and other organism(s) it is associated with. E.g., parasite on species X; mutualist with species Y. 
The target organism is the subject of the relationship, and the other organism(s) is the object" - }, - "bishomohopanol": { - "$ref": "#/$defs/QuantityValue", - "description": "Concentration of bishomohopanol" - }, - "bromide": { - "$ref": "#/$defs/QuantityValue", - "description": "Concentration of bromide" - }, - "calcium": { - "$ref": "#/$defs/QuantityValue", - "description": "Concentration of calcium in the sample" - }, - "carb_nitro_ratio": { - "$ref": "#/$defs/QuantityValue", - "description": "Ratio of amount or concentrations of carbon to nitrogen" - }, - "chem_administration": { - "$ref": "#/$defs/ControlledTermValue", - "description": "List of chemical compounds administered to the host or site where sampling occurred, and when (e.g. Antibiotics, n fertilizer, air filter); can include multiple compounds. For chemical entities of biological interest ontology (chebi) (v 163), http://purl.bioontology.org/ontology/chebi" - }, - "chloride": { - "$ref": "#/$defs/QuantityValue", - "description": "Concentration of chloride in the sample" - }, - "chlorophyll": { - "$ref": "#/$defs/QuantityValue", - "description": "Concentration of chlorophyll" - }, - "climate_environment": { - "$ref": "#/$defs/TextValue", - "description": "Treatment involving an exposure to a particular climate; treatment regimen including how many times the treatment was repeated, how long each treatment lasted, and the start and end time of the entire treatment; can include multiple climates" - }, - "collected_from": { - "description": "The Site from which a Biosample was collected", - "type": "string" - }, - "collection_date": { - "$ref": "#/$defs/TimestampValue", - "description": "The time of sampling, either as an instance (single point in time) or interval. In case no exact time is available, the date/time can be right truncated i.e. 
all of these are valid times: 2008-01-23T19:23:10+00:00; 2008-01-23T19:23:10; 2008-01-23; 2008-01; 2008; Except: 2008-01; 2008 all are ISO8601 compliant" - }, - "collection_date_inc": { - "description": "Date the incubation was harvested/collected/ended. Only relevant for incubation samples.", - "type": "string" - }, - "collection_time": { - "description": "The time of sampling, either as an instance (single point) or interval.", - "type": "string" - }, - "collection_time_inc": { - "description": "Time the incubation was harvested/collected/ended. Only relevant for incubation samples.", - "type": "string" - }, - "community": { - "type": "string" - }, - "crop_rotation": { - "$ref": "#/$defs/TextValue", - "description": "Whether or not crop is rotated, and if yes, rotation schedule" - }, - "cur_land_use": { - "$ref": "#/$defs/TextValue", - "description": "Present state of sample site" - }, - "cur_vegetation": { - "$ref": "#/$defs/TextValue", - "description": "Vegetation classification from one or more standard classification systems, or agricultural crop" - }, - "cur_vegetation_meth": { - "$ref": "#/$defs/TextValue", - "description": "Reference or method used in vegetation classification" - }, - "density": { - "$ref": "#/$defs/QuantityValue", - "description": "Density of the sample, which is its mass per unit volume (aka volumetric mass density)" - }, - "depth": { - "$ref": "#/$defs/QuantityValue", - "description": "The vertical distance below local surface, e.g. for sediment or soil samples depth is measured from sediment or soil surface, respectively. Depth can be reported as an interval for subsurface samples." 
- }, - "description": { - "description": "a human-readable description of a thing", - "type": "string" - }, - "diss_carb_dioxide": { - "$ref": "#/$defs/QuantityValue", - "description": "Concentration of dissolved carbon dioxide in the sample or liquid portion of the sample" - }, - "diss_hydrogen": { - "$ref": "#/$defs/QuantityValue", - "description": "Concentration of dissolved hydrogen" - }, - "diss_inorg_carb": { - "$ref": "#/$defs/QuantityValue", - "description": "Dissolved inorganic carbon concentration in the sample, typically measured after filtering the sample using a 0.45 micrometer filter" - }, - "diss_inorg_phosp": { - "$ref": "#/$defs/QuantityValue", - "description": "Concentration of dissolved inorganic phosphorus in the sample" - }, - "diss_org_carb": { - "$ref": "#/$defs/QuantityValue", - "description": "Concentration of dissolved organic carbon in the sample, liquid portion of the sample, or aqueous phase of the fluid" - }, - "diss_org_nitro": { - "$ref": "#/$defs/QuantityValue", - "description": "Dissolved organic nitrogen concentration measured as; total dissolved nitrogen - NH4 - NO3 - NO2" - }, - "diss_oxygen": { - "$ref": "#/$defs/QuantityValue", - "description": "Concentration of dissolved oxygen" - }, - "dna_absorb1": { - "description": "260/280 measurement of DNA sample purity", - "type": "string" - }, - "dna_absorb2": { - "description": "260/230 measurement of DNA sample purity", - "type": "string" - }, - "dna_collect_site": { - "description": "Provide information on the site your DNA sample was collected from", - "type": "string" - }, - "dna_concentration": { - "maximum": 2000, - "minimum": 0, - "type": "string" - }, - "dna_cont_type": { - "$ref": "#/$defs/DnaContTypeEnum", - "description": "Tube or plate (96-well)" - }, - "dna_cont_well": { - "pattern": "^(?!A1|A12|H1|H12)(([A-H][1-9])|([A-H]1[0-2]))$", - "type": "string" - }, - "dna_container_id": { - "type": "string" - }, - "dna_dnase": { - "$ref": "#/$defs/DnaDnaseEnum" - }, - 
"dna_isolate_meth": { - "description": "Describe the method/protocol/kit used to extract DNA/RNA.", - "type": "string" - }, - "dna_organisms": { - "description": "List any organisms known or suspected to grow in co-culture, as well as estimated % of the organism in that culture.", - "type": "string" - }, - "dna_project_contact": { - "type": "string" - }, - "dna_samp_id": { - "type": "string" - }, - "dna_sample_format": { - "$ref": "#/$defs/DnaSampleFormatEnum", - "description": "Solution in which the DNA sample has been suspended" - }, - "dna_sample_name": { - "description": "Give the DNA sample a name that is meaningful to you. Sample names must be unique across all JGI projects and contain a-z, A-Z, 0-9, - and _ only.", - "type": "string" - }, - "dna_seq_project": { - "type": "string" - }, - "dna_seq_project_name": { - "type": "string" - }, - "dna_seq_project_pi": { - "type": "string" - }, - "dna_volume": { - "maximum": 1000, - "minimum": 0, - "type": "string" - }, - "dnase_rna": { - "$ref": "#/$defs/DnaseRnaEnum" - }, - "drainage_class": { - "$ref": "#/$defs/TextValue", - "description": "Drainage classification from a standard system such as the USDA system" - }, - "ecosystem": { - "description": "An ecosystem is a combination of a physical environment (abiotic factors) and all the organisms (biotic factors) that interact with this environment. Ecosystem is in position 1/5 in a GOLD path.", - "type": "string" - }, - "ecosystem_category": { - "description": "Ecosystem categories represent divisions within the ecosystem based on specific characteristics of the environment from where an organism or sample is isolated. Ecosystem category is in position 2/5 in a GOLD path.", - "type": "string" - }, - "ecosystem_subtype": { - "description": "Ecosystem subtypes represent further subdivision of Ecosystem types into more distinct subtypes. 
Ecosystem subtype is in position 4/5 in a GOLD path.", - "type": "string" - }, - "ecosystem_type": { - "description": "Ecosystem types represent things having common characteristics within the Ecosystem Category. These common characteristics based grouping is still broad but specific to the characteristics of a given environment. Ecosystem type is in position 3/5 in a GOLD path.", - "type": "string" - }, - "elev": { - "$ref": "#/$defs/QuantityValue", - "description": "Elevation of the sampling site is its height above a fixed reference point, most commonly the mean sea level. Elevation is mainly used when referring to points on the earth's surface, while altitude is used for points above the surface, such as an aircraft in flight or a spacecraft in orbit." - }, - "emsl_biosample_identifiers": { - "description": "A list of identifiers for the biosample from the EMSL database. This is used to link the biosample, as modeled by NMDC, to the biosample in the planned EMSL NEXUS database.", - "items": { - "type": "string" - }, - "type": "array" - }, - "env_broad_scale": { - "$ref": "#/$defs/ControlledIdentifiedTermValue", - "description": "Report the major environmental system the sample or specimen came from. The system(s) identified should have a coarse spatial grain, to provide the general environmental context of where the sampling was done (e.g. in the desert or a rainforest). We recommend using subclasses of EnvO\u2019s biome class: http://purl.obolibrary.org/obo/ENVO_00000428. EnvO documentation about how to use the field: https://github.com/EnvironmentOntology/envo/wiki/Using-ENVO-with-MIxS" - }, - "env_local_scale": { - "$ref": "#/$defs/ControlledIdentifiedTermValue", - "description": "Report the entity or entities which are in the sample or specimen\u2019s local vicinity and which you believe have significant causal influences on your sample or specimen. We recommend using EnvO terms which are of smaller spatial grain than your entry for env_broad_scale. 
Terms, such as anatomical sites, from other OBO Library ontologies which interoperate with EnvO (e.g. UBERON) are accepted in this field. EnvO documentation about how to use the field: https://github.com/EnvironmentOntology/envo/wiki/Using-ENVO-with-MIxS." - }, - "env_medium": { - "$ref": "#/$defs/ControlledIdentifiedTermValue", - "description": "Report the environmental material(s) immediately surrounding the sample or specimen at the time of sampling. We recommend using subclasses of 'environmental material' (http://purl.obolibrary.org/obo/ENVO_00010483). EnvO documentation about how to use the field: https://github.com/EnvironmentOntology/envo/wiki/Using-ENVO-with-MIxS . Terms from other OBO ontologies are permissible as long as they reference mass/volume nouns (e.g. air, water, blood) and not discrete, countable entities (e.g. a tree, a leaf, a table top)." - }, - "env_package": { - "$ref": "#/$defs/TextValue", - "description": "MIxS extension for reporting of measurements and observations obtained from one or more of the environments where the sample was obtained. All environmental packages listed here are further defined in separate subtables. By giving the name of the environmental package, a selection of fields can be made from the subtables and can be reported", - "pattern": "[air|built environment|host\\-associated|human\\-associated|human\\-skin|human\\-oral|human\\-gut|human\\-vaginal|hydrocarbon resources\\-cores|hydrocarbon resources\\-fluids\\/swabs|microbial mat\\/biofilm|misc environment|plant\\-associated|sediment|soil|wastewater\\/sludge|water]" - }, - "experimental_factor": { - "$ref": "#/$defs/ControlledTermValue", - "description": "Experimental factors are essentially the variable aspects of an experiment design which can be used to describe an experiment, or set of experiments, in an increasingly detailed manner. This field accepts ontology terms from Experimental Factor Ontology (EFO) and/or Ontology for Biomedical Investigations (OBI). 
For a browser of EFO (v 2.95) terms, please see http://purl.bioontology.org/ontology/EFO; for a browser of OBI (v 2018-02-12) terms please see http://purl.bioontology.org/ontology/OBI" - }, - "experimental_factor_other": { - "description": "Other details about your sample that you feel can't be accurately represented in the available columns.", - "type": "string" - }, - "extreme_event": { - "description": "Unusual physical events that may have affected microbial populations", - "type": "string" - }, - "fao_class": { - "$ref": "#/$defs/TextValue", - "description": "Soil classification from the FAO World Reference Database for Soil Resources. The list can be found at http://www.fao.org/nr/land/sols/soil/wrb-soil-maps/reference-groups" - }, - "filter_method": { - "description": "Type of filter used or how the sample was filtered", - "type": "string" - }, - "fire": { - "$ref": "#/$defs/TimestampValue", - "description": "Historical and/or physical evidence of fire" - }, - "flooding": { - "$ref": "#/$defs/TimestampValue", - "description": "Historical and/or physical evidence of flooding" - }, - "gaseous_environment": { - "$ref": "#/$defs/QuantityValue", - "description": "Use of conditions with differing gaseous environments; should include the name of gaseous compound, amount administered, treatment duration, interval and total experimental duration; can include multiple gaseous environment regimens" - }, - "geo_loc_name": { - "$ref": "#/$defs/TextValue", - "description": "The geographical origin of the sample as defined by the country or sea name followed by specific region name. 
Country or sea names should be chosen from the INSDC country list (http://insdc.org/country.html), or the GAZ ontology (http://purl.bioontology.org/ontology/GAZ)" - }, - "glucosidase_act": { - "$ref": "#/$defs/QuantityValue", - "description": "Measurement of glucosidase activity" - }, - "gold_biosample_identifiers": { - "description": "Unique identifier for a biosample submitted to GOLD that matches the NMDC submitted biosample", - "items": { - "type": "string" - }, - "pattern": "^GOLD:Gb[0-9]+$", - "type": "array" - }, - "growth_facil": { - "$ref": "#/$defs/ControlledTermValue", - "description": "Type of facility where the sampled plant was grown; controlled vocabulary: growth chamber, open top chamber, glasshouse, experimental garden, field. Alternatively use Crop Ontology (CO) terms, see http://www.cropontology.org/ontology/CO_715/Crop%20Research" - }, - "habitat": { - "type": "string" - }, - "heavy_metals": { - "$ref": "#/$defs/QuantityValue", - "description": "Heavy metals present in the sequenced sample and their concentrations. For multiple heavy metals and concentrations, add multiple copies of this field." 
- }, - "heavy_metals_meth": { - "$ref": "#/$defs/TextValue", - "description": "Reference or method used in determining heavy metals" - }, - "host_name": { - "type": "string" - }, - "humidity_regm": { - "$ref": "#/$defs/QuantityValue", - "description": "Information about treatment involving an exposure to varying degree of humidity; information about treatment involving use of growth hormones; should include amount of humidity administered, treatment regimen including how many times the treatment was repeated, how long each treatment lasted, and the start and end time of the entire treatment; can include multiple regimens" - }, - "id": { - "description": "An NMDC assigned unique identifier for a biosample submitted to NMDC.", - "pattern": "^(nmdc):bsm-([0-9][a-z]{0,6}[0-9])-([A-Za-z0-9]{1,})(\\.[A-Za-z0-9]{1,})*(_[A-Za-z0-9_\\.-]+)?$", - "type": "string" - }, - "igsn_biosample_identifiers": { - "description": "A list of identifiers for the biosample from the IGSN database.", - "items": { - "type": "string" - }, - "type": "array" - }, - "img_identifiers": { - "description": "A list of identifiers that relate the biosample to records in the IMG database.", - "items": { - "type": "string" - }, - "type": "array" - }, - "insdc_biosample_identifiers": { - "description": "identifiers for corresponding sample in INSDC", - "items": { - "type": "string" - }, - "pattern": "^biosample:SAM[NED]([A-Z])?[0-9]+$", - "type": "array" - }, - "isotope_exposure": { - "description": "List isotope exposure or addition applied to your sample.", - "type": "string" - }, - "lat_lon": { - "$ref": "#/$defs/GeolocationValue", - "description": "This is currently a required field but it's not clear if this should be required for human hosts" - }, - "lbc_thirty": { - "$ref": "#/$defs/QuantityValue", - "description": "lime buffer capacity, determined after 30 minute incubation" - }, - "lbceq": { - "$ref": "#/$defs/QuantityValue", - "description": "lime buffer capacity, determined at equilibrium 
after 5 day incubation" - }, - "light_regm": { - "$ref": "#/$defs/QuantityValue", - "description": "Information about treatment(s) involving exposure to light, including both light intensity and quality." - }, - "link_addit_analys": { - "$ref": "#/$defs/TextValue", - "description": "Link to additional analysis results performed on the sample" - }, - "link_class_info": { - "$ref": "#/$defs/TextValue", - "description": "Link to digitized soil maps or other soil classification information" - }, - "link_climate_info": { - "$ref": "#/$defs/TextValue", - "description": "Link to climate resource" - }, - "local_class": { - "$ref": "#/$defs/TextValue", - "description": "Soil classification based on local soil classification system" - }, - "local_class_meth": { - "$ref": "#/$defs/TextValue", - "description": "Reference or method used in determining the local soil classification" - }, - "location": { - "type": "string" - }, - "magnesium": { - "$ref": "#/$defs/QuantityValue", - "description": "Concentration of magnesium in the sample" - }, - "manganese": { - "$ref": "#/$defs/QuantityValue", - "description": "Concentration of manganese in the sample" - }, - "mean_frict_vel": { - "$ref": "#/$defs/QuantityValue", - "description": "Measurement of mean friction velocity" - }, - "mean_peak_frict_vel": { - "$ref": "#/$defs/QuantityValue", - "description": "Measurement of mean peak friction velocity" - }, - "micro_biomass_c_meth": { - "description": "Reference or method used in determining microbial biomass", - "type": "string" - }, - "micro_biomass_n_meth": { - "description": "Reference or method used in determining microbial biomass nitrogen", - "type": "string" - }, - "microbial_biomass_c": { - "description": "The part of the organic matter in the soil that constitutes living microorganisms smaller than 5-10 micrometer. 
If you keep this, you would need to have correction factors used for conversion to the final units", - "type": "string" - }, - "microbial_biomass_n": { - "description": "The part of the organic matter in the soil that constitutes living microorganisms smaller than 5-10 micrometer. If you keep this, you would need to have correction factors used for conversion to the final units", - "type": "string" - }, - "misc_param": { - "$ref": "#/$defs/QuantityValue", - "description": "Any other measurement performed or parameter collected, that is not listed here" - }, - "mod_date": { - "description": "The last date on which the database information was modified.", - "type": "string" - }, - "n_alkanes": { - "$ref": "#/$defs/QuantityValue", - "description": "Concentration of n-alkanes; can include multiple n-alkanes" - }, - "name": { - "description": "A human readable label for an entity", - "type": "string" - }, - "ncbi_taxonomy_name": { - "type": "string" - }, - "nitrate": { - "$ref": "#/$defs/QuantityValue", - "description": "Concentration of nitrate in the sample" - }, - "nitrate_nitrogen": { - "$ref": "#/$defs/QuantityValue", - "description": "Concentration of nitrate nitrogen in the sample" - }, - "nitrite": { - "$ref": "#/$defs/QuantityValue", - "description": "Concentration of nitrite in the sample" - }, - "nitrite_nitrogen": { - "$ref": "#/$defs/QuantityValue", - "description": "Concentration of nitrite nitrogen in the sample" - }, - "non_microb_biomass": { - "description": "Amount of biomass; should include the name for the part of biomass measured, e.g.insect, plant, total. 
Can include multiple measurements separated by ;", - "type": "string" - }, - "non_microb_biomass_method": { - "description": "Reference or method used in determining biomass", - "type": "string" - }, - "org_matter": { - "$ref": "#/$defs/QuantityValue", - "description": "Concentration of organic matter" - }, - "org_nitro": { - "$ref": "#/$defs/QuantityValue", - "description": "Concentration of organic nitrogen" - }, - "org_nitro_method": { - "description": "Method used for obtaining organic nitrogen", - "type": "string" - }, - "organism_count": { - "$ref": "#/$defs/QuantityValue", - "description": "Total cell count of any organism (or group of organisms) per gram, volume or area of sample, should include name of organism followed by count. The method that was used for the enumeration (e.g. qPCR, atp, mpn, etc.) Should also be provided. (example: total prokaryotes; 3.5e7 cells per ml; qpcr)" - }, - "other_treatment": { - "description": "Other treatments applied to your samples that are not applicable to the provided fields", - "type": "string" - }, - "oxy_stat_samp": { - "$ref": "#/$defs/TextValue", - "description": "Oxygenation status of sample" - }, - "part_of": { - "description": "Links a resource to another resource that either logically or physically includes it.", - "items": { - "type": "string" - }, - "type": "array" - }, - "part_org_carb": { - "$ref": "#/$defs/QuantityValue", - "description": "Concentration of particulate organic carbon" - }, - "perturbation": { - "$ref": "#/$defs/TextValue", - "description": "Type of perturbation, e.g. 
chemical administration, physical disturbance, etc., coupled with perturbation regimen including how many times the perturbation was repeated, how long each perturbation lasted, and the start and end time of the entire perturbation period; can include multiple perturbation types" - }, - "petroleum_hydrocarb": { - "$ref": "#/$defs/QuantityValue", - "description": "Concentration of petroleum hydrocarbon" - }, - "ph": { - "$ref": "#/$defs/QuantityValue", - "description": "Ph measurement of the sample, or liquid portion of sample, or aqueous phase of the fluid" - }, - "ph_meth": { - "$ref": "#/$defs/TextValue", - "description": "Reference or method used in determining ph" - }, - "phaeopigments": { - "$ref": "#/$defs/QuantityValue", - "description": "Concentration of phaeopigments; can include multiple phaeopigments" - }, - "phosphate": { - "$ref": "#/$defs/QuantityValue", - "description": "Concentration of phosphate" - }, - "phosplipid_fatt_acid": { - "$ref": "#/$defs/QuantityValue", - "description": "Concentration of phospholipid fatty acids; can include multiple values" - }, - "pool_dna_extracts": { - "$ref": "#/$defs/TextValue", - "description": "Indicate whether multiple DNA extractions were mixed. 
If the answer yes, the number of extracts that were pooled should be given" - }, - "potassium": { - "$ref": "#/$defs/QuantityValue", - "description": "Concentration of potassium in the sample" - }, - "pressure": { - "$ref": "#/$defs/QuantityValue", - "description": "Pressure to which the sample is subject to, in atmospheres" - }, - "profile_position": { - "$ref": "#/$defs/TextValue", - "description": "Cross-sectional position in the hillslope where sample was collected.sample area position in relation to surrounding areas" - }, - "project_id": { - "description": "Proposal IDs or names associated with dataset", - "type": "string" - }, - "proport_woa_temperature": { - "type": "string" - }, - "proposal_dna": { - "type": "string" - }, - "proposal_rna": { - "type": "string" - }, - "redox_potential": { - "$ref": "#/$defs/QuantityValue", - "description": "Redox potential, measured relative to a hydrogen cell, indicating oxidation or reduction potential" - }, - "rel_to_oxygen": { - "$ref": "#/$defs/TextValue", - "description": "Is this organism an aerobe, anaerobe? 
Please note that aerobic and anaerobic are valid descriptors for microbial environments" - }, - "replicate_number": { - "description": "If sending biological replicates, indicate the rep number here.", - "type": "string" - }, - "rna_absorb1": { - "description": "260/280 measurement of RNA sample purity", - "type": "string" - }, - "rna_absorb2": { - "description": "260/230 measurement of RNA sample purity", - "type": "string" - }, - "rna_collect_site": { - "description": "Provide information on the site your RNA sample was collected from", - "type": "string" - }, - "rna_concentration": { - "maximum": 1000, - "minimum": 0, - "type": "string" - }, - "rna_cont_type": { - "$ref": "#/$defs/RnaContTypeEnum", - "description": "Tube or plate (96-well)" - }, - "rna_cont_well": { - "pattern": "^(?!A1|A12|H1|H12)(([A-H][1-9])|([A-H]1[0-2]))$", - "type": "string" - }, - "rna_container_id": { - "type": "string" - }, - "rna_isolate_meth": { - "description": "Describe the method/protocol/kit used to extract DNA/RNA.", - "type": "string" - }, - "rna_organisms": { - "description": "List any organisms known or suspected to grow in co-culture, as well as estimated % of the organism in that culture.", - "type": "string" - }, - "rna_project_contact": { - "type": "string" - }, - "rna_samp_id": { - "type": "string" - }, - "rna_sample_format": { - "$ref": "#/$defs/RnaSampleFormatEnum", - "description": "Solution in which the RNA sample has been suspended" - }, - "rna_sample_name": { - "description": "Give the RNA sample a name that is meaningful to you. 
Sample names must be unique across all JGI projects and contain a-z, A-Z, 0-9, - and _ only.", - "maximum": 2000, - "minimum": 0, - "type": "string" - }, - "rna_seq_project": { - "type": "string" - }, - "rna_seq_project_name": { - "type": "string" - }, - "rna_seq_project_pi": { - "type": "string" - }, - "rna_volume": { - "type": "string" - }, - "salinity": { - "$ref": "#/$defs/QuantityValue", - "description": "The total concentration of all dissolved salts in a liquid or solid sample. While salinity can be measured by a complete chemical analysis, this method is difficult and time consuming. More often, it is instead derived from the conductivity measurement. This is known as practical salinity. These derivations compare the specific conductance of the sample to a salinity standard such as seawater." - }, - "salinity_category": { - "description": "Categorcial description of the sample's salinity. Examples: halophile, halotolerant, hypersaline, huryhaline", - "type": "string" - }, - "salinity_meth": { - "$ref": "#/$defs/TextValue", - "description": "Reference or method used in determining salinity" - }, - "samp_collec_method": { - "description": "The method employed for collecting the sample.", - "type": "string" - }, - "samp_mat_process": { - "$ref": "#/$defs/ControlledTermValue", - "description": "A brief description of any processing applied to the sample during or after retrieving the sample from environment, or a link to the relevant protocol(s) performed." - }, - "samp_name": { - "description": "A local identifier or name that for the material sample used for extracting nucleic acids, and subsequent sequencing. It can refer either to the original material collected or to any derived sub-samples. It can have any format, but we suggest that you make it concise, unique and consistent within your lab, and as informative as possible. INSDC requires every sample name from a single Submitter to be unique. 
Use of a globally unique identifier for the field source_mat_id is recommended in addition to sample_name.", - "type": "string" - }, - "samp_size": { - "$ref": "#/$defs/QuantityValue", - "description": "The total amount or size (volume (ml), mass (g) or area (m2) ) of sample collected." - }, - "samp_store_dur": { - "$ref": "#/$defs/TextValue", - "description": "Duration for which the sample was stored" - }, - "samp_store_loc": { - "$ref": "#/$defs/TextValue", - "description": "Location at which sample was stored, usually name of a specific freezer/room" - }, - "samp_store_temp": { - "$ref": "#/$defs/QuantityValue", - "description": "Temperature at which sample was stored, e.g. -80 degree Celsius" - }, - "samp_vol_we_dna_ext": { - "$ref": "#/$defs/QuantityValue", - "description": "Volume (ml) or mass (g) of total collected sample processed for DNA extraction. Note: total sample collected should be entered under the term Sample Size (mixs:0000001)." - }, - "sample_collection_site": { - "type": "string" - }, - "sample_link": { - "description": "JsonObj()", - "items": { - "type": "string" - }, - "type": "array" - }, - "sample_shipped": { - "description": "The total amount or size (volume (ml), mass (g) or area (m2) ) of sample sent to EMSL", - "type": "string" - }, - "sample_type": { - "$ref": "#/$defs/SampleTypeEnum", - "description": "Type of sample being submitted" - }, - "season_precpt": { - "$ref": "#/$defs/QuantityValue", - "description": "The average of all seasonal precipitation values known, or an estimated equivalent value derived by such methods as regional indexes or Isohyetal maps." 
- }, - "season_temp": { - "$ref": "#/$defs/QuantityValue", - "description": "Mean seasonal temperature" - }, - "sieving": { - "$ref": "#/$defs/QuantityValue", - "description": "Collection design of pooled samples and/or sieve size and amount of sample sieved" - }, - "size_frac_low": { - "$ref": "#/$defs/QuantityValue", - "description": "Refers to the mesh/pore size used to pre-filter/pre-sort the sample. Materials larger than the size threshold are excluded from the sample" - }, - "size_frac_up": { - "$ref": "#/$defs/QuantityValue", - "description": "Refers to the mesh/pore size used to retain the sample. Materials smaller than the size threshold are excluded from the sample" - }, - "slope_aspect": { - "$ref": "#/$defs/QuantityValue", - "description": "The direction a slope faces. While looking down a slope use a compass to record the direction you are facing (direction or degrees); e.g., nw or 315 degrees. This measure provides an indication of sun and wind exposure that will influence soil temperature and evapotranspiration." - }, - "slope_gradient": { - "$ref": "#/$defs/QuantityValue", - "description": "Commonly called 'slope'. The angle between ground surface and a horizontal line (in percent). This is the direction that overland water would flow. This measure is usually taken with a hand level meter or clinometer" - }, - "sodium": { - "$ref": "#/$defs/QuantityValue", - "description": "Sodium concentration in the sample" - }, - "soil_type": { - "$ref": "#/$defs/TextValue", - "description": "Description of the soil type or classification. This field accepts terms under soil (http://purl.obolibrary.org/obo/ENVO_00001998). Multiple terms can be separated by pipes." 
- }, - "soil_type_meth": { - "$ref": "#/$defs/TextValue", - "description": "Reference or method used in determining soil series name or other lower-level classification" - }, - "soluble_iron_micromol": { - "type": "string" - }, - "source_mat_id": { - "$ref": "#/$defs/TextValue", - "description": "A unique identifier assigned to a material sample (as defined by http://rs.tdwg.org/dwc/terms/materialSampleID, and as opposed to a particular digital record of a material sample) used for extracting nucleic acids, and subsequent sequencing. The identifier can refer either to the original material collected or to any derived sub-samples. The INSDC qualifiers /specimen_voucher, /bio_material, or /culture_collection may or may not share the same value as the source_mat_id field. For instance, the /specimen_voucher qualifier and source_mat_id may both contain 'UAM:Herps:14' , referring to both the specimen voucher and sampled tissue with the same identifier. However, the /culture_collection qualifier may refer to a value from an initial culture (e.g. ATCC:11775) while source_mat_id would refer to an identifier from some derived culture from which the nucleic acids were extracted (e.g. xatc123 or ark:/2154/R2)." - }, - "specific_ecosystem": { - "description": "Specific ecosystems represent specific features of the environment like aphotic zone in an ocean or gastric mucosa within a host digestive system. Specific ecosystem is in position 5/5 in a GOLD path.", - "type": "string" - }, - "start_date_inc": { - "description": "Date the incubation was started. Only relevant for incubation samples.", - "type": "string" - }, - "start_time_inc": { - "description": "Time the incubation was started. Only relevant for incubation samples.", - "type": "string" - }, - "store_cond": { - "$ref": "#/$defs/TextValue", - "description": "Explain how and for how long the soil sample was stored before DNA extraction (fresh/frozen/other)." 
- }, - "subsurface_depth": { - "$ref": "#/$defs/QuantityValue" - }, - "sulfate": { - "$ref": "#/$defs/QuantityValue", - "description": "Concentration of sulfate in the sample" - }, - "sulfide": { - "$ref": "#/$defs/QuantityValue", - "description": "Concentration of sulfide in the sample" - }, - "technical_reps": { - "description": "If sending multiple technical replicates of the same sample, indicate how many replicates are being sent", - "type": "string" - }, - "temp": { - "$ref": "#/$defs/QuantityValue", - "description": "Temperature of the sample at the time of sampling." - }, - "tidal_stage": { - "$ref": "#/$defs/TextValue", - "description": "Stage of tide" - }, - "tillage": { - "$ref": "#/$defs/TextValue", - "description": "Note method(s) used for tilling" - }, - "tot_carb": { - "$ref": "#/$defs/QuantityValue", - "description": "Total carbon content" - }, - "tot_depth_water_col": { - "$ref": "#/$defs/QuantityValue", - "description": "Measurement of total depth of water column" - }, - "tot_diss_nitro": { - "$ref": "#/$defs/QuantityValue", - "description": "Total dissolved nitrogen concentration, reported as nitrogen, measured by: total dissolved nitrogen = NH4 + NO3NO2 + dissolved organic nitrogen" - }, - "tot_nitro_cont_meth": { - "description": "Reference or method used in determining the total nitrogen", - "type": "string" - }, - "tot_nitro_content": { - "$ref": "#/$defs/QuantityValue", - "description": "Total nitrogen content of the sample" - }, - "tot_org_c_meth": { - "$ref": "#/$defs/TextValue", - "description": "Reference or method used in determining total organic carbon" - }, - "tot_org_carb": { - "$ref": "#/$defs/QuantityValue", - "description": "Definition for soil: total organic carbon content of the soil, definition otherwise: total organic carbon content" - }, - "tot_phosp": { - "$ref": "#/$defs/QuantityValue", - "description": "Total phosphorus concentration in the sample, calculated by: total phosphorus = total dissolved phosphorus + particulate 
phosphorus" - }, - "type": { - "description": "An optional string that specifies the type object. This is used to allow for searches for different kinds of objects.", - "type": "string" - }, - "water_cont_soil_meth": { - "description": "Reference or method used in determining the water content of soil", - "type": "string" - }, - "water_content": { - "$ref": "#/$defs/QuantityValue", - "description": "Water content measurement" - }, - "watering_regm": { - "$ref": "#/$defs/QuantityValue", - "description": "Information about treatment involving an exposure to watering frequencies, treatment regimen including how many times the treatment was repeated, how long each treatment lasted, and the start and end time of the entire treatment; can include multiple regimens" - }, - "zinc": { - "$ref": "#/$defs/QuantityValue", - "description": "Concentration of zinc in the sample" - } - }, - "required": [ - "part_of", - "id", - "env_broad_scale", - "env_local_scale", - "env_medium" - ], - "title": "Biosample", - "type": "object" - }, - "BiosampleCategoryEnum": { - "description": "Funding-based, sample location-based, or experimental method-based defined categories", - "enum": [ - "LTER", - "SIP", - "SFA", - "FICUS", - "NEON" - ], - "title": "BiosampleCategoryEnum", - "type": "string" - }, - "BiosampleProcessing": { - "additionalProperties": false, - "description": "A process that takes one or more biosamples as inputs and generates one or as outputs. 
Examples of outputs include samples cultivated from another sample or data objects created by instruments runs.", - "properties": { - "alternative_identifiers": { - "description": "A list of alternative identifiers for the entity.", - "items": { - "type": "string" - }, - "type": "array" - }, - "description": { - "description": "a human-readable description of a thing", - "type": "string" - }, - "has_input": { - "description": "An input to a process.", - "items": { - "type": "string" - }, - "type": "array" - }, - "id": { - "description": "A unique identifier for a thing. Must be either a CURIE shorthand for a URI or a complete URI", - "pattern": "^(nmdc):bsmprc-([0-9][a-z]{0,6}[0-9])-([A-Za-z0-9]{1,})(\\.[A-Za-z0-9]{1,})*(_[A-Za-z0-9_\\.-]+)?$", - "type": "string" - }, - "name": { - "description": "A human readable label for an entity", - "type": "string" - } - }, - "required": [ - "id" - ], - "title": "BiosampleProcessing", - "type": "object" - }, - "BioticRelationshipEnum": { - "description": "", - "enum": [ - "free living", - "parasite", - "commensal", - "symbiont" - ], - "title": "BioticRelationshipEnum", - "type": "string" - }, - "BooleanValue": { - "additionalProperties": false, - "description": "A value that is a boolean", - "properties": { - "has_boolean_value": { - "description": "Links a quantity value to a boolean", - "type": "boolean" - }, - "has_raw_value": { - "description": "The value that was specified for an annotation in raw form, i.e. a string. E.g. 
\"2 cm\" or \"2-4 cm\"", - "type": "string" - }, - "was_generated_by": { - "type": "string" - } - }, - "title": "BooleanValue", - "type": "object" - }, - "BuildDocsEnum": { - "description": "", - "enum": [ - "building information model", - "commissioning report", - "complaint logs", - "contract administration", - "cost estimate", - "janitorial schedules or logs", - "maintenance plans", - "schedule", - "sections", - "shop drawings", - "submittals", - "ventilation system", - "windows" - ], - "title": "BuildDocsEnum", - "type": "string" - }, - "BuildOccupTypeEnum": { - "description": "", - "enum": [ - "office", - "market", - "restaurant", - "residence", - "school", - "residential", - "commercial", - "low rise", - "high rise", - "wood framed", - "health care", - "airport", - "sports complex" - ], - "title": "BuildOccupTypeEnum", - "type": "string" - }, - "BuildingSettingEnum": { - "description": "", - "enum": [ - "urban", - "suburban", - "exurban", - "rural" - ], - "title": "BuildingSettingEnum", - "type": "string" - }, - "CeilCondEnum": { - "description": "", - "enum": [ - "new", - "visible wear", - "needs repair", - "damaged", - "rupture" - ], - "title": "CeilCondEnum", - "type": "string" - }, - "CeilFinishMatEnum": { - "description": "", - "enum": [ - "drywall", - "mineral fibre", - "tiles", - "PVC", - "plasterboard", - "metal", - "fiberglass", - "stucco", - "mineral wool/calcium silicate", - "wood" - ], - "title": "CeilFinishMatEnum", - "type": "string" - }, - "CeilTextureEnum": { - "description": "", - "enum": [ - "crows feet", - "crows-foot stomp", - "double skip", - "hawk and trowel", - "knockdown", - "popcorn", - "orange peel", - "rosebud stomp", - "Santa-Fe texture", - "skip trowel", - "smooth", - "stomp knockdown", - "swirl" - ], - "title": "CeilTextureEnum", - "type": "string" - }, - "CeilTypeEnum": { - "description": "", - "enum": [ - "cathedral", - "dropped", - "concave", - "barrel-shaped", - "coffered", - "cove", - "stretched" - ], - "title": 
"CeilTypeEnum", - "type": "string" - }, - "ChemicalEntity": { - "additionalProperties": false, - "description": "An atom or molecule that can be represented with a chemical formula. Include lipids, glycans, natural products, drugs. There may be different terms for distinct acid-base forms, protonation states", - "properties": { - "alternative_identifiers": { - "description": "A list of alternative identifiers for the entity.", - "items": { - "type": "string" - }, - "type": "array" - }, - "description": { - "description": "a human-readable description of a thing", - "type": "string" - }, - "id": { - "description": "A unique identifier for a thing. Must be either a CURIE shorthand for a URI or a complete URI", - "type": "string" - }, - "name": { - "description": "A human readable label for an entity", - "type": "string" - } - }, - "required": [ - "id" - ], - "title": "ChemicalEntity", - "type": "object" - }, - "CollectingBiosamplesFromSite": { - "additionalProperties": false, - "description": "", - "properties": { - "alternative_identifiers": { - "description": "A list of alternative identifiers for the entity.", - "items": { - "type": "string" - }, - "type": "array" - }, - "description": { - "description": "a human-readable description of a thing", - "type": "string" - }, - "has_inputs": { - "items": { - "type": "string" - }, - "type": "array" - }, - "has_outputs": { - "items": { - "type": "string" - }, - "type": "array" - }, - "id": { - "description": "A unique identifier for a thing. 
Must be either a CURIE shorthand for a URI or a complete URI", - "pattern": "^(nmdc):clsite-([0-9][a-z]{0,6}[0-9])-([A-Za-z0-9]{1,})(\\.[A-Za-z0-9]{1,})*(_[A-Za-z0-9_\\.-]+)?$", - "type": "string" - }, - "name": { - "description": "A human readable label for an entity", - "type": "string" - }, - "participating_agent": { - "$ref": "#/$defs/Agent" - } - }, - "required": [ - "has_inputs", - "has_outputs", - "id" - ], - "title": "CollectingBiosamplesFromSite", - "type": "object" - }, - "ContainerTypeEnum": { - "description": "", - "enum": [ - "screw_top_conical" - ], - "title": "ContainerTypeEnum", - "type": "string" - }, - "ControlledIdentifiedTermValue": { - "additionalProperties": false, - "description": "A controlled term or class from an ontology, requiring the presence of term with an id", - "properties": { - "has_raw_value": { - "description": "The value that was specified for an annotation in raw form, i.e. a string. E.g. \"2 cm\" or \"2-4 cm\"", - "type": "string" - }, - "term": { - "$ref": "#/$defs/OntologyClass", - "description": "pointer to an ontology class" - }, - "was_generated_by": { - "type": "string" - } - }, - "required": [ - "term" - ], - "title": "ControlledIdentifiedTermValue", - "type": "object" - }, - "ControlledTermValue": { - "additionalProperties": false, - "description": "A controlled term or class from an ontology", - "properties": { - "has_raw_value": { - "description": "The value that was specified for an annotation in raw form, i.e. a string. E.g. \"2 cm\" or \"2-4 cm\"", - "type": "string" - }, - "term": { - "$ref": "#/$defs/OntologyClass", - "description": "pointer to an ontology class" - }, - "was_generated_by": { - "type": "string" - } - }, - "title": "ControlledTermValue", - "type": "object" - }, - "CreditAssociation": { - "additionalProperties": false, - "description": "This class supports binding associated researchers to studies. 
There will be at least a slot for a CRediT Contributor Role (https://casrai.org/credit/) and for a person value Specifically see the associated researchers tab on the NMDC_SampleMetadata-V4_CommentsForUpdates at https://docs.google.com/spreadsheets/d/1INlBo5eoqn2efn4H2P2i8rwRBtnbDVTqXrochJEAPko/edit#gid=0", - "properties": { - "applied_role": { - "$ref": "#/$defs/CreditEnum" - }, - "applied_roles": { - "items": { - "$ref": "#/$defs/CreditEnum" - }, - "type": "array" - }, - "applies_to_person": { - "$ref": "#/$defs/PersonValue" - }, - "type": { - "description": "An optional string that specifies the type object. This is used to allow for searches for different kinds of objects.", - "type": "string" - } - }, - "required": [ - "applies_to_person", - "applied_roles" - ], - "title": "CreditAssociation", - "type": "object" - }, - "CreditEnum": { - "description": "", - "enum": [ - "Conceptualization", - "Data curation", - "Formal Analysis", - "Funding acquisition", - "Investigation", - "Methodology", - "Project administration", - "Resources", - "Software", - "Supervision", - "Validation", - "Visualization", - "Writing original draft", - "Writing review and editing", - "Principal Investigator", - "Submitter" - ], - "title": "CreditEnum", - "type": "string" - }, - "CurLandUseEnum": { - "description": "", - "enum": [ - "cities", - "farmstead", - "industrial areas", - "roads/railroads", - "rock", - "sand", - "gravel", - "mudflats", - "salt flats", - "badlands", - "permanent snow or ice", - "saline seeps", - "mines/quarries", - "oil waste areas", - "small grains", - "row crops", - "vegetable crops", - "horticultural plants (e.g. tulips)", - "marshlands (grass,sedges,rushes)", - "tundra (mosses,lichens)", - "rangeland", - "pastureland (grasslands used for livestock grazing)", - "hayland", - "meadows (grasses,alfalfa,fescue,bromegrass,timothy)", - "shrub land (e.g. 
mesquite,sage-brush,creosote bush,shrub oak,eucalyptus)", - "successional shrub land (tree saplings,hazels,sumacs,chokecherry,shrub dogwoods,blackberries)", - "shrub crops (blueberries,nursery ornamentals,filberts)", - "vine crops (grapes)", - "conifers (e.g. pine,spruce,fir,cypress)", - "hardwoods (e.g. oak,hickory,elm,aspen)", - "intermixed hardwood and conifers", - "tropical (e.g. mangrove,palms)", - "rainforest (evergreen forest receiving greater than 406 cm annual rainfall)", - "swamp (permanent or semi-permanent water body dominated by woody plants)", - "crop trees (nuts,fruit,christmas trees,nursery trees)" - ], - "title": "CurLandUseEnum", - "type": "string" - }, - "DataObject": { - "additionalProperties": false, - "description": "An object that primarily consists of symbols that represent information. Files, records, and omics data are examples of data objects.", - "properties": { - "alternative_identifiers": { - "description": "A list of alternative identifiers for the entity.", - "items": { - "type": "string" - }, - "type": "array" - }, - "compression_type": { - "description": "If provided, specifies the compression type", - "type": "string" - }, - "data_object_type": { - "$ref": "#/$defs/FileTypeEnum", - "description": "The type of file represented by the data object." - }, - "description": { - "description": "a human-readable description of a thing", - "type": "string" - }, - "file_size_bytes": { - "description": "Size of the file in bytes", - "type": "integer" - }, - "id": { - "description": "A unique identifier for a thing. 
Must be either a CURIE shorthand for a URI or a complete URI", - "pattern": "^(nmdc):dobj-([0-9][a-z]{0,6}[0-9])-([A-Za-z0-9]{1,})(\\.[A-Za-z0-9]{1,})*(_[A-Za-z0-9_\\.-]+)?$", - "type": "string" - }, - "md5_checksum": { - "description": "MD5 checksum of file (pre-compressed)", - "type": "string" - }, - "name": { - "description": "A human readable label for an entity", - "type": "string" - }, - "type": { - "description": "An optional string that specifies the type object. This is used to allow for searches for different kinds of objects.", - "type": "string" - }, - "url": { - "type": "string" - }, - "was_generated_by": { - "type": "string" - } - }, - "required": [ - "name", - "description" - ], - "title": "DataObject", - "type": "object" - }, - "Database": { - "additionalProperties": false, - "description": "An abstract holder for any set of metadata and data. It does not need to correspond to an actual managed database top level holder class. When translated to JSON-Schema this is the 'root' object. 
It should contain pointers to other objects of interest", - "properties": { - "activity_set": { - "description": "This property links a database object to the set of workflow activities.", - "items": { - "$ref": "#/$defs/WorkflowExecutionActivity" - }, - "type": "array" - }, - "biosample_set": { - "description": "This property links a database object to the set of samples within it.", - "items": { - "$ref": "#/$defs/Biosample" - }, - "type": "array" - }, - "collecting_biosamples_from_site_set": { - "items": { - "$ref": "#/$defs/CollectingBiosamplesFromSite" - }, - "type": "array" - }, - "data_object_set": { - "description": "This property links a database object to the set of data objects within it.", - "items": { - "$ref": "#/$defs/DataObject" - }, - "type": "array" - }, - "dissolving_activity_set": { - "items": { - "$ref": "#/$defs/DissolvingActivity" - }, - "type": "array" - }, - "field_research_site_set": { - "items": { - "$ref": "#/$defs/FieldResearchSite" - }, - "type": "array" - }, - "functional_annotation_set": { - "description": "This property links a database object to the set of all functional annotations", - "items": { - "$ref": "#/$defs/FunctionalAnnotation" - }, - "type": "array" - }, - "genome_feature_set": { - "description": "This property links a database object to the set of all features", - "items": { - "$ref": "#/$defs/GenomeFeature" - }, - "type": "array" - }, - "mags_activity_set": { - "description": "This property links a database object to the set of MAGs analysis activities.", - "items": { - "$ref": "#/$defs/MagsAnalysisActivity" - }, - "type": "array" - }, - "material_sample_set": { - "items": { - "$ref": "#/$defs/MaterialSample" - }, - "type": "array" - }, - "material_sampling_activity_set": { - "items": { - "$ref": "#/$defs/MaterialSamplingActivity" - }, - "type": "array" - }, - "metabolomics_analysis_activity_set": { - "description": "This property links a database object to the set of metabolomics analysis activities.", - "items": { - 
"$ref": "#/$defs/MetabolomicsAnalysisActivity" - }, - "type": "array" - }, - "metagenome_annotation_activity_set": { - "description": "This property links a database object to the set of metagenome annotation activities.", - "items": { - "$ref": "#/$defs/MetagenomeAnnotationActivity" - }, - "type": "array" - }, - "metagenome_assembly_set": { - "description": "This property links a database object to the set of metagenome assembly activities.", - "items": { - "$ref": "#/$defs/MetagenomeAssembly" - }, - "type": "array" - }, - "metaproteomics_analysis_activity_set": { - "description": "This property links a database object to the set of metaproteomics analysis activities.", - "items": { - "$ref": "#/$defs/MetaproteomicsAnalysisActivity" - }, - "type": "array" - }, - "metatranscriptome_activity_set": { - "description": "TODO", - "items": { - "$ref": "#/$defs/MetatranscriptomeActivity" - }, - "type": "array" - }, - "nom_analysis_activity_set": { - "description": "This property links a database object to the set of natural organic matter (NOM) analysis activities.", - "items": { - "$ref": "#/$defs/NomAnalysisActivity" - }, - "type": "array" - }, - "omics_processing_set": { - "description": "This property links a database object to the set of omics processings within it.", - "items": { - "$ref": "#/$defs/OmicsProcessing" - }, - "type": "array" - }, - "reaction_activity_set": { - "items": { - "$ref": "#/$defs/ReactionActivity" - }, - "type": "array" - }, - "read_based_taxonomy_analysis_activity_set": { - "description": "This property links a database object to the set of read based analysis activities.", - "items": { - "$ref": "#/$defs/ReadBasedTaxonomyAnalysisActivity" - }, - "type": "array" - }, - "read_qc_analysis_activity_set": { - "description": "This property links a database object to the set of read QC analysis activities.", - "items": { - "$ref": "#/$defs/ReadQcAnalysisActivity" - }, - "type": "array" - }, - "study_set": { - "description": "This property links a 
database object to the set of studies within it.", - "items": { - "$ref": "#/$defs/Study" - }, - "type": "array" - } - }, - "title": "Database", - "type": "object" - }, - "DeposEnvEnum": { - "description": "", - "enum": [ - "Continental - Alluvial", - "Continental - Aeolian", - "Continental - Fluvial", - "Continental - Lacustrine", - "Transitional - Deltaic", - "Transitional - Tidal", - "Transitional - Lagoonal", - "Transitional - Beach", - "Transitional - Lake", - "Marine - Shallow", - "Marine - Deep", - "Marine - Reef", - "Other - Evaporite", - "Other - Glacial", - "Other - Volcanic", - "other" - ], - "title": "DeposEnvEnum", - "type": "string" - }, - "DeviceTypeEnum": { - "description": "", - "enum": [ - "orbital_shaker", - "thermomixer" - ], - "title": "DeviceTypeEnum", - "type": "string" - }, - "DissolvingActivity": { - "additionalProperties": false, - "description": "", - "properties": { - "dissolution_aided_by": { - "$ref": "#/$defs/LabDevice" - }, - "dissolution_reagent": { - "$ref": "#/$defs/SolventEnum" - }, - "dissolution_volume": { - "$ref": "#/$defs/QuantityValue" - }, - "dissolved_in": { - "$ref": "#/$defs/MaterialContainer" - }, - "material_input": { - "type": "string" - }, - "material_output": { - "type": "string" - } - }, - "title": "DissolvingActivity", - "type": "object" - }, - "DnaContTypeEnum": { - "description": "", - "enum": [ - "plate", - "tube" - ], - "title": "DnaContTypeEnum", - "type": "string" - }, - "DnaDnaseEnum": { - "description": "", - "enum": [ - "no", - "yes" - ], - "title": "DnaDnaseEnum", - "type": "string" - }, - "DnaSampleFormatEnum": { - "description": "", - "enum": [ - "10 mM Tris-HCl", - "DNAStable", - "Ethanol", - "Low EDTA TE", - "MDA reaction buffer", - "PBS", - "Pellet", - "RNAStable", - "TE", - "Water", - "Gentegra-DNA", - "Gentegra-RNA" - ], - "title": "DnaSampleFormatEnum", - "type": "string" - }, - "DnaseRnaEnum": { - "description": "", - "enum": [ - "no", - "yes" - ], - "title": "DnaseRnaEnum", - "type": "string" 
- }, - "DoorCompTypeEnum": { - "description": "", - "enum": [ - "metal covered", - "revolving", - "sliding", - "telescopic" - ], - "title": "DoorCompTypeEnum", - "type": "string" - }, - "DoorCondEnum": { - "description": "", - "enum": [ - "damaged", - "needs repair", - "new", - "rupture", - "visible wear" - ], - "title": "DoorCondEnum", - "type": "string" - }, - "DoorDirectEnum": { - "description": "", - "enum": [ - "inward", - "outward", - "sideways" - ], - "title": "DoorDirectEnum", - "type": "string" - }, - "DoorLocEnum": { - "description": "", - "enum": [ - "north", - "south", - "east", - "west" - ], - "title": "DoorLocEnum", - "type": "string" - }, - "DoorMatEnum": { - "description": "", - "enum": [ - "aluminum", - "cellular PVC", - "engineered plastic", - "fiberboard", - "fiberglass", - "metal", - "thermoplastic alloy", - "vinyl", - "wood", - "wood/plastic composite" - ], - "title": "DoorMatEnum", - "type": "string" - }, - "DoorMoveEnum": { - "description": "", - "enum": [ - "collapsible", - "folding", - "revolving", - "rolling shutter", - "sliding", - "swinging" - ], - "title": "DoorMoveEnum", - "type": "string" - }, - "DoorTypeEnum": { - "description": "", - "enum": [ - "composite", - "metal", - "wooden" - ], - "title": "DoorTypeEnum", - "type": "string" - }, - "DoorTypeMetalEnum": { - "description": "", - "enum": [ - "collapsible", - "corrugated steel", - "hollow", - "rolling shutters", - "steel plate" - ], - "title": "DoorTypeMetalEnum", - "type": "string" - }, - "DoorTypeWoodEnum": { - "description": "", - "enum": [ - "bettened and ledged", - "battened", - "ledged and braced", - "ledged and framed", - "ledged, braced and frame", - "framed and paneled", - "glashed or sash", - "flush", - "louvered", - "wire gauged" - ], - "title": "DoorTypeWoodEnum", - "type": "string" - }, - "DrainageClassEnum": { - "description": "", - "enum": [ - "very poorly", - "poorly", - "somewhat poorly", - "moderately well", - "well", - "excessively drained" - ], - "title": 
"DrainageClassEnum", - "type": "string" - }, - "DrawingsEnum": { - "description": "", - "enum": [ - "operation", - "as built", - "construction", - "bid", - "design", - "building navigation map", - "diagram", - "sketch" - ], - "title": "DrawingsEnum", - "type": "string" - }, - "EnvironmentalMaterialTerm": { - "additionalProperties": false, - "description": "", - "properties": { - "alternative_identifiers": { - "description": "A list of alternative identifiers for the entity.", - "items": { - "type": "string" - }, - "type": "array" - }, - "description": { - "description": "a human-readable description of a thing", - "type": "string" - }, - "id": { - "description": "A unique identifier for a thing. Must be either a CURIE shorthand for a URI or a complete URI", - "type": "string" - }, - "name": { - "description": "A human readable label for an entity", - "type": "string" - } - }, - "required": [ - "id" - ], - "title": "EnvironmentalMaterialTerm", - "type": "object" - }, - "ExtWallOrientEnum": { - "description": "", - "enum": [ - "north", - "south", - "east", - "west", - "northeast", - "southeast", - "southwest", - "northwest" - ], - "title": "ExtWallOrientEnum", - "type": "string" - }, - "ExtWindowOrientEnum": { - "description": "", - "enum": [ - "north", - "south", - "east", - "west", - "northeast", - "southeast", - "southwest", - "northwest" - ], - "title": "ExtWindowOrientEnum", - "type": "string" - }, - "FaoClassEnum": { - "description": "", - "enum": [ - "Acrisols", - "Andosols", - "Arenosols", - "Cambisols", - "Chernozems", - "Ferralsols", - "Fluvisols", - "Gleysols", - "Greyzems", - "Gypsisols", - "Histosols", - "Kastanozems", - "Lithosols", - "Luvisols", - "Nitosols", - "Phaeozems", - "Planosols", - "Podzols", - "Podzoluvisols", - "Rankers", - "Regosols", - "Rendzinas", - "Solonchaks", - "Solonetz", - "Vertisols", - "Yermosols" - ], - "title": "FaoClassEnum", - "type": "string" - }, - "FieldResearchSite": { - "additionalProperties": false, - "description": "A 
site, outside of a laboratory, from which biosamples may be collected.", - "properties": { - "alternative_identifiers": { - "description": "A list of alternative identifiers for the entity.", - "items": { - "type": "string" - }, - "type": "array" - }, - "description": { - "description": "a human-readable description of a thing", - "type": "string" - }, - "id": { - "description": "A unique identifier for a thing. Must be either a CURIE shorthand for a URI or a complete URI", - "pattern": "^(nmdc):frsite-([0-9][a-z]{0,6}[0-9])-([A-Za-z0-9]{1,})(\\.[A-Za-z0-9]{1,})*(_[A-Za-z0-9_\\.-]+)?$", - "type": "string" - }, - "name": { - "description": "A human readable label for an entity", - "type": "string" - } - }, - "required": [ - "id" - ], - "title": "FieldResearchSite", - "type": "object" - }, - "FileTypeEnum": { - "description": "", - "enum": [ - "Metagenome Raw Reads", - "FT ICR-MS Analysis Results", - "GC-MS Metabolomics Results", - "Metaproteomics Workflow Statistics", - "Protein Report", - "Peptide Report", - "Unfiltered Metaproteomics Results", - "Read Count and RPKM", - "QC non-rRNA R2", - "QC non-rRNA R1", - "Metagenome Bins", - "CheckM Statistics", - "GOTTCHA2 Krona Plot", - "GOTTCHA2 Classification Report", - "GOTTCHA2 Report Full", - "Kraken2 Krona Plot", - "Centrifuge Krona Plot", - "Centrifuge output report file", - "Kraken2 Classification Report", - "Kraken2 Taxonomic Classification", - "Centrifuge Classification Report", - "Centrifuge Taxonomic Classification", - "Structural Annotation GFF", - "Functional Annotation GFF", - "Annotation Amino Acid FASTA", - "Annotation Enzyme Commission", - "Annotation KEGG Orthology", - "Assembly Coverage BAM", - "Assembly AGP", - "Assembly Scaffolds", - "Assembly Contigs", - "Assembly Coverage Stats", - "Filtered Sequencing Reads", - "QC Statistics", - "TIGRFam Annotation GFF", - "CRT Annotation GFF", - "Genmark Annotation GFF", - "Prodigal Annotation GFF", - "TRNA Annotation GFF", - "Misc Annotation GFF", - "RFAM 
Annotation GFF", - "TMRNA Annotation GFF", - "KO_EC Annotation GFF", - "Product Names", - "Gene Phylogeny tsv", - "Crisprt Terms", - "Clusters of Orthologous Groups (COG) Annotation GFF", - "CATH FunFams (Functional Families) Annotation GFF", - "SUPERFam Annotation GFF", - "SMART Annotation GFF", - "Pfam Annotation GFF", - "Direct Infusion FT ICR-MS Raw Data" - ], - "title": "FileTypeEnum", - "type": "string" - }, - "FilterTypeEnum": { - "description": "", - "enum": [ - "particulate air filter", - "chemical air filter", - "low-MERV pleated media", - "HEPA", - "electrostatic", - "gas-phase or ultraviolet air treatments" - ], - "title": "FilterTypeEnum", - "type": "string" - }, - "FloorCondEnum": { - "description": "", - "enum": [ - "new", - "visible wear", - "needs repair", - "damaged", - "rupture" - ], - "title": "FloorCondEnum", - "type": "string" - }, - "FloorFinishMatEnum": { - "description": "", - "enum": [ - "tile", - "wood strip or parquet", - "carpet", - "rug", - "laminate wood", - "lineoleum", - "vinyl composition tile", - "sheet vinyl", - "stone", - "bamboo", - "cork", - "terrazo", - "concrete", - "none", - "sealed", - "clear finish", - "paint", - "none or unfinished" - ], - "title": "FloorFinishMatEnum", - "type": "string" - }, - "FloorStrucEnum": { - "description": "", - "enum": [ - "balcony", - "floating floor", - "glass floor", - "raised floor", - "sprung floor", - "wood-framed", - "concrete" - ], - "title": "FloorStrucEnum", - "type": "string" - }, - "FloorWaterMoldEnum": { - "description": "", - "enum": [ - "mold odor", - "wet floor", - "water stains", - "wall discoloration", - "floor discoloration", - "ceiling discoloration", - "peeling paint or wallpaper", - "bulging walls", - "condensation" - ], - "title": "FloorWaterMoldEnum", - "type": "string" - }, - "FreqCleanEnum": { - "description": "", - "enum": [ - "Daily", - "Weekly", - "Monthly", - "Quarterly", - "Annually", - "other" - ], - "title": "FreqCleanEnum", - "type": "string" - }, - 
"FunctionalAnnotation": { - "additionalProperties": false, - "description": "An assignment of a function term (e.g. reaction or pathway) that is executed by a gene product, or which the gene product plays an active role in. Functional annotations can be assigned manually by curators, or automatically in workflows. In the context of NMDC, all function annotation is performed automatically, typically using HMM or Blast type methods", - "properties": { - "has_function": { - "pattern": "^(KEGG_PATHWAY:\\w{2,4}\\d{5}|KEGG.REACTION:R\\d+|RHEA:\\d{5}|MetaCyc:[A-Za-z0-9+_.%-:]+|EC:\\d{1,2}(\\.\\d{0,3}){0,3}|GO:\\d{7}|MetaNetX:(MNXR\\d+|EMPTY)|SEED:\\w+|KEGG\\.ORTHOLOGY:K\\d+|EGGNOG:\\w+|PFAM:PF\\d{5}|TIGRFAM:TIGR\\d+|SUPFAM:\\w+|CATH:[1-6]\\.[0-9]+\\.[0-9]+\\.[0-9]+|PANTHER.FAMILY:PTHR\\d{5}(\\:SF\\d{1,3})?)$", - "type": "string" - }, - "subject": { - "type": "string" - }, - "was_generated_by": { - "description": "provenance for the annotation.", - "type": "string" - } - }, - "title": "FunctionalAnnotation", - "type": "object" - }, - "FurnitureEnum": { - "description": "", - "enum": [ - "cabinet", - "chair", - "desks" - ], - "title": "FurnitureEnum", - "type": "string" - }, - "GenderRestroomEnum": { - "description": "", - "enum": [ - "all gender", - "female", - "gender neurtral", - "male", - "male and female", - "unisex" - ], - "title": "GenderRestroomEnum", - "type": "string" - }, - "GeneProduct": { - "additionalProperties": false, - "description": "A molecule encoded by a gene that has an evolved function", - "properties": { - "alternative_identifiers": { - "description": "A list of alternative identifiers for the entity.", - "items": { - "type": "string" - }, - "type": "array" - }, - "description": { - "description": "a human-readable description of a thing", - "type": "string" - }, - "id": { - "description": "A unique identifier for a thing. 
Must be either a CURIE shorthand for a URI or a complete URI", - "type": "string" - }, - "name": { - "description": "A human readable label for an entity", - "type": "string" - } - }, - "required": [ - "id" - ], - "title": "GeneProduct", - "type": "object" - }, - "GenomeFeature": { - "additionalProperties": false, - "description": "A feature localized to an interval along a genome", - "title": "GenomeFeature", - "type": "object" - }, - "GeolocationValue": { - "additionalProperties": false, - "description": "A normalized value for a location on the earth's surface", - "properties": { - "has_raw_value": { - "description": "The raw value for a geolocation should follow {lat} {long}", - "type": "string" - }, - "latitude": { - "description": "latitude", - "type": "number" - }, - "longitude": { - "description": "longitude", - "type": "number" - }, - "was_generated_by": { - "type": "string" - } - }, - "title": "GeolocationValue", - "type": "object" - }, - "GrowthHabitEnum": { - "description": "", - "enum": [ - "erect", - "semi-erect", - "spreading", - "prostrate" - ], - "title": "GrowthHabitEnum", - "type": "string" - }, - "HandidnessEnum": { - "description": "", - "enum": [ - "ambidexterity", - "left handedness", - "mixed-handedness", - "right handedness" - ], - "title": "HandidnessEnum", - "type": "string" - }, - "HcProducedEnum": { - "description": "", - "enum": [ - "Oil", - "Gas-Condensate", - "Gas", - "Bitumen", - "Coalbed Methane", - "other" - ], - "title": "HcProducedEnum", - "type": "string" - }, - "HcrEnum": { - "description": "", - "enum": [ - "Oil Reservoir", - "Gas Reservoir", - "Oil Sand", - "Coalbed", - "Shale", - "Tight Oil Reservoir", - "Tight Gas Reservoir", - "other" - ], - "title": "HcrEnum", - "type": "string" - }, - "HcrGeolAgeEnum": { - "description": "", - "enum": [ - "Archean", - "Cambrian", - "Carboniferous", - "Cenozoic", - "Cretaceous", - "Devonian", - "Jurassic", - "Mesozoic", - "Neogene", - "Ordovician", - "Paleogene", - "Paleozoic", - 
"Permian", - "Precambrian", - "Proterozoic", - "Silurian", - "Triassic", - "other" - ], - "title": "HcrGeolAgeEnum", - "type": "string" - }, - "HeatCoolTypeEnum": { - "description": "", - "enum": [ - "radiant system", - "heat pump", - "forced air system", - "steam forced heat", - "wood stove" - ], - "title": "HeatCoolTypeEnum", - "type": "string" - }, - "HeatDelivLocEnum": { - "description": "", - "enum": [ - "north", - "south", - "east", - "west" - ], - "title": "HeatDelivLocEnum", - "type": "string" - }, - "HorizonEnum": { - "description": "", - "enum": [ - "O horizon", - "A horizon", - "E horizon", - "B horizon", - "C horizon", - "R layer", - "Permafrost" - ], - "title": "HorizonEnum", - "type": "string" - }, - "HostSexEnum": { - "description": "", - "enum": [ - "female", - "hermaphrodite", - "male", - "neuter" - ], - "title": "HostSexEnum", - "type": "string" - }, - "ImageValue": { - "additionalProperties": false, - "description": "An attribute value representing an image.", - "properties": { - "description": { - "description": "a human-readable description of a thing", - "type": "string" - }, - "display_order": { - "description": "When rendering information, this attribute to specify the order in which the information should be rendered.", - "type": "string" - }, - "has_raw_value": { - "description": "The value that was specified for an annotation in raw form, i.e. a string. E.g. 
\"2 cm\" or \"2-4 cm\"", - "type": "string" - }, - "url": { - "type": "string" - }, - "was_generated_by": { - "type": "string" - } - }, - "title": "ImageValue", - "type": "object" - }, - "IndoorSpaceEnum": { - "description": "", - "enum": [ - "bedroom", - "office", - "bathroom", - "foyer", - "kitchen", - "locker room", - "hallway", - "elevator" - ], - "title": "IndoorSpaceEnum", - "type": "string" - }, - "IndoorSurfEnum": { - "description": "", - "enum": [ - "cabinet", - "ceiling", - "counter top", - "door", - "shelving", - "vent cover", - "window", - "wall" - ], - "title": "IndoorSurfEnum", - "type": "string" - }, - "Instrument": { - "additionalProperties": false, - "description": "A material entity that is designed to perform a function in a scientific investigation, but is not a reagent[OBI].", - "properties": { - "alternative_identifiers": { - "description": "A list of alternative identifiers for the entity.", - "items": { - "type": "string" - }, - "type": "array" - }, - "description": { - "description": "a human-readable description of a thing", - "type": "string" - }, - "id": { - "description": "A unique identifier for a thing. 
Must be either a CURIE shorthand for a URI or a complete URI", - "pattern": "^(nmdc):inst-([0-9][a-z]{0,6}[0-9])-([A-Za-z0-9]{1,})(\\.[A-Za-z0-9]{1,})*(_[A-Za-z0-9_\\.-]+)?$", - "type": "string" - }, - "name": { - "description": "A human readable label for an entity", - "type": "string" - } - }, - "required": [ - "id" - ], - "title": "Instrument", - "type": "object" - }, - "IntWallCondEnum": { - "description": "", - "enum": [ - "new", - "visible wear", - "needs repair", - "damaged", - "rupture" - ], - "title": "IntWallCondEnum", - "type": "string" - }, - "IntegerValue": { - "additionalProperties": false, - "description": "A value that is an integer", - "properties": { - "has_numeric_value": { - "description": "Links a quantity value to a number", - "type": "number" - }, - "has_raw_value": { - "description": "The value that was specified for an annotation in raw form, i.e. a string. E.g. \"2 cm\" or \"2-4 cm\"", - "type": "string" - }, - "was_generated_by": { - "type": "string" - } - }, - "title": "IntegerValue", - "type": "object" - }, - "LabDevice": { - "additionalProperties": false, - "description": "", - "properties": { - "activity_speed": { - "$ref": "#/$defs/QuantityValue" - }, - "activity_temperature": { - "$ref": "#/$defs/QuantityValue" - }, - "activity_time": { - "$ref": "#/$defs/QuantityValue" - }, - "device_type": { - "$ref": "#/$defs/DeviceTypeEnum" - } - }, - "title": "LabDevice", - "type": "object" - }, - "LightTypeEnum": { - "description": "", - "enum": [ - "natural light", - "electric light", - "desk lamp", - "flourescent lights", - "none" - ], - "title": "LightTypeEnum", - "type": "string" - }, - "LithologyEnum": { - "description": "", - "enum": [ - "Basement", - "Chalk", - "Chert", - "Coal", - "Conglomerate", - "Diatomite", - "Dolomite", - "Limestone", - "Sandstone", - "Shale", - "Siltstone", - "Volcanic", - "other" - ], - "title": "LithologyEnum", - "type": "string" - }, - "MagBin": { - "additionalProperties": false, - "description": "", - 
"properties": { - "bin_name": { - "type": "string" - }, - "bin_quality": { - "type": "string" - }, - "completeness": { - "type": "number" - }, - "contamination": { - "type": "number" - }, - "gene_count": { - "type": "integer" - }, - "gtdbtk_class": { - "type": "string" - }, - "gtdbtk_domain": { - "type": "string" - }, - "gtdbtk_family": { - "type": "string" - }, - "gtdbtk_genus": { - "type": "string" - }, - "gtdbtk_order": { - "type": "string" - }, - "gtdbtk_phylum": { - "type": "string" - }, - "gtdbtk_species": { - "type": "string" - }, - "num_16s": { - "type": "integer" - }, - "num_23s": { - "type": "integer" - }, - "num_5s": { - "type": "integer" - }, - "num_t_rna": { - "type": "integer" - }, - "number_of_contig": { - "type": "integer" - }, - "type": { - "description": "An optional string that specifies the type object. This is used to allow for searches for different kinds of objects.", - "type": "string" - } - }, - "title": "MagBin", - "type": "object" - }, - "MagsAnalysisActivity": { - "additionalProperties": false, - "description": "A workflow execution activity that uses computational binning tools to group assembled contigs into genomes", - "properties": { - "binned_contig_num": { - "type": "integer" - }, - "ended_at_time": { - "format": "date-time", - "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$", - "type": "string" - }, - "execution_resource": { - "description": "Example: NERSC-Cori", - "type": "string" - }, - "git_url": { - "description": "Example: https://github.com/microbiomedata/mg_annotation/releases/tag/0.1", - "type": "string" - }, - "has_input": { - "description": "An input to a process.", - "items": { - "type": "string" - }, - "type": "array" - }, - "has_output": { - 
"description": "An output biosample to a processing step", - "items": { - "type": "string" - }, - "type": "array" - }, - "id": { - "description": "A unique identifier for a thing. Must be either a CURIE shorthand for a URI or a complete URI", - "pattern": "^(nmdc):wfmag-([0-9][a-z]{0,6}[0-9])-([A-Za-z0-9]{1,})(\\.[A-Za-z0-9]{1,})*(_[A-Za-z0-9_\\.-]+)?$", - "type": "string" - }, - "input_contig_num": { - "type": "integer" - }, - "low_depth_contig_num": { - "type": "integer" - }, - "mags_list": { - "items": { - "$ref": "#/$defs/MagBin" - }, - "type": "array" - }, - "name": { - "description": "A human readable label for an entity", - "type": "string" - }, - "part_of": { - "description": "Links a resource to another resource that either logically or physically includes it.", - "items": { - "type": "string" - }, - "type": "array" - }, - "started_at_time": { - "format": "date-time", - "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$", - "type": "string" - }, - "too_short_contig_num": { - "type": "integer" - }, - "type": { - "description": "An optional string that specifies the type object. 
This is used to allow for searches for different kinds of objects.", - "type": "string" - }, - "unbinned_contig_num": { - "type": "integer" - }, - "used": { - "type": "string" - }, - "was_associated_with": { - "description": "the agent/entity associated with the generation of the file", - "type": "string" - }, - "was_informed_by": { - "type": "string" - } - }, - "required": [ - "execution_resource", - "git_url", - "has_input", - "has_output", - "started_at_time", - "ended_at_time", - "was_informed_by" - ], - "title": "MagsAnalysisActivity", - "type": "object" - }, - "MaterialContainer": { - "additionalProperties": false, - "description": "", - "properties": { - "container_size": { - "$ref": "#/$defs/QuantityValue" - }, - "container_type": { - "$ref": "#/$defs/ContainerTypeEnum" - } - }, - "title": "MaterialContainer", - "type": "object" - }, - "MaterialSample": { - "additionalProperties": false, - "description": "", - "properties": { - "alternative_identifiers": { - "description": "A list of alternative identifiers for the entity.", - "items": { - "type": "string" - }, - "type": "array" - }, - "description": { - "description": "a human-readable description of a thing", - "type": "string" - }, - "id": { - "description": "A unique identifier for a thing. 
Must be either a CURIE shorthand for a URI or a complete URI", - "pattern": "^(nmdc):matsm-([0-9][a-z]{0,6}[0-9])-([A-Za-z0-9]{1,})(\\.[A-Za-z0-9]{1,})*(_[A-Za-z0-9_\\.-]+)?$", - "type": "string" - }, - "name": { - "description": "A human readable label for an entity", - "type": "string" - } - }, - "title": "MaterialSample", - "type": "object" - }, - "MaterialSamplingActivity": { - "additionalProperties": false, - "description": "", - "properties": { - "amount_collected": { - "$ref": "#/$defs/QuantityValue" - }, - "biosample_input": { - "type": "string" - }, - "collected_into": { - "$ref": "#/$defs/MaterialContainer" - }, - "material_output": { - "type": "string" - }, - "sampling_method": { - "$ref": "#/$defs/SamplingMethodEnum" - } - }, - "title": "MaterialSamplingActivity", - "type": "object" - }, - "MechStrucEnum": { - "description": "", - "enum": [ - "subway", - "coach", - "carriage", - "elevator", - "escalator", - "boat", - "train", - "car", - "bus" - ], - "title": "MechStrucEnum", - "type": "string" - }, - "MetaboliteQuantification": { - "additionalProperties": false, - "description": "This is used to link a metabolomics analysis workflow to a specific metabolite", - "properties": { - "alternative_identifiers": { - "description": "A list of alternative identifiers for the entity.", - "items": { - "type": "string" - }, - "type": "array" - } - }, - "title": "MetaboliteQuantification", - "type": "object" - }, - "MetabolomicsAnalysisActivity": { - "additionalProperties": false, - "description": "", - "properties": { - "ended_at_time": { - "format": "date-time", - "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$", - "type": "string" - }, - "execution_resource": { - "description": "Example: 
NERSC-Cori", - "type": "string" - }, - "git_url": { - "description": "Example: https://github.com/microbiomedata/mg_annotation/releases/tag/0.1", - "type": "string" - }, - "has_input": { - "description": "An input to a process.", - "items": { - "type": "string" - }, - "type": "array" - }, - "has_output": { - "description": "An output biosample to a processing step", - "items": { - "type": "string" - }, - "type": "array" - }, - "id": { - "description": "A unique identifier for a thing. Must be either a CURIE shorthand for a URI or a complete URI", - "pattern": "^(nmdc):wfmb-([0-9][a-z]{0,6}[0-9])-([A-Za-z0-9]{1,})(\\.[A-Za-z0-9]{1,})*(_[A-Za-z0-9_\\.-]+)?$", - "type": "string" - }, - "name": { - "description": "A human readable label for an entity", - "type": "string" - }, - "part_of": { - "description": "Links a resource to another resource that either logically or physically includes it.", - "items": { - "type": "string" - }, - "type": "array" - }, - "started_at_time": { - "format": "date-time", - "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$", - "type": "string" - }, - "type": { - "description": "An optional string that specifies the type object. 
This is used to allow for searches for different kinds of objects.", - "type": "string" - }, - "used": { - "description": "The instrument used to collect the data used in the analysis", - "type": "string" - }, - "was_associated_with": { - "description": "the agent/entity associated with the generation of the file", - "type": "string" - }, - "was_informed_by": { - "type": "string" - } - }, - "required": [ - "execution_resource", - "git_url", - "has_input", - "has_output", - "started_at_time", - "ended_at_time", - "was_informed_by" - ], - "title": "MetabolomicsAnalysisActivity", - "type": "object" - }, - "MetagenomeAnnotationActivity": { - "additionalProperties": false, - "description": "A workflow execution activity that provides functional and structural annotation of assembled metagenome contigs", - "properties": { - "ended_at_time": { - "format": "date-time", - "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$", - "type": "string" - }, - "execution_resource": { - "description": "Example: NERSC-Cori", - "type": "string" - }, - "git_url": { - "description": "Example: https://github.com/microbiomedata/mg_annotation/releases/tag/0.1", - "type": "string" - }, - "gold_analysis_project_identifiers": { - "description": "identifiers for corresponding analysis project in GOLD", - "items": { - "type": "string" - }, - "pattern": "^GOLD:Ga[0-9]+$", - "type": "array" - }, - "has_input": { - "description": "An input to a process.", - "items": { - "type": "string" - }, - "type": "array" - }, - "has_output": { - "description": "An output biosample to a processing step", - "items": { - "type": "string" - }, - "type": "array" - }, - "id": { - "description": "A unique identifier for a thing. 
Must be either a CURIE shorthand for a URI or a complete URI", - "pattern": "^(nmdc):wfmgan-([0-9][a-z]{0,6}[0-9])-([A-Za-z0-9]{1,})(\\.[A-Za-z0-9]{1,})*(_[A-Za-z0-9_\\.-]+)?$", - "type": "string" - }, - "name": { - "description": "A human readable label for an entity", - "type": "string" - }, - "part_of": { - "description": "Links a resource to another resource that either logically or physically includes it.", - "items": { - "type": "string" - }, - "type": "array" - }, - "started_at_time": { - "format": "date-time", - "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$", - "type": "string" - }, - "type": { - "description": "An optional string that specifies the type object. This is used to allow for searches for different kinds of objects.", - "type": "string" - }, - "used": { - "type": "string" - }, - "was_associated_with": { - "description": "the agent/entity associated with the generation of the file", - "type": "string" - }, - "was_informed_by": { - "type": "string" - } - }, - "required": [ - "execution_resource", - "git_url", - "has_input", - "has_output", - "started_at_time", - "ended_at_time", - "was_informed_by" - ], - "title": "MetagenomeAnnotationActivity", - "type": "object" - }, - "MetagenomeAssembly": { - "additionalProperties": false, - "description": "A workflow execution activity that converts sequencing reads into an assembled metagenome.", - "properties": { - "asm_score": { - "description": "A score for comparing metagenomic assembly quality from same sample.", - "type": "number" - }, - "contig_bp": { - "description": "Total size in bp of all contigs.", - "type": "number" - }, - "contigs": { - "description": "The sum of the (length*log(length)) of all contigs, times some constant. 
Increase the contiguity, the score will increase", - "type": "number" - }, - "ctg_l50": { - "description": "Given a set of contigs, the L50 is defined as the sequence length of the shortest contig at 50% of the total genome length.", - "type": "number" - }, - "ctg_l90": { - "description": "The L90 statistic is less than or equal to the L50 statistic; it is the length for which the collection of all contigs of that length or longer contains at least 90% of the sum of the lengths of all contigs.", - "type": "number" - }, - "ctg_logsum": { - "description": "Maximum contig length.", - "type": "number" - }, - "ctg_max": { - "description": "Maximum contig length.", - "type": "number" - }, - "ctg_n50": { - "description": "Given a set of contigs, each with its own length, the N50 count is defined as the smallest number_of_contigs whose length sum makes up half of genome size.", - "type": "number" - }, - "ctg_n90": { - "description": "Given a set of contigs, each with its own length, the N90 count is defined as the smallest number of contigs whose length sum makes up 90% of genome size.", - "type": "number" - }, - "ctg_powsum": { - "description": "Powersum of all contigs is the same as logsum except that it uses the sum of (length*(length^P)) for some power P (default P=0.25).", - "type": "number" - }, - "ended_at_time": { - "format": "date-time", - "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$", - "type": "string" - }, - "execution_resource": { - "description": "Example: NERSC-Cori", - "type": "string" - }, - "gap_pct": { - "description": "The gap size percentage of all scaffolds.", - "type": "number" - }, - "gc_avg": { - "description": "Average of GC content of all contigs.", - "type": "number" - }, - 
"gc_std": { - "description": "Standard deviation of GC content of all contigs.", - "type": "number" - }, - "git_url": { - "description": "Example: https://github.com/microbiomedata/mg_annotation/releases/tag/0.1", - "type": "string" - }, - "has_input": { - "description": "An input to a process.", - "items": { - "type": "string" - }, - "type": "array" - }, - "has_output": { - "description": "An output biosample to a processing step", - "items": { - "type": "string" - }, - "type": "array" - }, - "id": { - "description": "A unique identifier for a thing. Must be either a CURIE shorthand for a URI or a complete URI", - "pattern": "^(nmdc):wfmgas-([0-9][a-z]{0,6}[0-9])-([A-Za-z0-9]{1,})(\\.[A-Za-z0-9]{1,})*(_[A-Za-z0-9_\\.-]+)?$", - "type": "string" - }, - "insdc_assembly_identifiers": { - "pattern": "^insdc.sra:[A-Z]+[0-9]+(\\.[0-9]+)?$", - "type": "string" - }, - "name": { - "description": "A human readable label for an entity", - "type": "string" - }, - "num_aligned_reads": { - "description": "The sequence count number of input reads aligned to assembled contigs.", - "type": "number" - }, - "num_input_reads": { - "description": "The sequence count number of input reads for assembly.", - "type": "number" - }, - "part_of": { - "description": "Links a resource to another resource that either logically or physically includes it.", - "items": { - "type": "string" - }, - "type": "array" - }, - "scaf_bp": { - "description": "Total size in bp of all scaffolds.", - "type": "number" - }, - "scaf_l50": { - "description": "Given a set of scaffolds, the L50 is defined as the sequence length of the shortest scaffold at 50% of the total genome length.", - "type": "number" - }, - "scaf_l90": { - "description": "The L90 statistic is less than or equal to the L50 statistic; it is the length for which the collection of all scaffolds of that length or longer contains at least 90% of the sum of the lengths of all scaffolds.", - "type": "number" - }, - "scaf_l_gt50k": { - "description": 
"Total size in bp of all scaffolds greater than 50 KB.", - "type": "number" - }, - "scaf_logsum": { - "description": "The sum of the (length*log(length)) of all scaffolds, times some constant. Increase the contiguity, the score will increase", - "type": "number" - }, - "scaf_max": { - "description": "Maximum scaffold length.", - "type": "number" - }, - "scaf_n50": { - "description": "Given a set of scaffolds, each with its own length, the N50 count is defined as the smallest number of scaffolds whose length sum makes up half of genome size.", - "type": "number" - }, - "scaf_n90": { - "description": "Given a set of scaffolds, each with its own length, the N90 count is defined as the smallest number of scaffolds whose length sum makes up 90% of genome size.", - "type": "number" - }, - "scaf_n_gt50k": { - "description": "Total sequence count of scaffolds greater than 50 KB.", - "type": "number" - }, - "scaf_pct_gt50k": { - "description": "Total sequence size percentage of scaffolds greater than 50 KB.", - "type": "number" - }, - "scaf_powsum": { - "description": "Powersum of all scaffolds is the same as logsum except that it uses the sum of (length*(length^P)) for some power P (default P=0.25).", - "type": "number" - }, - "scaffolds": { - "description": "Total sequence count of all scaffolds.", - "type": "number" - }, - "started_at_time": { - "format": "date-time", - "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$", - "type": "string" - }, - "type": { - "description": "An optional string that specifies the type object. 
This is used to allow for searches for different kinds of objects.", - "type": "string" - }, - "used": { - "type": "string" - }, - "was_associated_with": { - "description": "the agent/entity associated with the generation of the file", - "type": "string" - }, - "was_informed_by": { - "type": "string" - } - }, - "required": [ - "execution_resource", - "git_url", - "has_input", - "has_output", - "started_at_time", - "ended_at_time", - "was_informed_by" - ], - "title": "MetagenomeAssembly", - "type": "object" - }, - "MetaproteomicsAnalysisActivity": { - "additionalProperties": false, - "description": "", - "properties": { - "ended_at_time": { - "format": "date-time", - "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$", - "type": "string" - }, - "execution_resource": { - "description": "Example: NERSC-Cori", - "type": "string" - }, - "git_url": { - "description": "Example: https://github.com/microbiomedata/mg_annotation/releases/tag/0.1", - "type": "string" - }, - "has_input": { - "description": "An input to a process.", - "items": { - "type": "string" - }, - "type": "array" - }, - "has_output": { - "description": "An output biosample to a processing step", - "items": { - "type": "string" - }, - "type": "array" - }, - "id": { - "description": "A unique identifier for a thing. 
Must be either a CURIE shorthand for a URI or a complete URI", - "pattern": "^(nmdc):wfmp-([0-9][a-z]{0,6}[0-9])-([A-Za-z0-9]{1,})(\\.[A-Za-z0-9]{1,})*(_[A-Za-z0-9_\\.-]+)?$", - "type": "string" - }, - "name": { - "description": "A human readable label for an entity", - "type": "string" - }, - "part_of": { - "description": "Links a resource to another resource that either logically or physically includes it.", - "items": { - "type": "string" - }, - "type": "array" - }, - "started_at_time": { - "format": "date-time", - "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$", - "type": "string" - }, - "type": { - "description": "An optional string that specifies the type object. This is used to allow for searches for different kinds of objects.", - "type": "string" - }, - "used": { - "description": "The instrument used to collect the data used in the analysis", - "type": "string" - }, - "was_associated_with": { - "description": "the agent/entity associated with the generation of the file", - "type": "string" - }, - "was_informed_by": { - "type": "string" - } - }, - "required": [ - "execution_resource", - "git_url", - "has_input", - "has_output", - "started_at_time", - "ended_at_time", - "was_informed_by" - ], - "title": "MetaproteomicsAnalysisActivity", - "type": "object" - }, - "MetatranscriptomeActivity": { - "additionalProperties": false, - "description": "A metatranscriptome activity that e.g. 
pools assembly and annotation activity.", - "properties": { - "ended_at_time": { - "format": "date-time", - "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$", - "type": "string" - }, - "execution_resource": { - "description": "Example: NERSC-Cori", - "type": "string" - }, - "git_url": { - "description": "Example: https://github.com/microbiomedata/mg_annotation/releases/tag/0.1", - "type": "string" - }, - "has_input": { - "description": "An input to a process.", - "items": { - "type": "string" - }, - "type": "array" - }, - "has_output": { - "description": "An output biosample to a processing step", - "items": { - "type": "string" - }, - "type": "array" - }, - "id": { - "description": "A unique identifier for a thing. Must be either a CURIE shorthand for a URI or a complete URI", - "pattern": "^(nmdc):wfmt-([0-9][a-z]{0,6}[0-9])-([A-Za-z0-9]{1,})(\\.[A-Za-z0-9]{1,})*(_[A-Za-z0-9_\\.-]+)?$", - "type": "string" - }, - "name": { - "description": "A human readable label for an entity", - "type": "string" - }, - "part_of": { - "description": "Links a resource to another resource that either logically or physically includes it.", - "items": { - "type": "string" - }, - "type": "array" - }, - "started_at_time": { - "format": "date-time", - "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$", - "type": "string" - }, - "type": { - "description": "An optional string that specifies the type object. 
This is used to allow for searches for different kinds of objects.", - "type": "string" - }, - "used": { - "type": "string" - }, - "was_associated_with": { - "description": "the agent/entity associated with the generation of the file", - "type": "string" - }, - "was_informed_by": { - "type": "string" - } - }, - "required": [ - "execution_resource", - "git_url", - "has_input", - "has_output", - "started_at_time", - "ended_at_time", - "was_informed_by" - ], - "title": "MetatranscriptomeActivity", - "type": "object" - }, - "MetatranscriptomeAnnotationActivity": { - "additionalProperties": false, - "description": "", - "properties": { - "ended_at_time": { - "format": "date-time", - "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$", - "type": "string" - }, - "execution_resource": { - "description": "Example: NERSC-Cori", - "type": "string" - }, - "git_url": { - "description": "Example: https://github.com/microbiomedata/mg_annotation/releases/tag/0.1", - "type": "string" - }, - "gold_analysis_project_identifiers": { - "description": "identifiers for corresponding analysis project in GOLD", - "items": { - "type": "string" - }, - "pattern": "^GOLD:Ga[0-9]+$", - "type": "array" - }, - "has_input": { - "description": "An input to a process.", - "items": { - "type": "string" - }, - "type": "array" - }, - "has_output": { - "description": "An output biosample to a processing step", - "items": { - "type": "string" - }, - "type": "array" - }, - "id": { - "description": "A unique identifier for a thing. 
Must be either a CURIE shorthand for a URI or a complete URI", - "pattern": "^(nmdc):wfmtan-([0-9][a-z]{0,6}[0-9])-([A-Za-z0-9]{1,})(\\.[A-Za-z0-9]{1,})*(_[A-Za-z0-9_\\.-]+)?$", - "type": "string" - }, - "name": { - "description": "A human readable label for an entity", - "type": "string" - }, - "part_of": { - "description": "Links a resource to another resource that either logically or physically includes it.", - "items": { - "type": "string" - }, - "type": "array" - }, - "started_at_time": { - "format": "date-time", - "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$", - "type": "string" - }, - "type": { - "description": "An optional string that specifies the type object. This is used to allow for searches for different kinds of objects.", - "type": "string" - }, - "used": { - "type": "string" - }, - "was_associated_with": { - "description": "the agent/entity associated with the generation of the file", - "type": "string" - }, - "was_informed_by": { - "type": "string" - } - }, - "required": [ - "execution_resource", - "git_url", - "has_input", - "has_output", - "id", - "started_at_time", - "ended_at_time", - "was_informed_by" - ], - "title": "MetatranscriptomeAnnotationActivity", - "type": "object" - }, - "MetatranscriptomeAssembly": { - "additionalProperties": false, - "description": "", - "properties": { - "asm_score": { - "description": "A score for comparing metagenomic assembly quality from same sample.", - "type": "number" - }, - "contig_bp": { - "description": "Total size in bp of all contigs.", - "type": "number" - }, - "contigs": { - "description": "The sum of the (length*log(length)) of all contigs, times some constant. 
Increase the contiguity, the score will increase", - "type": "number" - }, - "ctg_l50": { - "description": "Given a set of contigs, the L50 is defined as the sequence length of the shortest contig at 50% of the total genome length.", - "type": "number" - }, - "ctg_l90": { - "description": "The L90 statistic is less than or equal to the L50 statistic; it is the length for which the collection of all contigs of that length or longer contains at least 90% of the sum of the lengths of all contigs.", - "type": "number" - }, - "ctg_logsum": { - "description": "Maximum contig length.", - "type": "number" - }, - "ctg_max": { - "description": "Maximum contig length.", - "type": "number" - }, - "ctg_n50": { - "description": "Given a set of contigs, each with its own length, the N50 count is defined as the smallest number_of_contigs whose length sum makes up half of genome size.", - "type": "number" - }, - "ctg_n90": { - "description": "Given a set of contigs, each with its own length, the N90 count is defined as the smallest number of contigs whose length sum makes up 90% of genome size.", - "type": "number" - }, - "ctg_powsum": { - "description": "Powersum of all contigs is the same as logsum except that it uses the sum of (length*(length^P)) for some power P (default P=0.25).", - "type": "number" - }, - "ended_at_time": { - "format": "date-time", - "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$", - "type": "string" - }, - "execution_resource": { - "description": "Example: NERSC-Cori", - "type": "string" - }, - "gap_pct": { - "description": "The gap size percentage of all scaffolds.", - "type": "number" - }, - "gc_avg": { - "description": "Average of GC content of all contigs.", - "type": "number" - }, - 
"gc_std": { - "description": "Standard deviation of GC content of all contigs.", - "type": "number" - }, - "git_url": { - "description": "Example: https://github.com/microbiomedata/mg_annotation/releases/tag/0.1", - "type": "string" - }, - "has_input": { - "description": "An input to a process.", - "items": { - "type": "string" - }, - "type": "array" - }, - "has_output": { - "description": "An output biosample to a processing step", - "items": { - "type": "string" - }, - "type": "array" - }, - "id": { - "description": "A unique identifier for a thing. Must be either a CURIE shorthand for a URI or a complete URI", - "pattern": "^(nmdc):wfmtas-([0-9][a-z]{0,6}[0-9])-([A-Za-z0-9]{1,})(\\.[A-Za-z0-9]{1,})*(_[A-Za-z0-9_\\.-]+)?$", - "type": "string" - }, - "insdc_assembly_identifiers": { - "pattern": "^insdc.sra:[A-Z]+[0-9]+(\\.[0-9]+)?$", - "type": "string" - }, - "name": { - "description": "A human readable label for an entity", - "type": "string" - }, - "num_aligned_reads": { - "description": "The sequence count number of input reads aligned to assembled contigs.", - "type": "number" - }, - "num_input_reads": { - "description": "The sequence count number of input reads for assembly.", - "type": "number" - }, - "part_of": { - "description": "Links a resource to another resource that either logically or physically includes it.", - "items": { - "type": "string" - }, - "type": "array" - }, - "scaf_bp": { - "description": "Total size in bp of all scaffolds.", - "type": "number" - }, - "scaf_l50": { - "description": "Given a set of scaffolds, the L50 is defined as the sequence length of the shortest scaffold at 50% of the total genome length.", - "type": "number" - }, - "scaf_l90": { - "description": "The L90 statistic is less than or equal to the L50 statistic; it is the length for which the collection of all scaffolds of that length or longer contains at least 90% of the sum of the lengths of all scaffolds.", - "type": "number" - }, - "scaf_l_gt50k": { - "description": 
"Total size in bp of all scaffolds greater than 50 KB.", - "type": "number" - }, - "scaf_logsum": { - "description": "The sum of the (length*log(length)) of all scaffolds, times some constant. Increase the contiguity, the score will increase", - "type": "number" - }, - "scaf_max": { - "description": "Maximum scaffold length.", - "type": "number" - }, - "scaf_n50": { - "description": "Given a set of scaffolds, each with its own length, the N50 count is defined as the smallest number of scaffolds whose length sum makes up half of genome size.", - "type": "number" - }, - "scaf_n90": { - "description": "Given a set of scaffolds, each with its own length, the N90 count is defined as the smallest number of scaffolds whose length sum makes up 90% of genome size.", - "type": "number" - }, - "scaf_n_gt50k": { - "description": "Total sequence count of scaffolds greater than 50 KB.", - "type": "number" - }, - "scaf_pct_gt50k": { - "description": "Total sequence size percentage of scaffolds greater than 50 KB.", - "type": "number" - }, - "scaf_powsum": { - "description": "Powersum of all scaffolds is the same as logsum except that it uses the sum of (length*(length^P)) for some power P (default P=0.25).", - "type": "number" - }, - "scaffolds": { - "description": "Total sequence count of all scaffolds.", - "type": "number" - }, - "started_at_time": { - "format": "date-time", - "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$", - "type": "string" - }, - "type": { - "description": "An optional string that specifies the type object. 
This is used to allow for searches for different kinds of objects.", - "type": "string" - }, - "used": { - "type": "string" - }, - "was_associated_with": { - "description": "the agent/entity associated with the generation of the file", - "type": "string" - }, - "was_informed_by": { - "type": "string" - } - }, - "required": [ - "execution_resource", - "git_url", - "has_input", - "has_output", - "id", - "started_at_time", - "ended_at_time", - "was_informed_by" - ], - "title": "MetatranscriptomeAssembly", - "type": "object" - }, - "NomAnalysisActivity": { - "additionalProperties": false, - "description": "", - "properties": { - "ended_at_time": { - "format": "date-time", - "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$", - "type": "string" - }, - "execution_resource": { - "description": "Example: NERSC-Cori", - "type": "string" - }, - "git_url": { - "description": "Example: https://github.com/microbiomedata/mg_annotation/releases/tag/0.1", - "type": "string" - }, - "has_input": { - "description": "An input to a process.", - "items": { - "type": "string" - }, - "type": "array" - }, - "has_output": { - "description": "An output biosample to a processing step", - "items": { - "type": "string" - }, - "type": "array" - }, - "id": { - "description": "A unique identifier for a thing. 
Must be either a CURIE shorthand for a URI or a complete URI", - "pattern": "^(nmdc):wfnom-([0-9][a-z]{0,6}[0-9])-([A-Za-z0-9]{1,})(\\.[A-Za-z0-9]{1,})*(_[A-Za-z0-9_\\.-]+)?$", - "type": "string" - }, - "name": { - "description": "A human readable label for an entity", - "type": "string" - }, - "part_of": { - "description": "Links a resource to another resource that either logically or physically includes it.", - "items": { - "type": "string" - }, - "type": "array" - }, - "started_at_time": { - "format": "date-time", - "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$", - "type": "string" - }, - "type": { - "description": "An optional string that specifies the type object. This is used to allow for searches for different kinds of objects.", - "type": "string" - }, - "used": { - "description": "The instrument used to collect the data used in the analysis", - "type": "string" - }, - "was_associated_with": { - "description": "the agent/entity associated with the generation of the file", - "type": "string" - }, - "was_informed_by": { - "type": "string" - } - }, - "required": [ - "execution_resource", - "git_url", - "has_input", - "has_output", - "started_at_time", - "ended_at_time", - "was_informed_by" - ], - "title": "NomAnalysisActivity", - "type": "object" - }, - "OccupDocumentEnum": { - "description": "", - "enum": [ - "automated count", - "estimate", - "manual count", - "videos" - ], - "title": "OccupDocumentEnum", - "type": "string" - }, - "OmicsProcessing": { - "additionalProperties": false, - "description": "The methods and processes used to generate omics data from a biosample or organism.", - "properties": { - "add_date": { - "description": "The date on which the information was added to the 
database.", - "type": "string" - }, - "alternative_identifiers": { - "description": "A list of alternative identifiers for the entity.", - "items": { - "type": "string" - }, - "type": "array" - }, - "chimera_check": { - "$ref": "#/$defs/TextValue", - "description": "Tool(s) used for chimera checking, including version number and parameters, to discover and remove chimeric sequences. A chimeric sequence is comprised of two or more phylogenetically distinct parent sequences." - }, - "description": { - "description": "a human-readable description of a thing", - "type": "string" - }, - "gold_sequencing_project_identifiers": { - "description": "identifiers for corresponding sequencing project in GOLD", - "items": { - "type": "string" - }, - "pattern": "^GOLD:Gp[0-9]+$", - "type": "array" - }, - "has_input": { - "description": "An input to a process.", - "items": { - "type": "string" - }, - "type": "array" - }, - "has_output": { - "description": "An output biosample to a processing step", - "items": { - "type": "string" - }, - "type": "array" - }, - "id": { - "description": "A unique identifier for a thing. 
Must be either a CURIE shorthand for a URI or a complete URI", - "pattern": "^(nmdc):omprc-([0-9][a-z]{0,6}[0-9])-([A-Za-z0-9]{1,})(\\.[A-Za-z0-9]{1,})*(_[A-Za-z0-9_\\.-]+)?$", - "type": "string" - }, - "insdc_experiment_identifiers": { - "items": { - "type": "string" - }, - "pattern": "^insdc.sra:(E|D|S)RX[0-9]{6,}$", - "type": "array" - }, - "instrument_name": { - "description": "The name of the instrument that was used for processing the sample.", - "type": "string" - }, - "mod_date": { - "description": "The last date on which the database information was modified.", - "type": "string" - }, - "name": { - "description": "A human readable label for an entity", - "type": "string" - }, - "ncbi_project_name": { - "type": "string" - }, - "nucl_acid_amp": { - "$ref": "#/$defs/TextValue", - "description": "A link to a literature reference, electronic resource or a standard operating procedure (SOP), that describes the enzymatic amplification (PCR, TMA, NASBA) of specific nucleic acids" - }, - "nucl_acid_ext": { - "$ref": "#/$defs/TextValue", - "description": "A link to a literature reference, electronic resource or a standard operating procedure (SOP), that describes the material separation to recover the nucleic acid fraction from a sample" - }, - "omics_type": { - "$ref": "#/$defs/ControlledTermValue", - "description": "The type of omics data" - }, - "part_of": { - "description": "Links a resource to another resource that either logically or physically includes it.", - "items": { - "type": "string" - }, - "type": "array" - }, - "pcr_cond": { - "$ref": "#/$defs/TextValue", - "description": "Description of reaction conditions and components of PCR in the form of 'initial denaturation:94degC_1.5min; annealing=...'" - }, - "pcr_primers": { - "$ref": "#/$defs/TextValue", - "description": "PCR primers that were used to amplify the sequence of the targeted gene, locus or subfragment. 
This field should contain all the primers used for a single PCR reaction if multiple forward or reverse primers are present in a single PCR reaction. The primer sequence should be reported in uppercase letters" - }, - "principal_investigator": { - "$ref": "#/$defs/PersonValue", - "description": "Principal Investigator who led the study and/or generated the dataset." - }, - "processing_institution": { - "$ref": "#/$defs/ProcessingInstitutionEnum", - "description": "The organization that processed the sample." - }, - "samp_vol_we_dna_ext": { - "$ref": "#/$defs/QuantityValue", - "description": "Volume (ml) or mass (g) of total collected sample processed for DNA extraction. Note: total sample collected should be entered under the term Sample Size (mixs:0000001)." - }, - "seq_meth": { - "$ref": "#/$defs/TextValue", - "description": "Sequencing machine used. Where possible the term should be taken from the OBI list of DNA sequencers (http://purl.obolibrary.org/obo/OBI_0400103)." - }, - "seq_quality_check": { - "$ref": "#/$defs/TextValue", - "description": "Indicate if the sequence has been called by automatic systems (none) or undergone a manual editing procedure (e.g. by inspecting the raw data or chromatograms). Applied only for sequences that are not submitted to SRA,ENA or DRA" - }, - "target_gene": { - "$ref": "#/$defs/TextValue", - "description": "Targeted gene or locus name for marker gene studies" - }, - "target_subfragment": { - "$ref": "#/$defs/TextValue", - "description": "Name of subfragment of a gene or locus. Important to e.g. identify special regions on marker genes like V6 on 16S rRNA" - }, - "type": { - "description": "An optional string that specifies the type object. 
This is used to allow for searches for different kinds of objects.", - "type": "string" - } - }, - "required": [ - "has_input" - ], - "title": "OmicsProcessing", - "type": "object" - }, - "OntologyClass": { - "additionalProperties": false, - "description": "", - "properties": { - "alternative_identifiers": { - "description": "A list of alternative identifiers for the entity.", - "items": { - "type": "string" - }, - "type": "array" - }, - "description": { - "description": "a human-readable description of a thing", - "type": "string" - }, - "id": { - "description": "A unique identifier for a thing. Must be either a CURIE shorthand for a URI or a complete URI", - "type": "string" - }, - "name": { - "description": "A human readable label for an entity", - "type": "string" - } - }, - "required": [ - "id" - ], - "title": "OntologyClass", - "type": "object" - }, - "OrganismCountEnum": { - "description": "", - "enum": [ - "ATP", - "MPN", - "other" - ], - "title": "OrganismCountEnum", - "type": "string" - }, - "OrthologyGroup": { - "additionalProperties": false, - "description": "A set of genes or gene products in which all members are orthologous", - "properties": { - "alternative_identifiers": { - "description": "A list of alternative identifiers for the entity.", - "items": { - "type": "string" - }, - "type": "array" - }, - "description": { - "description": "a human-readable description of a thing", - "type": "string" - }, - "id": { - "description": "A unique identifier for a thing. 
Must be either a CURIE shorthand for a URI or a complete URI", - "type": "string" - }, - "name": { - "description": "A human readable label for an entity", - "type": "string" - } - }, - "required": [ - "id" - ], - "title": "OrthologyGroup", - "type": "object" - }, - "OxyStatSampEnum": { - "description": "", - "enum": [ - "aerobic", - "anaerobic", - "other" - ], - "title": "OxyStatSampEnum", - "type": "string" - }, - "Pathway": { - "additionalProperties": false, - "description": "A pathway is a sequence of steps/reactions carried out by an organism or community of organisms", - "properties": { - "alternative_identifiers": { - "description": "A list of alternative identifiers for the entity.", - "items": { - "type": "string" - }, - "type": "array" - }, - "description": { - "description": "a human-readable description of a thing", - "type": "string" - }, - "id": { - "description": "A unique identifier for a thing. Must be either a CURIE shorthand for a URI or a complete URI", - "type": "string" - }, - "name": { - "description": "A human readable label for an entity", - "type": "string" - } - }, - "required": [ - "id" - ], - "title": "Pathway", - "type": "object" - }, - "PeptideQuantification": { - "additionalProperties": false, - "description": "This is used to link a metaproteomics analysis workflow to a specific peptide sequence and related information", - "title": "PeptideQuantification", - "type": "object" - }, - "Person": { - "additionalProperties": false, - "description": "represents a person, such as a researcher", - "properties": { - "alternative_identifiers": { - "description": "A list of alternative identifiers for the entity.", - "items": { - "type": "string" - }, - "type": "array" - }, - "description": { - "description": "a human-readable description of a thing", - "type": "string" - }, - "id": { - "description": "Should be an ORCID. Specify in CURIE format. 
E.g ORCID:0000-1111-...", - "type": "string" - }, - "name": { - "description": "A human readable label for an entity", - "type": "string" - } - }, - "required": [ - "id" - ], - "title": "Person", - "type": "object" - }, - "PersonValue": { - "additionalProperties": false, - "description": "An attribute value representing a person", - "properties": { - "email": { - "description": "An email address for an entity such as a person. This should be the primarly email address used.", - "type": "string" - }, - "has_raw_value": { - "description": "The full name of the Investigator in format FIRST LAST.", - "type": "string" - }, - "name": { - "description": "The full name of the Investigator. It should follow the format FIRST [MIDDLE NAME| MIDDLE INITIAL] LAST, where MIDDLE NAME| MIDDLE INITIAL is optional.", - "type": "string" - }, - "orcid": { - "description": "The ORCID of a person.", - "type": "string" - }, - "profile_image_url": { - "description": "A url that points to an image of a person.", - "type": "string" - }, - "was_generated_by": { - "type": "string" - }, - "websites": { - "description": "A list of websites that are associated with the entity.", - "items": { - "type": "string" - }, - "type": "array" - } - }, - "title": "PersonValue", - "type": "object" - }, - "PlantGrowthMedEnum": { - "description": "", - "enum": [ - "other artificial liquid medium", - "other artificial solid medium", - "peat moss", - "perlite", - "pumice", - "sand", - "soil", - "vermiculite", - "water" - ], - "title": "PlantGrowthMedEnum", - "type": "string" - }, - "PlantSexEnum": { - "description": "", - "enum": [ - "Androdioecious", - "Androecious", - "Androgynous", - "Androgynomonoecious", - "Andromonoecious", - "Bisexual", - "Dichogamous", - "Diclinous", - "Dioecious", - "Gynodioecious", - "Gynoecious", - "Gynomonoecious", - "Hermaphroditic", - "Imperfect", - "Monoclinous", - "Monoecious", - "Perfect", - "Polygamodioecious", - "Polygamomonoecious", - "Polygamous", - "Protandrous", - 
"Protogynous", - "Subandroecious", - "Subdioecious", - "Subgynoecious", - "Synoecious", - "Trimonoecious", - "Trioecious", - "Unisexual" - ], - "title": "PlantSexEnum", - "type": "string" - }, - "ProcessingInstitutionEnum": { - "description": "", - "enum": [ - "UCSD", - "JGI", - "EMSL" - ], - "title": "ProcessingInstitutionEnum", - "type": "string" - }, - "ProfilePositionEnum": { - "description": "", - "enum": [ - "summit", - "shoulder", - "backslope", - "footslope", - "toeslope" - ], - "title": "ProfilePositionEnum", - "type": "string" - }, - "ProteinQuantification": { - "additionalProperties": false, - "description": "This is used to link a metaproteomics analysis workflow to a specific protein", - "title": "ProteinQuantification", - "type": "object" - }, - "QuadPosEnum": { - "description": "", - "enum": [ - "North side", - "West side", - "South side", - "East side" - ], - "title": "QuadPosEnum", - "type": "string" - }, - "QuantityValue": { - "additionalProperties": false, - "description": "A simple quantity, e.g. 
2cm", - "properties": { - "has_maximum_numeric_value": { - "description": "The maximum value part, expressed as number, of the quantity value when the value covers a range.", - "type": "number" - }, - "has_minimum_numeric_value": { - "description": "The minimum value part, expressed as number, of the quantity value when the value covers a range.", - "type": "number" - }, - "has_numeric_value": { - "description": "The number part of the quantity", - "type": "number" - }, - "has_raw_value": { - "description": "Unnormalized atomic string representation, should in syntax {number} {unit}", - "type": "string" - }, - "has_unit": { - "description": "The unit of the quantity", - "type": "string" - }, - "was_generated_by": { - "type": "string" - } - }, - "title": "QuantityValue", - "type": "object" - }, - "Reaction": { - "additionalProperties": false, - "description": "An individual biochemical transformation carried out by a functional unit of an organism, in which a collection of substrates are transformed into a collection of products. Can also represent transporters", - "properties": { - "alternative_identifiers": { - "description": "A list of alternative identifiers for the entity.", - "items": { - "type": "string" - }, - "type": "array" - }, - "description": { - "description": "a human-readable description of a thing", - "type": "string" - }, - "id": { - "description": "A unique identifier for a thing. 
Must be either a CURIE shorthand for a URI or a complete URI", - "type": "string" - }, - "name": { - "description": "A human readable label for an entity", - "type": "string" - } - }, - "required": [ - "id" - ], - "title": "Reaction", - "type": "object" - }, - "ReactionActivity": { - "additionalProperties": false, - "description": "", - "properties": { - "material_input": { - "type": "string" - }, - "material_output": { - "type": "string" - }, - "reaction_aided_by": { - "$ref": "#/$defs/LabDevice" - }, - "reaction_temperature": { - "type": "string" - }, - "reaction_time": { - "$ref": "#/$defs/QuantityValue" - } - }, - "title": "ReactionActivity", - "type": "object" - }, - "ReactionParticipant": { - "additionalProperties": false, - "description": "Instances of this link a reaction to a chemical entity participant", - "title": "ReactionParticipant", - "type": "object" - }, - "ReadBasedTaxonomyAnalysisActivity": { - "additionalProperties": false, - "description": "A workflow execution activity that performs taxonomy classification using sequencing reads", - "properties": { - "ended_at_time": { - "format": "date-time", - "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$", - "type": "string" - }, - "execution_resource": { - "description": "Example: NERSC-Cori", - "type": "string" - }, - "git_url": { - "description": "Example: https://github.com/microbiomedata/mg_annotation/releases/tag/0.1", - "type": "string" - }, - "has_input": { - "description": "An input to a process.", - "items": { - "type": "string" - }, - "type": "array" - }, - "has_output": { - "description": "An output biosample to a processing step", - "items": { - "type": "string" - }, - "type": "array" - }, - "id": { - "description": "A unique 
identifier for a thing. Must be either a CURIE shorthand for a URI or a complete URI", - "pattern": "^(nmdc):wfrbt-([0-9][a-z]{0,6}[0-9])-([A-Za-z0-9]{1,})(\\.[A-Za-z0-9]{1,})*(_[A-Za-z0-9_\\.-]+)?$", - "type": "string" - }, - "name": { - "description": "A human readable label for an entity", - "type": "string" - }, - "part_of": { - "description": "Links a resource to another resource that either logically or physically includes it.", - "items": { - "type": "string" - }, - "type": "array" - }, - "started_at_time": { - "format": "date-time", - "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$", - "type": "string" - }, - "type": { - "description": "An optional string that specifies the type object. This is used to allow for searches for different kinds of objects.", - "type": "string" - }, - "used": { - "type": "string" - }, - "was_associated_with": { - "description": "the agent/entity associated with the generation of the file", - "type": "string" - }, - "was_informed_by": { - "type": "string" - } - }, - "required": [ - "execution_resource", - "git_url", - "has_input", - "has_output", - "started_at_time", - "ended_at_time", - "was_informed_by" - ], - "title": "ReadBasedTaxonomyAnalysisActivity", - "type": "object" - }, - "ReadQcAnalysisActivity": { - "additionalProperties": false, - "description": "A workflow execution activity that performs quality control on raw Illumina reads including quality trimming, artifact removal, linker trimming, adapter trimming, spike-in removal, and human/cat/dog/mouse/microbe contaminant removal", - "properties": { - "ended_at_time": { - "format": "date-time", - "pattern": 
"^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$", - "type": "string" - }, - "execution_resource": { - "description": "Example: NERSC-Cori", - "type": "string" - }, - "git_url": { - "description": "Example: https://github.com/microbiomedata/mg_annotation/releases/tag/0.1", - "type": "string" - }, - "has_input": { - "description": "An input to a process.", - "items": { - "type": "string" - }, - "type": "array" - }, - "has_output": { - "description": "An output biosample to a processing step", - "items": { - "type": "string" - }, - "type": "array" - }, - "id": { - "description": "A unique identifier for a thing. Must be either a CURIE shorthand for a URI or a complete URI", - "pattern": "^(nmdc):wfrqc-([0-9][a-z]{0,6}[0-9])-([A-Za-z0-9]{1,})(\\.[A-Za-z0-9]{1,})*(_[A-Za-z0-9_\\.-]+)?$", - "type": "string" - }, - "input_base_count": { - "description": "The nucleotide base count number of input reads for QC analysis.", - "type": "number" - }, - "input_read_count": { - "description": "The sequence count number of input reads for QC analysis.", - "type": "number" - }, - "name": { - "description": "A human readable label for an entity", - "type": "string" - }, - "output_base_count": { - "description": "After QC analysis nucleotide base count number.", - "type": "number" - }, - "output_read_count": { - "description": "After QC analysis sequence count number.", - "type": "number" - }, - "part_of": { - "description": "Links a resource to another resource that either logically or physically includes it.", - "items": { - "type": "string" - }, - "type": "array" - }, - "started_at_time": { - "format": "date-time", - "pattern": 
"^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$", - "type": "string" - }, - "type": { - "description": "An optional string that specifies the type object. This is used to allow for searches for different kinds of objects.", - "type": "string" - }, - "used": { - "type": "string" - }, - "was_associated_with": { - "description": "the agent/entity associated with the generation of the file", - "type": "string" - }, - "was_informed_by": { - "type": "string" - } - }, - "required": [ - "execution_resource", - "git_url", - "has_input", - "has_output", - "started_at_time", - "ended_at_time", - "was_informed_by" - ], - "title": "ReadQcAnalysisActivity", - "type": "object" - }, - "RelSampLocEnum": { - "description": "", - "enum": [ - "edge of car", - "center of car", - "under a seat" - ], - "title": "RelSampLocEnum", - "type": "string" - }, - "RelToOxygenEnum": { - "description": "", - "enum": [ - "aerobe", - "anaerobe", - "facultative", - "microaerophilic", - "microanaerobe", - "obligate aerobe", - "obligate anaerobe" - ], - "title": "RelToOxygenEnum", - "type": "string" - }, - "RnaContTypeEnum": { - "description": "", - "enum": [ - "plate", - "tube" - ], - "title": "RnaContTypeEnum", - "type": "string" - }, - "RnaSampleFormatEnum": { - "description": "", - "enum": [ - "10 mM Tris-HCl", - "DNAStable", - "Ethanol", - "Low EDTA TE", - "MDA reaction buffer", - "PBS", - "Pellet", - "RNAStable", - "TE", - "Water", - "Gentegra-DNA", - "Gentegra-RNA" - ], - "title": "RnaSampleFormatEnum", - "type": "string" - }, - "RoomCondtEnum": { - "description": "", - "enum": [ - "new", - "visible wear", - "needs repair", - "damaged", - "rupture", - "visible signs of mold/mildew" - ], - "title": "RoomCondtEnum", - "type": "string" - }, - 
"RoomConnectedEnum": { - "description": "", - "enum": [ - "attic", - "bathroom", - "closet", - "conference room", - "elevator", - "examining room", - "hallway", - "kitchen", - "mail room", - "office", - "stairwell" - ], - "title": "RoomConnectedEnum", - "type": "string" - }, - "RoomLocEnum": { - "description": "", - "enum": [ - "corner room", - "interior room", - "exterior wall" - ], - "title": "RoomLocEnum", - "type": "string" - }, - "RoomSampPosEnum": { - "description": "", - "enum": [ - "north corner", - "south corner", - "west corner", - "east corner", - "northeast corner", - "northwest corner", - "southeast corner", - "southwest corner", - "center" - ], - "title": "RoomSampPosEnum", - "type": "string" - }, - "RoomTypeEnum": { - "description": "", - "enum": [ - "attic", - "bathroom", - "closet", - "conference room", - "elevator", - "examining room", - "hallway", - "kitchen", - "mail room", - "private office", - "open office", - "stairwell", - ",restroom", - "lobby", - "vestibule", - "mechanical or electrical room", - "data center", - "laboratory_wet", - "laboratory_dry", - "gymnasium", - "natatorium", - "auditorium", - "lockers", - "cafe", - "warehouse" - ], - "title": "RoomTypeEnum", - "type": "string" - }, - "SampCaptStatusEnum": { - "description": "", - "enum": [ - "active surveillance in response to an outbreak", - "active surveillance not initiated by an outbreak", - "farm sample", - "market sample", - "other" - ], - "title": "SampCaptStatusEnum", - "type": "string" - }, - "SampCollectPointEnum": { - "description": "", - "enum": [ - "well", - "test well", - "drilling rig", - "wellhead", - "separator", - "storage tank", - "other" - ], - "title": "SampCollectPointEnum", - "type": "string" - }, - "SampDisStageEnum": { - "description": "", - "enum": [ - "dissemination", - "growth and reproduction", - "infection", - "inoculation", - "penetration", - "other" - ], - "title": "SampDisStageEnum", - "type": "string" - }, - "SampFloorEnum": { - "description": "", - 
"enum": [ - "1st floor", - "2nd floor", - "basement", - "lobby" - ], - "title": "SampFloorEnum", - "type": "string" - }, - "SampMdEnum": { - "description": "", - "enum": [ - "DF", - "RT", - "KB", - "MSL", - "other" - ], - "title": "SampMdEnum", - "type": "string" - }, - "SampSubtypeEnum": { - "description": "", - "enum": [ - "oil phase", - "water phase", - "biofilm", - "not applicable", - "other" - ], - "title": "SampSubtypeEnum", - "type": "string" - }, - "SampWeatherEnum": { - "description": "", - "enum": [ - "clear sky", - "cloudy", - "foggy", - "hail", - "rain", - "snow", - "sleet", - "sunny", - "windy" - ], - "title": "SampWeatherEnum", - "type": "string" - }, - "SampleTypeEnum": { - "description": "", - "enum": [ - "soil", - "water_extract_soil" - ], - "title": "SampleTypeEnum", - "type": "string" - }, - "SamplingMethodEnum": { - "description": "", - "enum": [ - "weighing" - ], - "title": "SamplingMethodEnum", - "type": "string" - }, - "SeasonUseEnum": { - "description": "", - "enum": [ - "Spring", - "Summer", - "Fall", - "Winter" - ], - "title": "SeasonUseEnum", - "type": "string" - }, - "SedimentTypeEnum": { - "description": "", - "enum": [ - "biogenous", - "cosmogenous", - "hydrogenous", - "lithogenous" - ], - "title": "SedimentTypeEnum", - "type": "string" - }, - "ShadingDeviceCondEnum": { - "description": "", - "enum": [ - "damaged", - "needs repair", - "new", - "rupture", - "visible wear" - ], - "title": "ShadingDeviceCondEnum", - "type": "string" - }, - "ShadingDeviceTypeEnum": { - "description": "", - "enum": [ - "bahama shutters", - "exterior roll blind", - "gambrel awning", - "hood awning", - "porchroller awning", - "sarasota shutters", - "slatted aluminum", - "solid aluminum awning", - "sun screen", - "tree", - "trellis", - "venetian awning" - ], - "title": "ShadingDeviceTypeEnum", - "type": "string" - }, - "SoilHorizonEnum": { - "description": "", - "enum": [ - "O horizon", - "A horizon", - "E horizon", - "B horizon", - "C horizon", - "R layer", - 
"Permafrost" - ], - "title": "SoilHorizonEnum", - "type": "string" - }, - "SolventEnum": { - "description": "", - "enum": [ - "deionized_water", - "methanol", - "chloroform" - ], - "title": "SolventEnum", - "type": "string" - }, - "SpecificEnum": { - "description": "", - "enum": [ - "operation", - "as built", - "construction", - "bid", - "design", - "photos" - ], - "title": "SpecificEnum", - "type": "string" - }, - "SrDepEnvEnum": { - "description": "", - "enum": [ - "Lacustine", - "Fluvioldeltaic", - "Fluviomarine", - "Marine", - "other" - ], - "title": "SrDepEnvEnum", - "type": "string" - }, - "SrGeolAgeEnum": { - "description": "", - "enum": [ - "Archean", - "Cambrian", - "Carboniferous", - "Cenozoic", - "Cretaceous", - "Devonian", - "Jurassic", - "Mesozoic", - "Neogene", - "Ordovician", - "Paleogene", - "Paleozoic", - "Permian", - "Precambrian", - "Proterozoic", - "Silurian", - "Triassic", - "other" - ], - "title": "SrGeolAgeEnum", - "type": "string" - }, - "SrKerogTypeEnum": { - "description": "", - "enum": [ - "Type I", - "Type II", - "Type III", - "Type IV", - "other" - ], - "title": "SrKerogTypeEnum", - "type": "string" - }, - "SrLithologyEnum": { - "description": "", - "enum": [ - "Clastic", - "Carbonate", - "Coal", - "Biosilicieous", - "other" - ], - "title": "SrLithologyEnum", - "type": "string" - }, - "Study": { - "additionalProperties": false, - "description": "A study summarizes the overall goal of a research initiative and outlines the key objective of its underlying projects.", - "properties": { - "abstract": { - "description": "The abstract of manuscript/grant associated with the entity; i.e., a summary of the resource.", - "type": "string" - }, - "alternative_descriptions": { - "description": "A list of alternative descriptions for the entity. 
The distinction between description and alternative descriptions is application-specific.", - "items": { - "type": "string" - }, - "type": "array" - }, - "alternative_identifiers": { - "description": "Unique identifier for a study submitted to additional resources. Matches that which has been submitted to NMDC", - "items": { - "type": "string" - }, - "type": "array" - }, - "alternative_names": { - "description": "A list of alternative names used to refer to the entity. The distinction between name and alternative names is application-specific.", - "items": { - "type": "string" - }, - "type": "array" - }, - "alternative_titles": { - "description": "A list of alternative titles for the entity. The distinction between title and alternative titles is application-specific.", - "items": { - "type": "string" - }, - "type": "array" - }, - "description": { - "description": "A brief summary that details the study you're submitted to NMDC", - "type": "string" - }, - "doi": { - "$ref": "#/$defs/AttributeValue", - "description": "The dataset citation for this study" - }, - "ecosystem": { - "description": "An ecosystem is a combination of a physical environment (abiotic factors) and all the organisms (biotic factors) that interact with this environment. Ecosystem is in position 1/5 in a GOLD path.", - "type": "string" - }, - "ecosystem_category": { - "description": "Ecosystem categories represent divisions within the ecosystem based on specific characteristics of the environment from where an organism or sample is isolated. Ecosystem category is in position 2/5 in a GOLD path.", - "type": "string" - }, - "ecosystem_subtype": { - "description": "Ecosystem subtypes represent further subdivision of Ecosystem types into more distinct subtypes. Ecosystem subtype is in position 4/5 in a GOLD path.", - "type": "string" - }, - "ecosystem_type": { - "description": "Ecosystem types represent things having common characteristics within the Ecosystem Category. 
These common characteristics based grouping is still broad but specific to the characteristics of a given environment. Ecosystem type is in position 3/5 in a GOLD path.", - "type": "string" - }, - "emsl_proposal_doi": { - "description": "The DOI for the EMSL awarded study that relates to the NMDC submitted study", - "type": "string" - }, - "emsl_proposal_identifier": { - "description": "The proposal number assigned to the EMSL awarded study that relates to that which is represented in NMDC.", - "type": "string" - }, - "ess_dive_datasets": { - "description": "List of ESS-DIVE dataset DOIs", - "items": { - "type": "string" - }, - "type": "array" - }, - "funding_sources": { - "items": { - "type": "string" - }, - "type": "array" - }, - "gold_study_identifiers": { - "description": "identifiers for corresponding project(s) in GOLD", - "items": { - "type": "string" - }, - "pattern": "^GOLD:Gs[0-9]+$", - "type": "array" - }, - "has_credit_associations": { - "description": "This slot links a study to a credit association. The credit association will be linked to a person value and to a CRediT Contributor Roles term. Overall semantics: person should get credit X for their participation in the study", - "items": { - "$ref": "#/$defs/CreditAssociation" - }, - "type": "array" - }, - "id": { - "description": "A unique identifier for a thing. Must be either a CURIE shorthand for a URI or a complete URI", - "pattern": "^(nmdc):sty-([0-9][a-z]{0,6}[0-9])-([A-Za-z0-9]{1,})(\\.[A-Za-z0-9]{1,})*(_[A-Za-z0-9_\\.-]+)?$", - "type": "string" - }, - "mgnify_project_identifiers": { - "description": "identifiers for corresponding project in MGnify", - "pattern": "^mgnify.proj:[A-Z]+[0-9]+$", - "type": "string" - }, - "name": { - "description": "A human readable label for an entity", - "type": "string" - }, - "objective": { - "description": "The scientific objectives associated with the entity. 
It SHOULD correspond to scientific norms for objectives field in a structured abstract.", - "type": "string" - }, - "principal_investigator": { - "$ref": "#/$defs/PersonValue", - "description": "Principal Investigator who led the study and/or generated the dataset." - }, - "publications": { - "description": "A list of publications that are associated with the entity. The publications SHOULD be given using an identifier, such as a DOI or Pubmed ID, if possible.", - "items": { - "type": "string" - }, - "type": "array" - }, - "related_identifiers": { - "description": "Unique identifier for a study submitted to additional resources. Similar, but not necessarily identical to that which has been submitted to NMDC", - "type": "string" - }, - "relevant_protocols": { - "items": { - "type": "string" - }, - "type": "array" - }, - "specific_ecosystem": { - "description": "Specific ecosystems represent specific features of the environment like aphotic zone in an ocean or gastric mucosa within a host digestive system. Specific ecosystem is in position 5/5 in a GOLD path.", - "type": "string" - }, - "study_image": { - "description": "Links a study to one or more images.", - "items": { - "$ref": "#/$defs/ImageValue" - }, - "type": "array" - }, - "title": { - "description": "A name given to the entity that differs from the name/label programmatically assigned to it. For example, when extracting study information for GOLD, the GOLD system has assigned a name/label. However, for display purposes, we may also wish the capture the title of the proposal that was used to fund the study.", - "type": "string" - }, - "type": { - "description": "An optional string that specifies the type object. 
This is used to allow for searches for different kinds of objects.", - "type": "string" - }, - "websites": { - "description": "A list of websites that are associated with the entity.", - "items": { - "type": "string" - }, - "type": "array" - } - }, - "title": "Study", - "type": "object" - }, - "SubstructureTypeEnum": { - "description": "", - "enum": [ - "crawlspace", - "slab on grade", - "basement" - ], - "title": "SubstructureTypeEnum", - "type": "string" - }, - "SurfAirContEnum": { - "description": "", - "enum": [ - "dust", - "organic matter", - "particulate matter", - "volatile organic compounds", - "biological contaminants", - "radon", - "nutrients", - "biocides" - ], - "title": "SurfAirContEnum", - "type": "string" - }, - "SurfMaterialEnum": { - "description": "", - "enum": [ - "adobe", - "carpet", - "cinder blocks", - "concrete", - "hay bales", - "glass", - "metal", - "paint", - "plastic", - "stainless steel", - "stone", - "stucco", - "tile", - "vinyl", - "wood" - ], - "title": "SurfMaterialEnum", - "type": "string" - }, - "TextValue": { - "additionalProperties": false, - "description": "A basic string value", - "properties": { - "has_raw_value": { - "description": "The value that was specified for an annotation in raw form, i.e. a string. E.g. \"2 cm\" or \"2-4 cm\"", - "type": "string" - }, - "language": { - "description": "Should use ISO 639-1 code e.g. 
\"en\", \"fr\"", - "type": "string" - }, - "was_generated_by": { - "type": "string" - } - }, - "title": "TextValue", - "type": "object" - }, - "TidalStageEnum": { - "description": "", - "enum": [ - "low tide", - "ebb tide", - "flood tide", - "high tide" - ], - "title": "TidalStageEnum", - "type": "string" - }, - "TillageEnum": { - "description": "", - "enum": [ - "drill", - "cutting disc", - "ridge till", - "strip tillage", - "zonal tillage", - "chisel", - "tined", - "mouldboard", - "disc plough" - ], - "title": "TillageEnum", - "type": "string" - }, - "TimestampValue": { - "additionalProperties": false, - "description": "A value that is a timestamp. The range should be ISO-8601", - "properties": { - "has_raw_value": { - "description": "The value that was specified for an annotation in raw form, i.e. a string. E.g. \"2 cm\" or \"2-4 cm\"", - "type": "string" - }, - "was_generated_by": { - "type": "string" - } - }, - "title": "TimestampValue", - "type": "object" - }, - "TrainLineEnum": { - "description": "", - "enum": [ - "red", - "green", - "orange" - ], - "title": "TrainLineEnum", - "type": "string" - }, - "TrainStatLocEnum": { - "description": "", - "enum": [ - "south station above ground", - "south station underground", - "south station amtrak", - "forest hills", - "riverside" - ], - "title": "TrainStatLocEnum", - "type": "string" - }, - "TrainStopLocEnum": { - "description": "", - "enum": [ - "end", - "mid", - "downtown" - ], - "title": "TrainStopLocEnum", - "type": "string" - }, - "UrlValue": { - "additionalProperties": false, - "description": "A value that is a string that conforms to URL syntax", - "properties": { - "has_raw_value": { - "description": "The value that was specified for an annotation in raw form, i.e. a string. E.g. 
\"2 cm\" or \"2-4 cm\"", - "type": "string" - }, - "was_generated_by": { - "type": "string" - } - }, - "title": "UrlValue", - "type": "object" - }, - "VisMediaEnum": { - "description": "", - "enum": [ - "photos", - "videos", - "commonly of the building", - "site context (adjacent buildings, vegetation, terrain, streets)", - "interiors", - "equipment", - "3D scans" - ], - "title": "VisMediaEnum", - "type": "string" - }, - "WallConstTypeEnum": { - "description": "", - "enum": [ - "frame construction", - "joisted masonry", - "light noncombustible", - "masonry noncombustible", - "modified fire resistive", - "fire resistive" - ], - "title": "WallConstTypeEnum", - "type": "string" - }, - "WallFinishMatEnum": { - "description": "", - "enum": [ - "plaster", - "gypsum plaster", - "veneer plaster", - "gypsum board", - "tile", - "terrazzo", - "stone facing", - "acoustical treatment", - "wood", - "metal", - "masonry" - ], - "title": "WallFinishMatEnum", - "type": "string" - }, - "WallLocEnum": { - "description": "", - "enum": [ - "north", - "south", - "east", - "west" - ], - "title": "WallLocEnum", - "type": "string" - }, - "WallSurfTreatmentEnum": { - "description": "", - "enum": [ - "painted", - "wall paper", - "no treatment", - "paneling", - "stucco", - "fabric" - ], - "title": "WallSurfTreatmentEnum", - "type": "string" - }, - "WallTextureEnum": { - "description": "", - "enum": [ - "crows feet", - "crows-foot stomp", - "double skip", - "hawk and trowel", - "knockdown", - "popcorn", - "orange peel", - "rosebud stomp", - "Santa-Fe texture", - "skip trowel", - "smooth", - "stomp knockdown", - "swirl" - ], - "title": "WallTextureEnum", - "type": "string" - }, - "WaterFeatTypeEnum": { - "description": "", - "enum": [ - "fountain", - "pool", - "standing feature", - "stream", - "waterfall" - ], - "title": "WaterFeatTypeEnum", - "type": "string" - }, - "WeekdayEnum": { - "description": "", - "enum": [ - "Monday", - "Tuesday", - "Wednesday", - "Thursday", - "Friday", - "Saturday", 
- "Sunday" - ], - "title": "WeekdayEnum", - "type": "string" - }, - "WindowCondEnum": { - "description": "", - "enum": [ - "damaged", - "needs repair", - "new", - "rupture", - "visible wear" - ], - "title": "WindowCondEnum", - "type": "string" - }, - "WindowCoverEnum": { - "description": "", - "enum": [ - "blinds", - "curtains", - "none" - ], - "title": "WindowCoverEnum", - "type": "string" - }, - "WindowHorizPosEnum": { - "description": "", - "enum": [ - "left", - "middle", - "right" - ], - "title": "WindowHorizPosEnum", - "type": "string" - }, - "WindowLocEnum": { - "description": "", - "enum": [ - "north", - "south", - "east", - "west" - ], - "title": "WindowLocEnum", - "type": "string" - }, - "WindowMatEnum": { - "description": "", - "enum": [ - "clad", - "fiberglass", - "metal", - "vinyl", - "wood" - ], - "title": "WindowMatEnum", - "type": "string" - }, - "WindowTypeEnum": { - "description": "", - "enum": [ - "single-hung sash window", - "horizontal sash window", - "fixed window" - ], - "title": "WindowTypeEnum", - "type": "string" - }, - "WindowVertPosEnum": { - "description": "", - "enum": [ - "bottom", - "middle", - "top", - "low", - "high" - ], - "title": "WindowVertPosEnum", - "type": "string" - }, - "WorkflowExecutionActivity": { - "additionalProperties": false, - "description": "Represents an instance of an execution of a particular workflow", - "properties": { - "ended_at_time": { - "format": "date-time", - "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$", - "type": "string" - }, - "execution_resource": { - "description": "Example: NERSC-Cori", - "type": "string" - }, - "git_url": { - "description": "Example: https://github.com/microbiomedata/mg_annotation/releases/tag/0.1", - "type": 
"string" - }, - "has_input": { - "description": "An input to a process.", - "items": { - "type": "string" - }, - "type": "array" - }, - "has_output": { - "description": "An output biosample to a processing step", - "items": { - "type": "string" - }, - "type": "array" - }, - "id": { - "description": "A unique identifier for a thing. Must be either a CURIE shorthand for a URI or a complete URI", - "pattern": "^(nmdc):wf-([0-9][a-z]{0,6}[0-9])-([A-Za-z0-9]{1,})(\\.[A-Za-z0-9]{1,})*(_[A-Za-z0-9_\\.-]+)?$", - "type": "string" - }, - "name": { - "description": "A human readable label for an entity", - "type": "string" - }, - "part_of": { - "description": "Links a resource to another resource that either logically or physically includes it.", - "items": { - "type": "string" - }, - "type": "array" - }, - "started_at_time": { - "format": "date-time", - "pattern": "^([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\3([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\17[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?$", - "type": "string" - }, - "type": { - "description": "An optional string that specifies the type object. 
This is used to allow for searches for different kinds of objects.", - "type": "string" - }, - "used": { - "type": "string" - }, - "was_associated_with": { - "description": "the agent/entity associated with the generation of the file", - "type": "string" - }, - "was_informed_by": { - "type": "string" - } - }, - "required": [ - "execution_resource", - "git_url", - "has_input", - "has_output", - "started_at_time", - "ended_at_time", - "was_informed_by" - ], - "title": "WorkflowExecutionActivity", - "type": "object" - } - }, - "$id": "https://microbiomedata/schema", - "$schema": "http://json-schema.org/draft-07/schema#", - "additionalProperties": false, - "metamodel_version": "1.7.0", - "properties": { - "activity_set": { - "description": "This property links a database object to the set of workflow activities.", - "items": { - "$ref": "#/$defs/WorkflowExecutionActivity" - }, - "type": "array" - }, - "biosample_set": { - "description": "This property links a database object to the set of samples within it.", - "items": { - "$ref": "#/$defs/Biosample" - }, - "type": "array" - }, - "collecting_biosamples_from_site_set": { - "items": { - "$ref": "#/$defs/CollectingBiosamplesFromSite" - }, - "type": "array" - }, - "data_object_set": { - "description": "This property links a database object to the set of data objects within it.", - "items": { - "$ref": "#/$defs/DataObject" - }, - "type": "array" - }, - "dissolving_activity_set": { - "items": { - "$ref": "#/$defs/DissolvingActivity" - }, - "type": "array" - }, - "field_research_site_set": { - "items": { - "$ref": "#/$defs/FieldResearchSite" - }, - "type": "array" - }, - "functional_annotation_set": { - "description": "This property links a database object to the set of all functional annotations", - "items": { - "$ref": "#/$defs/FunctionalAnnotation" - }, - "type": "array" - }, - "genome_feature_set": { - "description": "This property links a database object to the set of all features", - "items": { - "$ref": 
"#/$defs/GenomeFeature" - }, - "type": "array" - }, - "mags_activity_set": { - "description": "This property links a database object to the set of MAGs analysis activities.", - "items": { - "$ref": "#/$defs/MagsAnalysisActivity" - }, - "type": "array" - }, - "material_sample_set": { - "items": { - "$ref": "#/$defs/MaterialSample" - }, - "type": "array" - }, - "material_sampling_activity_set": { - "items": { - "$ref": "#/$defs/MaterialSamplingActivity" - }, - "type": "array" - }, - "metabolomics_analysis_activity_set": { - "description": "This property links a database object to the set of metabolomics analysis activities.", - "items": { - "$ref": "#/$defs/MetabolomicsAnalysisActivity" - }, - "type": "array" - }, - "metagenome_annotation_activity_set": { - "description": "This property links a database object to the set of metagenome annotation activities.", - "items": { - "$ref": "#/$defs/MetagenomeAnnotationActivity" - }, - "type": "array" - }, - "metagenome_assembly_set": { - "description": "This property links a database object to the set of metagenome assembly activities.", - "items": { - "$ref": "#/$defs/MetagenomeAssembly" - }, - "type": "array" - }, - "metaproteomics_analysis_activity_set": { - "description": "This property links a database object to the set of metaproteomics analysis activities.", - "items": { - "$ref": "#/$defs/MetaproteomicsAnalysisActivity" - }, - "type": "array" - }, - "metatranscriptome_activity_set": { - "description": "TODO", - "items": { - "$ref": "#/$defs/MetatranscriptomeActivity" - }, - "type": "array" - }, - "nom_analysis_activity_set": { - "description": "This property links a database object to the set of natural organic matter (NOM) analysis activities.", - "items": { - "$ref": "#/$defs/NomAnalysisActivity" - }, - "type": "array" - }, - "omics_processing_set": { - "description": "This property links a database object to the set of omics processings within it.", - "items": { - "$ref": "#/$defs/OmicsProcessing" - }, - "type": 
"array" - }, - "reaction_activity_set": { - "items": { - "$ref": "#/$defs/ReactionActivity" - }, - "type": "array" - }, - "read_based_taxonomy_analysis_activity_set": { - "description": "This property links a database object to the set of read based analysis activities.", - "items": { - "$ref": "#/$defs/ReadBasedTaxonomyAnalysisActivity" - }, - "type": "array" - }, - "read_qc_analysis_activity_set": { - "description": "This property links a database object to the set of read QC analysis activities.", - "items": { - "$ref": "#/$defs/ReadQcAnalysisActivity" - }, - "type": "array" - }, - "study_set": { - "description": "This property links a database object to the set of studies within it.", - "items": { - "$ref": "#/$defs/Study" - }, - "type": "array" - } - }, - "title": "NMDC", - "type": "object", - "version": "7.0.0" -} From 940c41c4b6cf143c362ba4995e49eaf59a671f5f Mon Sep 17 00:00:00 2001 From: Patrick Kalita Date: Thu, 18 Apr 2024 08:42:37 -0700 Subject: [PATCH 16/18] Upgrade to nmdc-schema 10.2.0 --- requirements/main.in | 2 +- requirements/main.txt | 17 ++++++++++++++++- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/requirements/main.in b/requirements/main.in index f195a543..45998c84 100644 --- a/requirements/main.in +++ b/requirements/main.in @@ -24,7 +24,7 @@ mkdocs-jupyter mkdocs-material mkdocs-mermaid2-plugin motor -nmdc-schema==10.1.4 +nmdc-schema==10.2.0 openpyxl pandas passlib[bcrypt] diff --git a/requirements/main.txt b/requirements/main.txt index 72cd2d0d..fd18a174 100644 --- a/requirements/main.txt +++ b/requirements/main.txt @@ -62,6 +62,8 @@ beautifulsoup4==4.12.3 # -r requirements/main.in # mkdocs-mermaid2-plugin # nbconvert +bioregistry==0.10.158 + # via nmdc-schema black==24.2.0 # via shed bleach==6.1.0 @@ -95,6 +97,7 @@ click==8.1.7 # via # -r requirements/main.in # beanie + # bioregistry # black # dagster # dagster-webserver @@ -102,7 +105,9 @@ click==8.1.7 # linkml # linkml-runtime # mkdocs + # more-click # prefixcommons + # 
pystow # terminusdb-client # uvicorn colorama==0.4.6 @@ -121,6 +126,7 @@ cryptography==42.0.5 # via python-jose curies==0.7.7 # via + # bioregistry # linkml-runtime # prefixmaps dagit==1.6.8 @@ -435,6 +441,8 @@ mkdocs-mermaid2-plugin==0.6.0 # nmdc-schema mkdocs-redirects==1.2.1 # via nmdc-schema +more-click==0.1.2 + # via bioregistry motor==3.3.2 # via # -r requirements/main.in @@ -460,7 +468,7 @@ nbformat==5.9.2 # nbconvert nest-asyncio==1.6.0 # via ipykernel -nmdc-schema==10.1.4 +nmdc-schema==10.2.0 # via -r requirements/main.in notebook==7.1.1 # via jupyter @@ -566,6 +574,7 @@ pydantic==2.6.3 # via # -r requirements/main.in # beanie + # bioregistry # curies # dagster # fastapi @@ -608,6 +617,8 @@ pyshexc==0.9.1 # via # linkml # pyshex +pystow==0.5.4 + # via bioregistry pytest==8.0.2 # via pytest-logging pytest-logging==2015.11.4 @@ -700,6 +711,7 @@ regex==2023.12.25 requests==2.31.0 # via # -r requirements/main.in + # bioregistry # curies # dagster # dagster-graphql @@ -711,6 +723,7 @@ requests==2.31.0 # mkdocs-mermaid2-plugin # prefixcommons # pyshex + # pystow # requests-cache # requests-toolbelt # sphinx @@ -857,7 +870,9 @@ tornado==6.4 tqdm==4.66.2 # via # -r requirements/main.in + # bioregistry # dagster + # pystow # terminusdb-client traitlets==5.14.1 # via From 8bb3b081f1b21998a9acbb2810cd939d62f3711e Mon Sep 17 00:00:00 2001 From: Donny Winston Date: Fri, 19 Apr 2024 15:27:04 -0400 Subject: [PATCH 17/18] feat: bi-directional graph association search also refactor start_query to simply start_id --- nmdc_runtime/api/endpoints/nmdcschema.py | 52 +++++++++--------------- nmdc_runtime/api/models/util.py | 3 +- 2 files changed, 20 insertions(+), 35 deletions(-) diff --git a/nmdc_runtime/api/endpoints/nmdcschema.py b/nmdc_runtime/api/endpoints/nmdcschema.py index 6726324e..5abf5f8f 100644 --- a/nmdc_runtime/api/endpoints/nmdcschema.py +++ b/nmdc_runtime/api/endpoints/nmdcschema.py @@ -25,12 +25,7 @@ FUSEKI_PASSWD, ) from nmdc_runtime.api.models.metadata import 
Doc -from nmdc_runtime.api.models.util import ( - ListRequest, - ListResponse, - AssociationsRequest, - AssociationDirectionEnum, -) +from nmdc_runtime.api.models.util import ListRequest, ListResponse, AssociationsRequest router = APIRouter() @@ -110,14 +105,9 @@ def get_nmdc_schema_associations( mdb: MongoDatabase = Depends(get_mongo_db), ): """ - For a given focus node of type nmdc:`start_type` that is found via `start_query`, + For a given focus node of type nmdc:`start_type` with id `start_id`, find target nodes of type nmdc:`target_type`. - The `downstream` direction flows from studies to data objects, whereas `upstream` is the reverse, - traversing along the direction of dependency. - - `start_query` uses [MongoDB-like language querying](https://www.mongodb.com/docs/manual/tutorial/query-documents/). - You should not use the Swagger UI for values of `limit` much larger than `1000`. Set `limit` to `0` (zero) for no limit. """ @@ -138,37 +128,33 @@ def get_nmdc_schema_associations( detail=f'target_type "{req.target_type}" is not a known nmdc-schema class', ) - filter_ = json_util.loads(check_filter(req.start_query)) - if mdb[start_type_collection_name].count_documents(filter_) > 1: + start_node = mdb[start_type_collection_name].find_one({"id": req.start_id}) + if start_node is None: raise HTTPException( status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, - detail=f'start_query "{req.start_query}" yields more than one entity.', + detail=f'start_id "{req.start_id}" not found.', ) - focus_node_ids = ( - [d["id"] for d in mdb[start_type_collection_name].find(filter_, ["id"])] - if filter_ - else None - ) - values_stmt = ( - f"VALUES ?focus_node {{ {' '.join(focus_node_ids)} }}" if focus_node_ids else "" - ) - start_pattern = f"?focus_node nmdc:type nmdc:{req.start_type} ." + values_stmt = f"VALUES ?start_node {{ {start_node['id']} }}" + start_pattern = f"?start_node nmdc:type nmdc:{req.start_type} ." target_pattern = f"?o nmdc:type nmdc:{req.target_type} ." 
- downstream_pattern = "?o nmdc:depends_on+ ?focus_node ." - upstream_pattern = "?focus_node nmdc:depends_on+ ?o ." - upstream_where = ( - f"""{values_stmt} {start_pattern} {target_pattern} {upstream_pattern}""" - ) - downstream_where = ( - f"""{values_stmt} {start_pattern} {target_pattern} {downstream_pattern}""" - ) + objects_that_can_reach_start_node_pattern = "?o nmdc:depends_on+ ?start_node ." + objects_reachable_from_start_node_pattern = "?start_node nmdc:depends_on+ ?o ." + where_clause = f""" + {values_stmt} {start_pattern} {target_pattern} + {{ + {{ {objects_that_can_reach_start_node_pattern} }} + UNION + {{ {objects_reachable_from_start_node_pattern} }} + }} + """ limit = f"LIMIT {req.limit}" if req.limit != 0 else "" query = f""" PREFIX nmdc: SELECT DISTINCT ?o WHERE {{ - {downstream_where if req.direction == AssociationDirectionEnum.downstream else upstream_where} + {where_clause} }} {limit}""" + print(query) sparql = SPARQLWrapper(f"{FUSEKI_HOST}/nmdc") sparql.user = FUSEKI_USER diff --git a/nmdc_runtime/api/models/util.py b/nmdc_runtime/api/models/util.py index 6b366b20..1952764b 100644 --- a/nmdc_runtime/api/models/util.py +++ b/nmdc_runtime/api/models/util.py @@ -47,9 +47,8 @@ class AssociationDirectionEnum(str, Enum): class AssociationsRequest(BaseModel): start_type: str - start_query: str + start_id: str target_type: str - direction: AssociationDirectionEnum = AssociationDirectionEnum.downstream limit: NonNegativeInt = 5 From f3cc66de5d9fcc3ead4fb7e808d301edce551d13 Mon Sep 17 00:00:00 2001 From: Donny Winston Date: Fri, 19 Apr 2024 16:14:25 -0400 Subject: [PATCH 18/18] feat: return docs, not just ids include also filter and projection request params for target nodes --- nmdc_runtime/api/endpoints/nmdcschema.py | 23 ++++++++++++++++++++++- nmdc_runtime/api/models/util.py | 2 ++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/nmdc_runtime/api/endpoints/nmdcschema.py b/nmdc_runtime/api/endpoints/nmdcschema.py index 5abf5f8f..a4e81201 
100644 --- a/nmdc_runtime/api/endpoints/nmdcschema.py +++ b/nmdc_runtime/api/endpoints/nmdcschema.py @@ -23,6 +23,7 @@ FUSEKI_HOST, FUSEKI_USER, FUSEKI_PASSWD, + comma_separated_values, ) from nmdc_runtime.api.models.metadata import Doc from nmdc_runtime.api.models.util import ListRequest, ListResponse, AssociationsRequest @@ -163,7 +164,7 @@ def get_nmdc_schema_associations( sparql.setQuery(query) try: ret = sparql.queryAndConvert() - return [ + target_ids = [ b["o"]["value"].replace("https://w3id.org/nmdc/", "nmdc:") for b in ret["results"]["bindings"] ] @@ -172,6 +173,26 @@ def get_nmdc_schema_associations( status_code=status.HTTP_502_BAD_GATEWAY, detail=str(e), ) + target_filter = ( + json_util.loads(check_filter(req.target_filter)) if req.target_filter else {} + ) + if target_ids: + target_filter["id"] = {"$in": target_ids} + print(target_filter) + print(target_type_collection_name) + target_docs = list_resources( + ListRequest( + filter=json_util.dumps(target_filter), + projection=req.target_projection, + max_page_size=0, + ), + mdb, + target_type_collection_name, + ) + print(target_docs) + return [strip_oid(d) for d in target_docs["resources"]] + else: + return [] @router.get( diff --git a/nmdc_runtime/api/models/util.py b/nmdc_runtime/api/models/util.py index 1952764b..668a0117 100644 --- a/nmdc_runtime/api/models/util.py +++ b/nmdc_runtime/api/models/util.py @@ -49,6 +49,8 @@ class AssociationsRequest(BaseModel): start_type: str start_id: str target_type: str + target_filter: Optional[str] = None + target_projection: Optional[str] = None limit: NonNegativeInt = 5