diff --git a/Makefile b/Makefile index d5773a49..aaeffd06 100644 --- a/Makefile +++ b/Makefile @@ -72,7 +72,7 @@ install: .PHONY: install # --- -# Project Syncronization +# Project Synchronization # --- # # check we are up to date @@ -95,7 +95,7 @@ site: clean schema-clean src/nmdc_submission_schema/schema/nmdc_submission_schem gen-project gendoc project/json/nmdc_submission_schema.json %.yaml: gen-project -# make deploy has been depricated by an updated .github/workflows/deploy-docs.yaml +# make deploy has been deprecated by an updated .github/workflows/deploy-docs.yaml #deploy: all mkd-gh-deploy # generates all project files diff --git a/project.Makefile b/project.Makefile index f4dc8981..a6510310 100644 --- a/project.Makefile +++ b/project.Makefile @@ -25,11 +25,6 @@ schema-clean: modifications-clean sheets_and_friends-clean examples-clean post-c cp placeholder.md local mkdir -p examples/output -# todo: fewer enums -# todo: use booleans for yes/no enumerations -# todo: some numbers appear as strings in the schema (just examples? check for minimum value etc) -# todo maximum value for pH has to be an int? - sheets_and_friends-clean: rm -rf sheets_and_friends/yaml_out/with_shuttles.yaml @@ -54,7 +49,6 @@ local/with_shuttles_yq.yaml: local/with_shuttles.yaml # ControlledTermValue: what about multivalued CTVs? don't see any besides chem_administration above at this time # for water, can depth be a point, a range, or both? - # globally replace structured ranges with strings. 
# undoes some of the range alterations that nmdc-schema makes when importing MIxS terms # future versions of the nmdc-schema might just use strings, too @@ -141,14 +135,12 @@ local/with_shuttles_yq.yaml: local/with_shuttles.yaml yq -i 'del(.slots.[] | select(.name == "was_informed_by"))' $@ - modifications-clean: rm -rf sheets_and_friends/yaml_out/with_modifications.yaml local/nmdc.yaml: wget -O $@ https://raw.githubusercontent.com/microbiomedata/nmdc-schema/v11.0.1/nmdc_schema/nmdc_materialized_patterns.yaml -# sheets-for-nmdc-submission-schema_validation_converter_empty.tsv local/with_modifications.yaml: local/with_shuttles_yq.yaml \ sheets_and_friends/tsv_in/modifications_long.tsv \ sheets_and_friends/tsv_in/validation_converter.tsv \ @@ -196,9 +188,6 @@ src/nmdc_submission_schema/schema/nmdc_submission_schema.yaml: local/with_modifi yq -i '(.slots.[] | select(.range == "TextValue") | .range) = "string"' $@ yq -i '(.slots.[] | select(.range == "TimestampValue") | .range) = "string"' $@ -# yq -i '(.slots.[] | select(has("range") | not ) | .range ) = "string"' $@ -# yq -i '(.classes.[].slot_usage.[] | select(has("range") | not ) | .range ) = "string"' $@ - yq -i '(.slots.[] | select(.name == "sample_link") | .range ) = "string"' $@ yq -i '(.slots.[] | select(.range == "string") | .multivalued ) = false' $@ @@ -208,9 +197,6 @@ src/nmdc_submission_schema/schema/nmdc_submission_schema.yaml: local/with_modifi yq -i 'del(.classes.[].slot_usage.[] | select(.multivalued == "false").inlined)' $@ yq -i 'del(.classes.[].slot_usage.[] | select(.multivalued == "false").inlined_as_list)' $@ -# yq -i '(.slots.[] | select(.name == "dna_dnase") | .range) = "boolean"' $@ -# yq -i '(.classes.[].slot_usage.[] | select(.name == "dna_dnase") | .range) = "boolean"' $@ - yq -i '(.slots.[] | select(.name == "oxy_stat_samp") | .range) = "OxyStatSampEnum"' $@ yq -i '(.classes.[].slot_usage.[] | select(.name == "oxy_stat_samp") | .range) = "OxyStatSampEnum"' $@ @@ -230,8 +216,6 @@ 
examples-clean: examples/output/README.md: src/nmdc_submission_schema/schema/nmdc_submission_schema.yaml \ src/data/invalid src/data/valid mkdir -p $(dir $@) - # RDF/TTL generation is failing - # https://github.com/microbiomedata/submission-schema/issues/13 $(RUN) linkml-run-examples \ --output-formats json \ --output-formats yaml \ @@ -256,20 +240,45 @@ dh-build: project/json/nmdc_submission_schema.json ### -## todo frozen content in src/data/data_harmonizer_io has been removed -## todo find a better home for the se scripts if they are still of any use #src/data/data_harmonizer_io/soil_data.json: src/data/data_harmonizer_io/soil_for_linkml.json # $(RUN) linkml-json2dh \ # --input-file $< \ # --output-dir $(dir $@) -local/usage_template.tsv: src/nmdc_submission_schema/schema/nmdc_submission_schema.yaml - mkdir -p $(@D) - $(RUN) generate_and_populate_template \ - --base-class slot_definition \ - --columns-to-insert class \ - --columns-to-insert slot \ - --destination-template $@ \ - --meta-model-excel-file local/meta.xlsx \ - --meta-path https://raw.githubusercontent.com/linkml/linkml-model/main/linkml_model/model/schema/meta.yaml \ - --source-schema-path $< +## depends on an old version of schemasheets? 
+## could be replaced with https://github.com/linkml/schemasheets/blob/bdde85d74637ae116fb5fd64a2e47999a1aebdfb/pyproject.toml#L36C1-L36C20 +#local/usage_template.tsv: src/nmdc_submission_schema/schema/nmdc_submission_schema.yaml +# mkdir -p $(@D) +# $(RUN) generate_and_populate_template \ +# --base-class slot_definition \ +# --columns-to-insert class \ +# --columns-to-insert slot \ +# --destination-template $@ \ +# --meta-model-excel-file local/meta.xlsx \ +# --meta-path https://raw.githubusercontent.com/linkml/linkml-model/main/linkml_model/model/schema/meta.yaml \ +# --source-schema-path $< + +local/soil-env-broad-scale-oak-only.txt: + $(RUN) runoak --input sqlite:obo:envo descendants --predicates i "terrestrial biome" > $@ + +local/soil-env-medium-oak-only.txt: + $(RUN) runoak --input sqlite:obo:envo info .desc//p=i "soil" .not .desc//p=i "enriched soil" > $@ # also subtract soils whose differential could appear in the env_local_scale slot # see https://incatools.github.io/ontology-access-kit/howtos/use-oak-expression-language.html + +local/soil-env-medium-oak-only-relations.tsv: + $(RUN) runoak --input sqlite:obo:envo relationships .desc//p=i "soil" .not .desc//p=i "enriched soil" > $@ + +local/soil-env-medium-oak-only-relations-no-sco.tsv: local/soil-env-medium-oak-only-relations.tsv + awk -F'\t' '$$3 != "rdfs:subClassOf"' $< > $@ + +local/soil-env-medium-oak-only-relations-no-sco-with-dist.tsv: local/soil-env-medium-oak-only-relations-no-sco.tsv + $(RUN) python tsv_text_diff.py \ + --input-file $< \ + --output-file $@ \ + --left-column "subject_label" \ + --right-column "object_label" + +local/soil-env-medium-oak-only-relation-label-counts.txt: local/soil-env-medium-oak-only-relations.tsv + awk -F'\t' '{print $$4}' $< | sort | uniq -c | sort -nr > $@ + +local/soil-env-medium-oak-only-relation-id-counts.txt: local/soil-env-medium-oak-only-relations.tsv + awk -F'\t' '{print $$3}' $< | sort | uniq -c | sort -nr 
> $@ \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index e80790e6..d4316eaf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,13 +20,14 @@ include = [ [tool.poetry.dependencies] python = "^3.9" linkml-runtime = "^1.6.2" +textdistance = "^4.6.3" [tool.poetry.group.dev.dependencies] exhaustion-check = "^0.1.3" linkml = "^1.7.10" mkdocs-material = "^9.1.2" mkdocs-mermaid2-plugin = "^0.6.0" -oaklib = "^0.5.28" # https://pypi.org/project/oaklib/0.5.6/ +oaklib = "^0.5.28" rdflib = "^6.2.0" sheets-and-friends = "^0.5.4" @@ -53,12 +54,6 @@ patterns = [ "(^version:\\s*['\\\"]?)[^'\\\"]*?(['\\\"]?)$" ] - -## /Users/MAM/Documents/gitrepos/sheets_and_friends -## /home/mark/gitrepos/sheets_and_friends -#sheets-and-friends = { path = "/home/mark/gitrepos/sheets_and_friends", develop = true } - - [build-system] requires = ["poetry-core>=1.0.0", "poetry-dynamic-versioning"] build-backend = "poetry_dynamic_versioning.backend"