Skip to content

Commit

Permalink
lab3
Browse files Browse the repository at this point in the history
  • Loading branch information
vemonet committed Feb 19, 2024
1 parent 420054d commit 2d3fdcd
Showing 1 changed file with 56 additions and 0 deletions.
56 changes: 56 additions & 0 deletions 2024/lab3/Lab3 - KG from unstructured data.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -1082,6 +1082,62 @@
"source": [
"## 3. 🕸️ Generate RDF"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from rdflib import Graph, URIRef, Literal, RDFS, Namespace, RDF, XSD, DCTERMS\n",
"\n",
"# An entity is annotated only when its first search hit scores strictly above this value\n",
"score_threshold = 0.8\n",
"\n",
"# Namespaces: one for the URIs minted in this cell, one for the TAO annotation vocabulary\n",
"BASE_NS = Namespace(\"https://w3id.org/myannotations/\")\n",
"TAO = Namespace(\"http://pubannotation.org/ontology/tao.owl#\")\n",
"\n",
"g = Graph(store=\"Oxigraph\")\n",
"g.bind(\"\", BASE_NS)\n",
"g.bind(\"tao\", TAO)\n",
"g.bind(\"doid\", Namespace(\"http://purl.obolibrary.org/obo/DOID_\"))\n",
"\n",
"# The whole annotated text is represented as a single document node\n",
"annotated_text_uri = BASE_NS[\"document0\"]\n",
"g.add((annotated_text_uri, RDF.type, TAO.text_document))\n",
"g.add((annotated_text_uri, RDFS.label, Literal(annotate_text)))\n",
"\n",
"# Maps sentence text -> sentence URI so entities in the same sentence share one node\n",
"sentence_uris = {}\n",
"for i, ent in enumerate(doc.ents):\n",
"    # NOTE(review): search_disease is defined in an earlier cell; this code only relies on\n",
"    # each hit exposing `.score` and a `.payload` with 'label' and 'uri' keys\n",
"    hits = search_disease(ent.text)\n",
"    # Only consider the first match\n",
"    if len(hits) > 0 and hits[0].score > score_threshold:\n",
"        top_hit = hits[0]\n",
"        print(f\"{ent.text} > {top_hit.payload['label']} ({top_hit.payload['uri']}) [{top_hit.score}]\")\n",
"        # Add the text annotation to the RDF graph\n",
"        # The URI uses the entity index, so skipped entities leave gaps in the numbering\n",
"        annot_uri = BASE_NS[f\"match{i}\"]\n",
"        g.add((annot_uri, RDF.type, TAO.text_span))\n",
"        g.add((annot_uri, TAO.has_value, Literal(ent.text)))\n",
"        g.add((annot_uri, TAO.begins_at, Literal(ent.start_char, datatype=XSD.integer)))\n",
"        g.add((annot_uri, TAO.ends_at, Literal(ent.end_char, datatype=XSD.integer)))\n",
"        g.add((annot_uri, TAO.parts_of, annotated_text_uri))\n",
"\n",
"        # Create or retrieve the URI for the sentence\n",
"        sentence_text = ent.sent.text\n",
"        if sentence_text not in sentence_uris:\n",
"            # Sentence URIs are numbered in order of first appearance among matched entities\n",
"            sentence_uri = BASE_NS[f\"sentence{len(sentence_uris)}\"]\n",
"            sentence_uris[sentence_text] = sentence_uri\n",
"            g.add((sentence_uri, RDF.type, TAO.text_span))\n",
"            g.add((sentence_uri, TAO.has_value, Literal(sentence_text)))\n",
"            g.add((sentence_uri, TAO.parts_of, annotated_text_uri))\n",
"        # We use a different predicate to link the annotation to the sentence\n",
"        g.add((annot_uri, DCTERMS.isPartOf, sentence_uris[sentence_text]))\n",
"\n",
"        # Add link to the matching concept\n",
"        concept_uri = URIRef(top_hit.payload['uri'])\n",
"        g.add((annot_uri, TAO.denotes, concept_uri))\n",
"        g.add((concept_uri, RDFS.label, Literal(top_hit.payload['label'])))\n",
"\n",
"# Write the graph to a Turtle file in the current working directory\n",
"g.serialize(\"diseases_annotations_kg.ttl\", format=\"ttl\")\n",
"# print(g.serialize(format=\"ttl\"))"
]
}
],
"metadata": {
Expand Down

0 comments on commit 2d3fdcd

Please sign in to comment.