
finalise document attributes
adamkells committed Nov 3, 2024
1 parent d28ee16 commit d30c9be
Showing 7 changed files with 110 additions and 106 deletions.
4 changes: 2 additions & 2 deletions docs/reference/pipeline/integrations.md
@@ -73,13 +73,13 @@ processed_doc = pipeline(doc)

# Access spaCy annotations
spacy_doc = processed_doc.get_spacy_doc()
for token in spacy_doc:
print(f"Token: {token.text}, POS: {token.pos_}, Lemma: {token.lemma_}")
```

## HuggingFaceComponent

The `HuggingFaceComponent` integrates Hugging Face Transformers models into your HealthChain pipeline.
The `HuggingFaceComponent` integrates HuggingFace Transformers models into your HealthChain pipeline. Models can be browsed on the [HuggingFace website](https://huggingface.co/models). HuggingFace offers models for a wide range of tasks, and while not all of these have been thoroughly tested for HealthChain compatibility, we expect all NLP models and tasks to be compatible. If you have any issues integrating a model, please raise an issue on our [GitHub repository](https://github.com/dotimplement/HealthChain)!

```python
from healthchain.pipeline.components.integrations import HuggingFaceComponent
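A minimal usage sketch of the component described above (the `task` and `model` keyword names are assumptions rather than the confirmed signature; applying the component as a callable matches the test pattern in `tests/test_pipeline_integrations.py` below):

```python
from healthchain.io.containers import Document
from healthchain.pipeline.components.integrations import HuggingFaceComponent

# Assumed constructor parameters -- check the component docstring for the
# exact names before relying on this sketch.
component = HuggingFaceComponent(
    task="sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)

doc = Document(data="Patient reports feeling much better after treatment.")
doc = component(doc)

# Outputs are stored per task and retrieved by task name, per
# add_huggingface_output/get_huggingface_output in containers.py
print(doc.get_huggingface_output("sentiment-analysis"))
```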
155 changes: 77 additions & 78 deletions healthchain/io/containers.py
@@ -62,124 +62,121 @@ def from_json(cls, json_str: str) -> "DataContainer":
@dataclass
class Document(DataContainer[str]):
"""
A container for document data, optionally wrapping a spaCy Doc object.
A container for document data with support for various NLP processing outputs.
This class extends DataContainer to specifically handle textual document data.
It provides functionality to work with raw text, tokenized text, spaCy Doc objects,
and structured clinical data.
It provides functionality to work with raw text, tokenized text, and outputs from
various NLP libraries like spaCy, Hugging Face, and LangChain.
Attributes:
data (str): The raw text content of the document.
preprocessed_text (str): The preprocessed version of the text.
tokens (List[str]): A list of individual tokens extracted from the text.
pos_tags (List[str]): A list of part-of-speech tags corresponding to the tokens.
entities (List[str]): A list of named entities identified in the text.
ccd_data (Optional[CcdData]): An optional CcdData object containing structured clinical data.
fhir_resources (Optional[CdsFhirData]): Optional FHIR resources data.
cds_cards (Optional[List[Card]]): Optional list of CDS cards.
cds_actions (Optional[List[Action]]): Optional list of CDS actions.
text (str): The current text content, which may be updated when setting a spaCy Doc.
_doc (SpacyDoc): An internal reference to the spaCy Doc object, if set.
Methods:
__post_init__(): Initializes the text attribute and _doc reference.
_update_attributes(): Updates tokens, pos_tags, and entities from the spaCy Doc.
doc (property): Returns the spaCy Doc object if set, or raises an error.
set_spacy_doc(doc: SpacyDoc): Sets the spaCy Doc and updates related attributes.
add_spacy_doc(doc: SpacyDoc): Sets the spaCy Doc and updates related attributes.
add_huggingface_output(task: str, output: Any): Adds output from a Hugging Face model.
add_langchain_output(task: str, output: Any): Adds output from a LangChain process.
get_tokens() -> List[str]: Returns the document's tokens.
get_entities() -> List[Dict[str, Any]]: Returns the named entities with their details.
get_embeddings() -> Optional[List[float]]: Returns the document embeddings.
set_embeddings(embeddings: List[float]): Sets the document embeddings.
get_spacy_doc() -> Optional[SpacyDoc]: Returns the spaCy Doc if available.
get_huggingface_output(task: str) -> Any: Retrieves output for a specific Hugging Face task.
get_langchain_output(task: str) -> Any: Retrieves output for a specific LangChain task.
word_count() -> int: Returns the number of tokens in the document.
char_count() -> int: Returns the number of characters in the text.
get_entities() -> List[Dict[str, Any]]: Returns a list of entities with their details.
update_ccd(new_problems: List[ProblemConcept], new_medications: List[MedicationConcept], new_allergies: List[AllergyConcept], overwrite: bool): Updates the existing CcdData object.
char_count() -> int: Returns the number of characters across all tokens.
update_ccd(...): Updates the CCD data with new clinical information.
__iter__() -> Iterator[str]: Allows iteration over the document's tokens.
__len__() -> int: Returns the word count of the document.
Raises:
ValueError: When attempting to access the spaCy Doc before it's set.
Note:
The spaCy Doc object needs to be set using a preprocessor before accessing
certain attributes and methods that depend on it.
Notes:
- The class supports multiple NLP processing pipelines and maintains their outputs separately.
- Basic tokenization is performed if tokens are not provided during initialization.
- Clinical data can be updated incrementally or overwritten completely.
"""

preprocessed_text: str = field(default="")
tokens: List[str] = field(default_factory=list)
pos_tags: List[str] = field(default_factory=list)
entities: List[str] = field(default_factory=list)
ccd_data: Optional[CcdData] = field(default=None)
fhir_resources: Optional[CdsFhirData] = field(default=None)
cds_cards: Optional[List[Card]] = field(default=None)
cds_actions: Optional[List[Action]] = field(default=None)
entities: List[str] = field(default_factory=list)

# Third-party specific attributes
spacy_doc: Optional[SpacyDoc] = None
huggingface_results: Dict[str, Any] = field(default_factory=dict)
langchain_results: Dict[str, Any] = field(default_factory=dict)
# Internal attributes
_tokens: List[str] = field(default_factory=list)
_pos_tags: List[str] = field(default_factory=list)
_entities: List[Dict[str, Any]] = field(default_factory=list)
_embeddings: Optional[List[float]] = field(default=None)
_spacy_doc: Optional[SpacyDoc] = field(default=None)
_huggingface_results: Dict[str, Any] = field(default_factory=dict)
_langchain_results: Dict[str, Any] = field(default_factory=dict)

def __post_init__(self):
"""Initialize the document with basic tokenization if needed."""
self.text = self.data
if not self.tokens:
self.tokens = self.text.split() # Basic tokenization if not provided
if not self._tokens:
self._tokens = self.text.split() # Basic tokenization if not provided

def add_spacy_doc(self, doc: SpacyDoc):
self.spacy_doc = doc
def add_spacy_doc(self, doc: SpacyDoc) -> None:
"""Add a spaCy Doc and update related attributes."""
self._spacy_doc = doc
self.text = doc.text
self.tokens = [token.text for token in doc]
self.entities = [
{
"text": ent.text,
"label": ent.label_,
"start": ent.start_char,
"end": ent.end_char,
}
for ent in doc.ents
]

def add_huggingface_output(self, task: str, output: Any):
self.huggingface_results[task] = output

def add_langchain_output(self, task: str, output: Any):
self.langchain_results[task] = output
self._update_from_spacy()

def _update_from_spacy(self) -> None:
"""Update internal attributes from spaCy Doc."""
if self._spacy_doc:
self._tokens = [token.text for token in self._spacy_doc]
self._pos_tags = [token.pos_ for token in self._spacy_doc]
self._entities = [
{
"text": ent.text,
"label": ent.label_,
"start": ent.start_char,
"end": ent.end_char,
}
for ent in self._spacy_doc.ents
]

def add_huggingface_output(self, task: str, output: Any) -> None:
self._huggingface_results[task] = output

def add_langchain_output(self, task: str, output: Any) -> None:
self._langchain_results[task] = output

def get_tokens(self) -> List[str]:
return self.tokens
return self._tokens

def set_entities(self, entities: List[Dict[str, Any]]) -> None:
self._entities = entities

def get_entities(self) -> List[Dict[str, Any]]:
return self.entities
return self._entities

def get_embeddings(self) -> Optional[List[float]]:
return self.embeddings
return self._embeddings

def set_embeddings(self, embeddings: List[float]):
self.embeddings = embeddings
def set_embeddings(self, embeddings: List[float]) -> None:
self._embeddings = embeddings

def get_spacy_doc(self) -> Optional[SpacyDoc]:
return self.spacy_doc
return self._spacy_doc

def get_huggingface_output(self, task: str) -> Any:
return self.huggingface_results.get(task)
return self._huggingface_results.get(task)

def get_langchain_output(self, task: str) -> Any:
return self.langchain_results.get(task)

def _update_attributes(self):
self.tokens = [token.text for token in self._doc]
self.pos_tags = [token.pos_ for token in self._doc]
self.entities = [ent.text for ent in self._doc.ents]

@property
def doc(self) -> SpacyDoc:
if self._doc is None:
raise ValueError(
"spaCy Doc is not set. Use a preprocessor to set the spaCy Doc."
)
return self._doc
return self._langchain_results.get(task)

def word_count(self) -> int:
return len(self.tokens)
return len(self._tokens)

def char_count(self) -> int:
return len(self.text)
"""Get the total number of characters across all tokens."""
return sum(len(token) for token in self._tokens)
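# Note: char_count() now sums token lengths instead of calling len(self.text),
# so whitespace is no longer counted. For "This is a sample text for testing."
# this gives 4+2+1+6+4+3+8 = 28, whereas len(text) would give 34 (see the
# updated test in tests/pipeline/test_containers.py).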

def update_ccd(
self,
@@ -189,13 +186,13 @@ def update_ccd(
overwrite: bool = False,
) -> None:
"""
Updates the existing CcdData object with new data.
Update the CCD data with new clinical information.
Args:
new_problems (List[ProblemConcept]): List of new problem concepts to add or update.
new_medications (List[MedicationConcept]): List of new medication concepts to add or update.
new_allergies (List[AllergyConcept]): List of new allergy concepts to add or update.
overwrite (bool, optional): If True, replaces existing data; if False, appends new data. Defaults to False.
new_problems: List of new problem concepts to add or update.
new_medications: List of new medication concepts to add or update.
new_allergies: List of new allergy concepts to add or update.
overwrite: If True, replaces existing data; if False, appends new data.
Raises:
ValueError: If there is no existing CcdData object to update.
@@ -213,9 +210,11 @@
self.ccd_data.allergies.extend(new_allergies)

def __iter__(self) -> Iterator[str]:
return iter(self.tokens)
"""Allow iteration over the document's tokens."""
return iter(self._tokens)

def __len__(self) -> int:
"""Return the word count of the document."""
return self.word_count()


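To make the accessor-based API concrete, a short sketch of the refactored container in use (illustrative values only; every method shown appears in the diff above):

```python
from healthchain.io.containers import Document

doc = Document(data="Patient has a history of hypertension.")

# Basic whitespace tokenization happens in __post_init__ when no tokens are given
print(doc.get_tokens())  # ['Patient', 'has', 'a', 'history', 'of', 'hypertension.']
print(doc.word_count())  # 6
print(doc.char_count())  # 33 -- sum of token lengths, whitespace excluded

# Entities and embeddings now go through getters and setters rather than
# the private fields
doc.set_entities([{"text": "hypertension", "label": "CONDITION", "start": 25, "end": 37}])
print(doc.get_entities())

doc.set_embeddings([0.1, 0.2, 0.3])
print(doc.get_embeddings())

# Iteration and len() delegate to the token list
for token in doc:
    print(token)
```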
4 changes: 2 additions & 2 deletions healthchain/pipeline/components/postprocessors.py
@@ -44,7 +44,7 @@ def __call__(self, doc: Document) -> Document:
If the entity_lookup is empty or the document has no 'entities' attribute,
the document is returned unchanged.
"""
if not self.entity_lookup or not hasattr(doc, "entities"):
if not self.entity_lookup or not hasattr(doc, "_entities"):
return doc

refined_entities = []
@@ -54,6 +54,6 @@
entity["text"] = self.entity_lookup[entity_text]
refined_entities.append(entity)

doc.entities = refined_entities
doc.set_entities(refined_entities)

return doc
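A sketch of the lookup-based refinement in action (the `TextPostProcessor` name follows the test file below, and the constructor keyword is an assumption; the diff only shows how entities are refined, not how the lookup is supplied):

```python
from healthchain.io.containers import Document
from healthchain.pipeline.components.postprocessors import TextPostProcessor

# postcoordination_lookup is an assumed kwarg name for the entity_lookup
# used in __call__ above.
processor = TextPostProcessor(
    postcoordination_lookup={
        "high blood pressure": "hypertension",
        "heart attack": "myocardial infarction",
    }
)

doc = Document(data="")
doc.set_entities([{"text": "high blood pressure"}, {"text": "fever"}])
doc = processor(doc)

# Entities found in the lookup are replaced; the rest pass through unchanged
print([e["text"] for e in doc.get_entities()])  # ['hypertension', 'fever']
```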
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -26,7 +26,7 @@ include = ["healthchain/templates/*"]
python = ">=3.8,<3.12"
pydantic = "^2.7.1"
pandas = ">=1.0.0,<2.1.0"
spacy = "3.7.6"
spacy = "^3.7.6"
requests = "^2.31.0"
colorama = "^0.4.6"
faker = "^25.1.0"
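Note: the caret constraint follows Poetry semantics -- `^3.7.6` accepts any spaCy release from 3.7.6 up to (but not including) 4.0.0, where the previous exact pin allowed only 3.7.6.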
9 changes: 4 additions & 5 deletions tests/pipeline/test_containers.py
@@ -10,7 +10,7 @@ def sample_document():
def test_document_initialization(sample_document):
assert sample_document.data == "This is a sample text for testing."
assert sample_document.text == "This is a sample text for testing."
assert sample_document.tokens == [
assert sample_document.get_tokens() == [
"This",
"is",
"a",
@@ -19,17 +19,16 @@ def test_document_initialization(sample_document):
"for",
"testing.",
]
assert sample_document.entities == []
assert sample_document.embeddings is None
assert sample_document.get_entities() == []
assert sample_document.get_embeddings() is None


def test_document_word_count(sample_document):
assert sample_document.word_count() == 7


def test_document_char_count(sample_document):
with pytest.raises(AttributeError):
sample_document.char_count() # Should raise error as spacy_doc is not set
assert sample_document.char_count() == 28


def test_document_add_huggingface_output(sample_document):
40 changes: 23 additions & 17 deletions tests/pipeline/test_postprocessor.py
@@ -13,11 +13,13 @@ def test_text_postprocessor_initialization_and_processing():

# Test processing with empty lookup
doc = Document(data="")
doc.entities = [
{"text": "high blood pressure"},
{"text": "fever"},
{"text": "heart attack"},
]
doc.set_entities(
[
{"text": "high blood pressure"},
{"text": "fever"},
{"text": "heart attack"},
]
)
processed_doc = processor(doc)
assert [entity["text"] for entity in processed_doc.get_entities()] == [
"high blood pressure",
@@ -31,11 +33,13 @@ def test_text_postprocessor_with_entities(sample_lookup):

# Test with matching entities
doc = Document(data="")
doc.entities = [
{"text": "high blood pressure"},
{"text": "fever"},
{"text": "heart attack"},
]
doc.set_entities(
[
{"text": "high blood pressure"},
{"text": "fever"},
{"text": "heart attack"},
]
)
processed_doc = processor(doc)
assert [entity["text"] for entity in processed_doc.get_entities()] == [
"hypertension",
@@ -45,12 +49,14 @@

# Test with mixed entities
doc = Document(data="")
doc.entities = [
{"text": "high blood pressure"},
{"text": "cough"},
{"text": "heart attack"},
{"text": "fever"},
]
doc.set_entities(
[
{"text": "high blood pressure"},
{"text": "cough"},
{"text": "heart attack"},
{"text": "fever"},
]
)
processed_doc = processor(doc)
assert [entity["text"] for entity in processed_doc.get_entities()] == [
"hypertension",
@@ -70,7 +76,7 @@ def test_text_postprocessor_edge_cases(sample_lookup):

# Test with empty entities list
doc = Document(data="")
doc.entities = []
doc.set_entities([])
processed_doc = processor(doc)
assert processed_doc.get_entities() == []

2 changes: 1 addition & 1 deletion tests/test_pipeline_integrations.py
@@ -49,7 +49,7 @@ def test_spacy_component(sample_document):
mock_load.return_value = mock_instance
component = SpacyComponent("en_core_web_sm")
result = component(sample_document)
assert result.spacy_doc
assert result.get_spacy_doc()


@pytest.mark.skipif(
