
Commit
Merge pull request #2 from tang-tf/stanza
Merging new Stanza and CoreNLP implementation by @tang-tf; sunsetting support for Python 3.6.
mirandachong authored May 4, 2022
2 parents 8b7939d + 4effa7d commit d9a0f8a
Showing 11 changed files with 407 additions and 64 deletions.
12 changes: 5 additions & 7 deletions .github/workflows/unittests.yml
@@ -15,7 +15,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: [3.9, 3.8, 3.7, 3.6]
python-version: [3.9, 3.8, 3.7]

steps:
- uses: actions/checkout@v2
@@ -26,7 +26,7 @@ jobs:

- name: Install apt dependencies
run: |
sudo apt-get update && sudo apt-get install -y default-jre
sudo apt-get update && sudo apt-get install -y openjdk-8-jre
- name: Cache restore pip
id: cache-pip
@@ -51,7 +51,7 @@ jobs:

- name: Download models and NLTK data
run: |
python3 -c "import nltk; nltk.download('punkt')"
python -c "import nltk; nltk.download('punkt')"
- name: Install package
run: |
@@ -60,12 +60,10 @@
- name: Run tests
run: |
python3 tests/run.py
python tests/run.py
- name: Coveralls
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
COVERALLS_SERVICE_NAME: github-actions
COVERALLS_REPO_TOKEN: ${{ secrets.COVERALLS_REPO_TOKEN }}
run: |
if python3 --version | grep -q "Python 3.9." ; then coveralls ; fi
if python --version | grep -q "Python 3.9." ; then coveralls --service=github ; fi
26 changes: 24 additions & 2 deletions README.rst
@@ -16,9 +16,31 @@ scrubadub_stanford
``scrubadub`` removes personally identifiable information from text.
``scrubadub_stanford`` is an extension that uses Stanford's NER model to remove personal information from text.

This package contains one extra detectors:
This package contains three flavours of interfacing with Stanford's NER models that can be used as a detector:

* ``scrubadub_stanford.detectors.StanfordEntityDetector`` - A detector that uses the Stanford NER model to find locations, names and organizations.
* ``scrubadub_stanford.detectors.StanfordEntityDetector`` - A detector that uses the Stanford NER model to find locations, names and organizations. Download size circa 250MB.
* ``scrubadub_stanford.detectors.CoreNlpEntityDetector`` - The same interface as the ``StanfordEntityDetector``, but using Stanza's ``CoreNLPClient`` to interface with the CoreNLP Java Server. Download size circa 510MB.
* ``scrubadub_stanford.detectors.StanzaEntityDetector`` - Similar to the above but using Stanza's native Python pipelines. Download size circa 210MB. No Java required. This is the recommended detector for speed and footprint.

Prerequisites
-------------

A minimum version of Java Runtime Environment 8 is required for ``StanfordEntityDetector`` and ``CoreNlpEntityDetector``.
Check which version by running::

$ java -version

It should be at least version 1.8, but if not, please run the following commands:

Linux::

$ sudo apt update
$ sudo apt install openjdk-8-jre

MacOS::

$ brew tap adoptopenjdk/openjdk
$ brew install adoptopenjdk8-jre

For more information on how to use this package see the
`scrubadub stanford documentation <https://scrubadub.readthedocs.io/en/develop/names.html#stanford>`_
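Below is a rough usage sketch of the recommended ``StanzaEntityDetector`` described in the README above. It is not part of this diff, and the keyword arguments are assumed to mirror the ``CoreNlpEntityDetector`` initialiser added later in this pull request::

    from scrubadub import Scrubber
    from scrubadub_stanford.detectors import StanzaEntityDetector

    # enable_person / enable_organization / enable_location are assumed to match
    # the CoreNlpEntityDetector initialiser added in this pull request.
    detector = StanzaEntityDetector(enable_person=True, enable_organization=True, enable_location=False)
    scrubber = Scrubber(detector_list=[detector])
    print(scrubber.clean('Jane has an appointment at the National Hospital of Neurology and Neurosurgery today.'))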
5 changes: 3 additions & 2 deletions requirements/python
@@ -1,2 +1,3 @@
nltk
scrubadub >= 2.0.0rc0
nltk >= 3.3
stanza
scrubadub >= 2.0.0rc0
2 changes: 1 addition & 1 deletion scrubadub_stanford/__init__.py
@@ -1,6 +1,6 @@
from . import detectors

__version__ = VERSION = "2.0.0"
__version__ = VERSION = "2.1.0"
__all__ = [
'detectors',
]
2 changes: 2 additions & 0 deletions scrubadub_stanford/detectors/__init__.py
@@ -1 +1,3 @@
from .stanford import StanfordEntityDetector
from .core_nlp import CoreNlpEntityDetector
from .stanza import StanzaEntityDetector
149 changes: 149 additions & 0 deletions scrubadub_stanford/detectors/core_nlp.py
@@ -0,0 +1,149 @@
"""
This is a module that provides the same interface as StanfordEntityDetector,
however it uses Stanford's own Stanza API to interface with the CoreNLP Java Server
instead of using NLTK's interface.
See https://stanfordnlp.github.io/CoreNLP/ for more details.
This detector requires Java Runtime Environment 8+ and the Stanford CoreNLP installation is about 510MB.
The default installation location is ``~/stanza_corenlp``, but a different location can be specified using
environment variable ``CORENLP_HOME``.
"""
import os
from pathlib import Path
from typing import List, Dict, Type, Optional

from stanza import install_corenlp
from stanza.server import CoreNLPClient

from scrubadub.detectors.catalogue import register_detector
from scrubadub.detectors.base import Detector
from scrubadub.filth.base import Filth
from scrubadub.filth.name import NameFilth
from scrubadub.filth.organization import OrganizationFilth
from scrubadub.filth.location import LocationFilth

from .utils import tag_helper

# Default installation directory for CoreNLP download (500MB)
HOME_DIR = str(Path.home())
DEFAULT_CORENLP_DIR = os.getenv(
'CORENLP_HOME',
os.path.join(HOME_DIR, 'stanza_corenlp')
)


class CoreNlpEntityDetector(Detector):
"""Search for people's names, organization's names and locations within text using the stanford 3 class model.
The three classes of this model can be enabled with the three arguments to the initialiser ``enable_person``,
``enable_organization`` and ``enable_location``.
An example of their usage is given below.
>>> from scrubadub import Scrubber
>>> from scrubadub_stanford.detectors import CoreNlpEntityDetector
>>>
>>> detector = CoreNlpEntityDetector(
... enable_person=True, enable_organization=True, enable_location=False
... )
>>> scrubber = Scrubber(detector_list=[detector])
>>> scrubber.clean('Jane has an appointment at the National Hospital of Neurology and Neurosurgery today.')
'{{NAME}} has an appointment at the {{ORGANIZATION}} and Neurosurgery today.'
"""
filth_cls = Filth
name = "corenlp"

def __init__(self, enable_person: bool = True, enable_organization: bool = True, enable_location: bool = False,
ignored_words: List[str] = None,
**kwargs):
"""Initialise the ``Detector``.
:param enable_person: To tag entities that are recognised as person, defaults to ``True``.
:type enable_person: bool
:param enable_organization: To tag entities that are recognised as organisations, defaults to ``True``.
:type enable_organization: bool
:param enable_location: To tag entities that are recognised as locations, defaults to ``False``.
:type enable_location: bool
:param ignored_words: A list of words that will be ignored by the NER tagging. Defaults to `['tennant']`.
:type ignored_words: List[str]
:param name: Overrides the default name of the :class:``Detector``
:type name: str, optional
:param locale: The locale of the documents in the format: 2 letter lower-case language code followed by an
underscore and the two letter upper-case country code, eg "en_GB" or "de_CH".
:type locale: str, optional
"""
self.filth_lookup = {} # type: Dict[str, Type[Filth]]
if enable_person:
self.filth_lookup['PERSON'] = NameFilth
if enable_organization:
self.filth_lookup['ORGANIZATION'] = OrganizationFilth
if enable_location:
self.filth_lookup['LOCATION'] = LocationFilth
self.ignored_words = ['tennant'] if ignored_words is None else ignored_words

super(CoreNlpEntityDetector, self).__init__(**kwargs)

@staticmethod
def _check_downloaded(directory: str = DEFAULT_CORENLP_DIR) -> bool:
"""Check for a downloaded Stanford CoreNLP.
:param directory: The directory where CoreNLP will be unzipped and installed to, default is ``stanza_corenlp``
in the home directory, else specified by the environment variable ``CORENLP_HOME``.
:type directory: str
:return: ``True`` if the directory exists and is not empty.
:rtype: bool
"""
directory = os.path.expanduser(directory)
if os.path.exists(directory) and len(os.listdir(directory)) > 0:
return True
return False

@staticmethod
def _download(directory: str = DEFAULT_CORENLP_DIR) -> None:
"""Download and install CoreNLP to the specified directory.
:param directory: The directory where CoreNLP will be unzipped and installed to, default is ``stanza_corenlp``
in the home directory, else specified by the environment variable ``CORENLP_HOME``.
:type directory: str
"""
install_corenlp(directory)

def iter_filth(self, text, document_name: Optional[str] = None):
"""Yields discovered filth in the provided ``text``.
:param text: The dirty text to clean.
:type text: str
:param document_name: The name of the document to clean.
:type document_name: Optional[str]
:return: An iterator to the discovered :class:`Filth`
:rtype: Iterator[:class:`Filth`]
"""

if not self._check_downloaded():
self._download()

with CoreNLPClient(be_quiet=True) as client:
annotation = client.annotate(text)

# List of tuples of token/NER tag for each token of each annotated sentence:
tags = [(token.value, token.ner) for sentence in annotation.sentence for token in sentence.token]
# Loop over all tagged words and join contiguous words tagged as people
return tag_helper(text=text, tags=tags, filth_lookup=self.filth_lookup, ignored_words=self.ignored_words,
name=self.name, locale=self.locale, document_name=document_name)

@classmethod
def supported_locale(cls, locale: str) -> bool:
"""Returns true if this ``Detector`` supports the given locale.
:param locale: The locale of the documents in the format: 2 letter lower-case language code followed by an
underscore and the two letter upper-case country code, eg "en_GB" or "de_CH".
:type locale: str
:return: ``True`` if the locale is supported, otherwise ``False``
:rtype: bool
"""
language, region = cls.locale_split(locale)
return language in ['en']


register_detector(CoreNlpEntityDetector)

__all__ = ["CoreNlpEntityDetector"]
61 changes: 10 additions & 51 deletions scrubadub_stanford/detectors/stanford.py
@@ -3,10 +3,9 @@
See https://nlp.stanford.edu/software/CRF-NER.html for more details on the Stanford CRF NER Tagger
This detector requires java and the python package `nltk`.
The Stanford CRF NER Tagger will be downloaded to `~/.scrubadub/stanford_ner` and takes around 250MB.
This detector requires Java 8 Runtime Environment and above, and the Python package ``NLTK``.
The Stanford CRF NER Tagger will be downloaded to ``~/.scrubadub/stanford_ner`` and takes around 250MB.
"""
import re
import os
import pathlib
import zipfile
@@ -19,7 +18,7 @@
'Please run: pip install scrubadub[stanford]'
)

from typing import Dict, Type, Optional, List
from typing import Dict, Type, Optional

from scrubadub.detectors.catalogue import register_detector
from scrubadub.detectors.base import Detector
@@ -28,6 +27,8 @@
from scrubadub.filth.organization import OrganizationFilth
from scrubadub.filth.location import LocationFilth

from .utils import tag_helper


class ScrubadubStanfordNERTagger(nltk.tag.StanfordNERTagger):
"""Utility class to control options that the StanfordNERTagger is run with"""
@@ -53,8 +54,8 @@ def _cmd(self):
class StanfordEntityDetector(Detector):
"""Search for people's names, organization's names and locations within text using the stanford 3 class model.
The three classes of this model can be enabled with the three arguments to the inialiser `enable_person`,
`enable_organization` and `enable_location`.
The three classes of this model can be enabled with the three arguments to the initialiser ``enable_person``,
``enable_organization`` and ``enable_location``.
An example of their usage is given below.
>>> import scrubadub, scrubadub_stanford
@@ -125,7 +126,7 @@ def _check_downloaded(self):
return True

def _download(self):
"""Download and extract the eneeded files from the Stanford NER tagger"""
"""Download and extract the needed files from the Stanford NER tagger"""
# Make the data directory
pathlib.Path(self.stanford_prefix).mkdir(parents=True, exist_ok=True)

@@ -173,50 +174,8 @@ def iter_filth(self, text, document_name: Optional[str] = None):

tokens = nltk.tokenize.word_tokenize(text)
tags = self.stanford_tagger.tag(tokens)

grouped_tags = {} # type: Dict[str, List[str]]
previous_tag = None

# Loop over all tagged words and join contiguous words tagged as people
for tag_text, tag_type in tags:
if tag_type in self.filth_lookup.keys() and not any(
[tag_text.lower().strip() == ignored.lower().strip() for ignored in self.ignored_words]):
if previous_tag == tag_type:
grouped_tags[tag_type][-1] = grouped_tags[tag_type][-1] + ' ' + tag_text
else:
grouped_tags[tag_type] = grouped_tags.get(tag_type, []) + [tag_text]

previous_tag = tag_type
else:
previous_tag = None

# for each set of tags, de-dupe and convert to regex
for tag_type, tag_list in grouped_tags.items():
grouped_tags[tag_type] = [
r'\b' + re.escape(person).replace(r'\ ', r'\s+') + r'\b'
for person in set(tag_list)
]

# Now look for these in the original document
for tag_type, tag_list in grouped_tags.items():
for tag_regex in tag_list:
try:
pattern = re.compile(tag_regex, re.MULTILINE | re.UNICODE)
except re.error:
print(tag_regex)
raise
found_strings = re.finditer(pattern, text)

# Iterate over each found string matching this regex and yield some filth
for instance in found_strings:
yield self.filth_lookup[tag_type](
beg=instance.start(),
end=instance.end(),
text=instance.group(),
detector_name=self.name,
document_name=document_name,
locale=self.locale,
)
return tag_helper(text=text, tags=tags, filth_lookup=self.filth_lookup, ignored_words=self.ignored_words,
name=self.name, locale=self.locale, document_name=document_name)

@classmethod
def supported_locale(cls, locale: str) -> bool:
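The grouping-and-matching logic removed above has been factored out into the shared ``tag_helper`` imported from ``scrubadub_stanford/detectors/utils.py`` (not shown in this view). Below is a minimal sketch of that helper, reconstructed from the removed lines and the call signature used by both detectors; it is an approximation, not the committed implementation::

    import re
    from typing import Dict, List, Optional, Tuple, Type

    from scrubadub.filth.base import Filth


    def tag_helper(text: str, tags: List[Tuple[str, str]], filth_lookup: Dict[str, Type[Filth]],
                   ignored_words: List[str], name: str, locale: str,
                   document_name: Optional[str] = None):
        """Group contiguous tagged tokens and yield a Filth for each match found in ``text``."""
        grouped_tags = {}  # type: Dict[str, List[str]]
        previous_tag = None

        # Join contiguous tokens that share a relevant tag type, skipping ignored words.
        for tag_text, tag_type in tags:
            if tag_type in filth_lookup and not any(
                    tag_text.lower().strip() == ignored.lower().strip() for ignored in ignored_words):
                if previous_tag == tag_type:
                    grouped_tags[tag_type][-1] = grouped_tags[tag_type][-1] + ' ' + tag_text
                else:
                    grouped_tags[tag_type] = grouped_tags.get(tag_type, []) + [tag_text]
                previous_tag = tag_type
            else:
                previous_tag = None

        # De-duplicate each group and convert the entries to whole-word regexes.
        for tag_type, tag_list in grouped_tags.items():
            grouped_tags[tag_type] = [
                r'\b' + re.escape(match).replace(r'\ ', r'\s+') + r'\b'
                for match in set(tag_list)
            ]

        # Search the original document for each regex and yield the corresponding Filth.
        for tag_type, tag_list in grouped_tags.items():
            for tag_regex in tag_list:
                pattern = re.compile(tag_regex, re.MULTILINE | re.UNICODE)
                for instance in pattern.finditer(text):
                    yield filth_lookup[tag_type](
                        beg=instance.start(),
                        end=instance.end(),
                        text=instance.group(),
                        detector_name=name,
                        document_name=document_name,
                        locale=locale,
                    )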
(The remaining changed files are not shown here.)

0 comments on commit d9a0f8a
