11 tests (#18)

* remove: illustrative test
* tests: test strategies
* tests: unit test instantiation
* ci: enable codecov reporting
* tests: remove hypothesis testing
* tests: test creation module
* ci: reduce CI runners
* ci: temp suppress healthcheck (test running long)
* Revert "ci: reduce CI runners" This reverts commit 105c6c4.
* tests: dates module
* refactor: add missing variables
* tests: base test examples and strategies
* Revert "tests: base test examples and strategies" This reverts commit b054bc2.
* tests: base test examples and strategies
* tests: base.test_reader.py
* refactor: default to Gemma model
* tests: theyworkforyou strategies & examples
* refactor: add modules
* tests: main module tests
* refactor: call modules directly
* docs: minor docstring updates
* ci: update CodeCov version
* ci: editable mode install
datasciencecampus · Aug 21, 2024 · 6dd108a · 6dd108a
1 parent 5ce8690
commit 6dd108a
Show file tree

Hide file tree

Showing 19 changed files with 2,103 additions and 24 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -41,19 +41,20 @@ jobs:
         if: |
           matrix.python-version == 3.11
         run: |
-          python -m pip install ".[dev]"
+          python -m pip install -e ".[dev]"
           python -m ruff check .
           python -m ruff format --check .
 
-    #   - name: Generate Report
-    #     run: |
-    #         coverage run -m pytest
-    #         coverage xml
-
-    #   - name: Upload Coverage to Codecov
-    #     uses: codecov/codecov-action@v3
-    #     with:
-    #         file: ./coverage.xml
-    #         flags: unittests
-    #         verbose: true
-    #         token: ${{secrets.CODECOV_TOKEN}}
+      - name: Generate Report
+        run: |
+            coverage run -m pytest
+            coverage xml
+            coverage report
+
+      - name: Upload Coverage to Codecov
+        uses: codecov/codecov-action@v4
+        with:
+            file: ./coverage.xml
+            flags: unittests
+            verbose: true
+            token: ${{secrets.CODECOV_TOKEN}}
diff --git a/src/parliai_public/__init__.py b/src/parliai_public/__init__.py
@@ -1,7 +1,13 @@
 """Using LLMs to capture coverage of organisations, people or themes in UK political debate."""
 
+from . import dates
+from .readers import Debates, WrittenAnswers
+
 __version__ = "0.0.1"
 
 __all__ = [
     "__version__",
+    "Debates",
+    "WrittenAnswers",
+    "dates",
 ]
diff --git a/src/parliai_public/_config/base.toml b/src/parliai_public/_config/base.toml
@@ -1,4 +1,5 @@
 urls = []
 keywords = ["Office for National Statistics", "ONS"]
-
+prompt = ""
+llm_name = ""
 outdir = ""
diff --git a/src/parliai_public/readers/base.py b/src/parliai_public/readers/base.py
@@ -46,7 +46,7 @@ class BaseReader(metaclass=abc.ABCMeta):
         Key terms to filter content on. By default, we look for any
         mention of `Office for National Statistics` or `ONS`.
     dates : list[dt.date], optional
-        List of dates from which to pull entries. The `parliai.dates`
+        List of dates from which to pull entries. The `parliai_public.dates`
         module may be of help. If not specified, only yesterday is used.
     outdir : str, default="out"
         Location of a directory in which to write outputs.
@@ -342,6 +342,8 @@ def _read_contents(self, soup: BeautifulSoup) -> dict:
     def instantiate_llm(self) -> None:
         """Instantiate LLM object per user specification."""
 
+        # Temporary override to default to Gemma (known/tested LLM).
+        # NOTE(review): this assignment is unconditional, so it replaces
+        # whatever `llm_name` held beforehand; the "per user specification"
+        # wording in the docstring does not hold while it is in place.
+        self.llm_name = "gemma"
         self.llm = ChatOllama(model=self.llm_name, temperature=0)
 
         return None

diff --git a/src/parliai_public/readers/theyworkforyou.py b/src/parliai_public/readers/theyworkforyou.py
@@ -30,7 +30,7 @@ class Debates(BaseReader):
         Key terms to filter content on. By default, we look for any
         mention of `Office for National Statistics` or `ONS`.
     dates : list[dt.date], optional
-        List of dates from which to pull entries. The `parliai.dates`
+        List of dates from which to pull entries. The `parliai_public.dates`
         module may be of help. If not specified, only yesterday is used.
     outdir : str, default="out"
         Location of a directory in which to write outputs.
@@ -417,7 +417,7 @@ class WrittenAnswers(Debates):
         Key terms to filter content on. By default, we look for any
         mention of `Office for National Statistics` or `ONS`.
     dates : list[dt.date], optional
-        List of dates from which to pull entries. The `parliai.dates`
+        List of dates from which to pull entries. The `parliai_public.dates`
         module may be of help. If not specified, only yesterday is used.
     outdir : str, default="out"
         Location of a directory in which to write outputs.

diff --git a/tests/common.py b/tests/common.py
@@ -0,0 +1,162 @@
+"""
+Common strategies and utilities used across multiple test modules.
+
+Any real-world details or samples used as constants were correct when
+taken on 2024-03-06.
+"""
+
+import datetime as dt
+import string
+
+from dateutil import relativedelta as rd
+from hypothesis import strategies as st
+from langchain_community.chat_models import ChatOllama
+
+from parliai_public.readers.base import BaseReader
+
+
+class ToyReader(BaseReader):
+    """
+    A minimal concrete subclass of `BaseReader` for use in tests.
+
+    Every method below is a stub whose body is only a docstring, so each
+    one returns `None`. Presumably these are the abstract methods that
+    `BaseReader` requires before it can be instantiated - confirm
+    against the base class.
+    """
+
+    def retrieve_latest_entries(self):
+        """Do nothing; stub so the class can be instantiated."""
+
+    @staticmethod
+    def _read_metadata(url, soup):
+        """Do nothing; static stub so the class can be instantiated."""
+
+    @staticmethod
+    def _read_contents(soup):
+        """Do nothing; static stub so the class can be instantiated."""
+
+    def render(self, response, page):
+        """Do nothing; stub so the class can be instantiated."""
+
+    def _summary_template(self):
+        """Do nothing; stub so the class can be instantiated."""
+
+
+def where_what(reader):
+    """
+    Find the class under test for a reader and its dotted location.
+
+    `ToyReader` is swapped for `BaseReader`; any other reader class is
+    used exactly as given.
+
+    Parameters
+    ----------
+    reader : type
+        Reader class being tested.
+
+    Returns
+    -------
+    where : str
+        Module and class name of `what`, joined by a dot.
+    what : type
+        The class that `where` points at.
+    """
+
+    target = BaseReader if reader is ToyReader else reader
+    location = ".".join([target.__module__, target.__name__])
+
+    return location, target
+
+
+def default_llm() -> ChatOllama:
+    """
+    Build the LLM object used by default in the tests.
+
+    Returns
+    -------
+    ChatOllama
+        Chat model for `gemma` with its temperature set to zero.
+    """
+
+    return ChatOllama(model="gemma", temperature=0)
+
+
+# Sample of MPs. Each entry is a tuple of the MP's name, their
+# "party, constituency" string, and their TheyWorkForYou profile URL.
+MPS_SAMPLE = [
+    (
+        "Bob Seely",
+        "Conservative, Isle of Wight",
+        "https://www.theyworkforyou.com/mp/25645/bob_seely/isle_of_wight",
+    ),
+    (
+        "Mark Logan",
+        "Conservative, Bolton North East",
+        "https://www.theyworkforyou.com/mp/25886/mark_logan/bolton_north_east",
+    ),
+    (
+        "Nigel Huddleston",
+        "Conservative, Mid Worcestershire",
+        "https://www.theyworkforyou.com/mp/25381/nigel_huddleston/mid_worcestershire",
+    ),
+    (
+        "Heather Wheeler",
+        "Conservative, South Derbyshire",
+        "https://www.theyworkforyou.com/mp/24769/heather_wheeler/south_derbyshire",
+    ),
+    (
+        "Ian Paisley Jnr",
+        "DUP, North Antrim",
+        "https://www.theyworkforyou.com/mp/13852/ian_paisley_jnr/north_antrim",
+    ),
+    (
+        "Matthew Offord",
+        "Conservative, Hendon",
+        "https://www.theyworkforyou.com/mp/24955/matthew_offord/hendon",
+    ),
+    (
+        "John Howell",
+        "Conservative, Henley",
+        "https://www.theyworkforyou.com/mp/14131/john_howell/henley",
+    ),
+    (
+        "Robert Goodwill",
+        "Conservative, Scarborough and Whitby",
+        "https://www.theyworkforyou.com/mp/11804/robert_goodwill/scarborough_and_whitby",
+    ),
+    (
+        "Naseem Shah",
+        "Labour, Bradford West",
+        "https://www.theyworkforyou.com/mp/25385/naseem_shah/bradford_west",
+    ),
+    (
+        "Amy Callaghan",
+        "Scottish National Party, East Dunbartonshire",
+        "https://www.theyworkforyou.com/mp/25863/amy_callaghan/east_dunbartonshire",
+    ),
+]
+
+# Names of government departments and offices, listed alphabetically.
+GOV_DEPARTMENTS = [
+    "Attorney General's Office",
+    "Cabinet Office",
+    "Department for Business and Trade",
+    "Department for Culture, Media and Sport",
+    "Department for Education",
+    "Department for Energy Security and Net Zero",
+    "Department for Environment, Food and Rural Affairs",
+    "Department for Levelling Up, Housing and Communities",
+    "Department for Science, Innovation and Technology",
+    "Department for Transport",
+    "Department for Work and Pensions",
+    "Department of Health and Social Care",
+    "Export Credits Guarantee Department",
+    "Foreign, Commonwealth and Development Office",
+    "HM Treasury",
+    "Home Office",
+    "Ministry of Defence",
+    "Ministry of Justice",
+    "Northern Ireland Office",
+    "Office of the Advocate General for Scotland",
+    "Office of the Leader of the House of Commons",
+    "Office of the Leader of the House of Lords",
+    "Office of the Secretary of State for Scotland",
+    "Office of the Secretary of State for Wales",
+]
+
+# Search terms that test strategies sample from by default.
+SEARCH_TERMS = (
+    "ONS",
+    "Office for National Statistics",
+    "National Statistician",
+)
+
+# Evaluated once, when this module is imported.
+TODAY = dt.date.today()
+# Strategy for dates from four years before today up to today.
+ST_DATES = st.dates(TODAY - rd.relativedelta(years=4), TODAY)
+
+# Strategy for non-empty text built from ASCII letters, digits and a
+# handful of punctuation marks.
+ST_FREE_TEXT = st.text(
+    string.ascii_letters + string.digits + ".:;!?-", min_size=1
+)
+
+# NOTE(review): presumably alternative model names for testing LLM
+# instantiation; usage is not visible here - confirm.
+MODEL_NAMES = ["llama3", "mistral", "openhermes"]
+
+# Example responses that open with a "Sure..." preamble ahead of the
+# quoted text. NOTE(review): presumably the kind of preamble Gemma adds
+# and the reader is expected to handle - confirm against the tests.
+GEMMA_PREAMBLES = [
+    "Sure! Here is the text you are looking for: \nMy right honourable friend...",
+    "Sure - here is the quote: My right honourable friend...",
+    "Sure!The following contains references to your search terms:My right honourable friend...",
+]
diff --git a/tests/readers/__init__.py b/tests/readers/__init__.py
@@ -0,0 +1 @@
+"""Unit tests for the reader classes."""
diff --git a/tests/readers/base/__init__.py b/tests/readers/base/__init__.py
@@ -0,0 +1 @@
+"""Tests for the BaseReader class."""
diff --git a/tests/readers/base/strategies.py b/tests/readers/base/strategies.py
@@ -0,0 +1,37 @@
+"""Composite strategies for testing the base reader."""
+
+from hypothesis import strategies as st
+from langchain.docstore.document import Document
+
+from ...common import SEARCH_TERMS, ST_FREE_TEXT
+
+
+@st.composite
+def st_terms_and_texts(draw, terms=SEARCH_TERMS):
+    """
+    Draw a list of at most one term and a text that may end with it.
+
+    Parameters
+    ----------
+    draw : callable
+        Draw function supplied by `st.composite`.
+    terms : sequence of str
+        Terms to sample from.
+
+    Returns
+    -------
+    term : list[str]
+        Either empty or holding a single sampled term.
+    text : str
+        Free text, with the drawn term appended after a space only when
+        the drawn flag says to add it in.
+    """
+
+    picked = draw(st.lists(st.sampled_from(terms), max_size=1))
+    base = draw(ST_FREE_TEXT)
+    append_term = draw(st.booleans())
+
+    if not append_term:
+        return picked, base
+
+    return picked, " ".join([base, *picked])
+
+
+@st.composite
+def st_chunks_contains_responses(draw):
+    """
+    Draw document chunks, hit flags, and responses for a test.
+
+    Returns
+    -------
+    chunks : list[Document]
+        Between one and five documents holding free text.
+    contains : list[bool]
+        One flag per chunk; the first is always `True`.
+    responses : list[str]
+        One free-text response for every `True` flag.
+    """
+
+    documents = ST_FREE_TEXT.map(lambda text: Document(page_content=text))
+    chunks = draw(st.lists(documents, min_size=1, max_size=5))
+
+    # The first chunk always counts as a hit; the rest are drawn freely.
+    contains = [True]
+    for _ in range(len(chunks) - 1):
+        contains.append(draw(st.booleans()))
+
+    # Summing the flags counts the hits, giving one response per hit.
+    responses = [draw(ST_FREE_TEXT) for _ in range(sum(contains))]
+
+    return chunks, contains, responses
diff --git a/tests/readers/base/test_examples.py b/tests/readers/base/test_examples.py
@@ -0,0 +1,31 @@
+"""Example tests for the base reader class."""
+
+import requests
+from bs4 import BeautifulSoup
+
+from ...common import ToyReader
+
+
+def test_does_not_match_for_extra_abbreviations():
+    """Check that words merely containing `ONS` are not flagged as matches."""
+
+    near_misses = [
+        "The ONSR is the Only National Sandwich Ranking.",
+        "I AM UNLUCKY! SOME MIGHT SAY I AM DONSY!",
+    ]
+    reader = ToyReader(urls=[], terms=["ONS"])
+
+    assert not any(map(reader.check_contains_terms, near_misses))
+
+
+def test_81_add_ons_not_matched():
+    """
+    Ensure the example from #81 does not match.
+
+    The page is fetched live over HTTP, so this test needs network
+    access to run.
+    """
+
+    reader = ToyReader([], terms=["ONS"])
+    url = "https://theyworkforyou.com/wrans/?id=2024-04-12.21381.h"
+
+    # Bound the request so an unresponsive server cannot hang the suite.
+    response = requests.get(url, timeout=30)
+    # Fail loudly on an error page rather than passing vacuously on it.
+    response.raise_for_status()
+    soup = BeautifulSoup(response.content, "html.parser")
+
+    assert not reader.check_contains_terms(soup.get_text())