Use better tesseract training dataset (#459)

* Download tessdata-best instead of tessdata-fast. This should improve OCR performance using tesseract with only a mild increase in runtime and container size.
* Drop unused `cdifflib`.
CDCgov · Dec 9, 2024 · c8d2b39 · c8d2b39
1 parent 6863721
commit c8d2b39
Show file tree

Hide file tree

Showing 5 changed files with 17 additions and 23 deletions.
diff --git a/OCR/Dockerfile b/OCR/Dockerfile
@@ -1,14 +1,18 @@
-FROM python:3.10-bullseye
+FROM python:3.10-slim
 
-RUN apt-get update && apt-get install ffmpeg libsm6 libxext6 tesseract-ocr-eng tesseract-ocr -y
+RUN apt-get update && apt-get install libgl1 tesseract-ocr-eng tesseract-ocr -y
+
+# Use larger "best" training data, rather than "fast"
+# Python one-liner because we don't have curl or wget
+RUN python3 -c 'from urllib.request import urlopen; open("/usr/share/tesseract-ocr/5/tessdata/eng.traineddata", "wb").write(urlopen("https://github.com/tesseract-ocr/tessdata_best/raw/refs/heads/main/eng.traineddata").read())'
 
 RUN pip install poetry
 
 COPY /ocr /ocr
 COPY poetry.lock .
 COPY pyproject.toml .
 
-RUN poetry install --without dev
+RUN poetry install --only main && poetry cache list | xargs -n1 poetry cache clear --all
 
 ENTRYPOINT ["poetry", "run", "api"]
 

diff --git a/OCR/dev-dockerfile b/OCR/dev-dockerfile
@@ -1,12 +1,15 @@
-FROM python:3.10-bullseye
+FROM python:3.10-slim
 
-RUN apt-get update && apt-get install ffmpeg libsm6 libxext6 tesseract-ocr-eng tesseract-ocr -y
+RUN apt-get update && apt-get install libgl1 tesseract-ocr-eng tesseract-ocr -y
+
+# Use larger "best" training data, rather than "fast"
+# Python one-liner because we don't have curl or wget
+RUN python3 -c 'from urllib.request import urlopen; open("/usr/share/tesseract-ocr/5/tessdata/eng.traineddata", "wb").write(urlopen("https://github.com/tesseract-ocr/tessdata_best/raw/refs/heads/main/eng.traineddata").read())'
 
 RUN pip install poetry
 
 COPY ./pyproject.toml /ocr/pyproject.toml
 COPY ./poetry.lock /ocr/poetry.lock
 
 WORKDIR /ocr
-RUN poetry install
-
+RUN poetry install && poetry cache list | xargs -n1 poetry cache clear --all
diff --git a/OCR/ocr/services/tesseract_ocr.py b/OCR/ocr/services/tesseract_ocr.py
@@ -36,6 +36,7 @@ def _guess_tessdata_path(wanted_lang="eng") -> bytes:
             "/usr/local/share/tesseract/tessdata",
             "/usr/share/tesseract/tessdata",
             "/usr/share/tesseract-ocr/4.00/tessdata",
+            "/usr/share/tesseract-ocr/5/tessdata",
             "/opt/homebrew/share/tessdata",
             "/opt/local/share/tessdata",
         ]

diff --git a/OCR/poetry.lock b/OCR/poetry.lock
diff --git a/OCR/pyproject.toml b/OCR/pyproject.toml
@@ -10,13 +10,12 @@ python = "^3.10"
 numpy = "^1.26.4"
 opencv-python = "^4.9.0.80"
 levenshtein = "^0.25.1"
-cdifflib = "^1.2.6"
 fastapi = {extras = ["standard"], version = "^0.112.1"}
 transformers = {extras = ["torch"], version = "^4.45.1"}
 pillow = "^10.3.0"
-
 datasets = "^3.0.1"
 tesserocr = "^2.7.1"
+
 [tool.poetry.group.dev.dependencies]
 lxml = "^5.3.0"
 docopt = "^0.6.2"