Slim the OCR dev image (python:3.10-slim, Tesseract 5 data path) and register the new tessdata directory.
Note: the traineddata download writes raw bytes via sys.stdout.buffer; print() on bytes would save the b'...' repr instead of the model.

diff --git a/OCR/dev-dockerfile b/OCR/dev-dockerfile
index de642c8a..3a435867 100644
--- a/OCR/dev-dockerfile
+++ b/OCR/dev-dockerfile
@@ -1,10 +1,10 @@
-FROM python:3.10-bullseye
+FROM python:3.10-slim
 
-RUN apt-get update && apt-get install ffmpeg libsm6 libxext6 tesseract-ocr-eng tesseract-ocr -y
+RUN apt-get update && apt-get install libgl1 tesseract-ocr-eng tesseract-ocr -y
 
 # Use larger "best" training data, rather than "fast"
 # Python one-liner because we don't have curl or wget
-RUN python3 -c 'from urllib.request import urlopen; print(urlopen("https://github.com/tesseract-ocr/tessdata_best/raw/refs/heads/main/eng.traineddata").read())' > /usr/share/tesseract-ocr/4.00/tessdata/eng.traineddata
+RUN python3 -c 'import sys; from urllib.request import urlopen; sys.stdout.buffer.write(urlopen("https://github.com/tesseract-ocr/tessdata_best/raw/refs/heads/main/eng.traineddata").read())' > /usr/share/tesseract-ocr/5/tessdata/eng.traineddata
 
 RUN pip install poetry
 
@@ -12,5 +12,4 @@ COPY ./pyproject.toml /ocr/pyproject.toml
 COPY ./poetry.lock /ocr/poetry.lock
 WORKDIR /ocr
 
-RUN poetry install
-
+RUN poetry install && poetry cache list | xargs -n1 poetry cache clear --all
diff --git a/OCR/ocr/services/tesseract_ocr.py b/OCR/ocr/services/tesseract_ocr.py
index 9ce4a78e..c796a065 100644
--- a/OCR/ocr/services/tesseract_ocr.py
+++ b/OCR/ocr/services/tesseract_ocr.py
@@ -36,6 +36,7 @@ def _guess_tessdata_path(wanted_lang="eng") -> bytes:
         "/usr/local/share/tesseract/tessdata",
         "/usr/share/tesseract/tessdata",
         "/usr/share/tesseract-ocr/4.00/tessdata",
+        "/usr/share/tesseract-ocr/5/tessdata",
         "/opt/homebrew/share/tessdata",
         "/opt/local/share/tessdata",
     ]