Skip to content

Commit

Permalink
Merge pull request #103 from VukManojlovic/CTX-5257
Browse files Browse the repository at this point in the history
CTX-5257: Changed date_of_birth output to return object with numerical fields day, month and year
  • Loading branch information
igorperic17 authored Jan 22, 2024
2 parents 4d64f48 + 90eb38b commit 2c9437f
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 21 deletions.
41 changes: 24 additions & 17 deletions tasks/document-ocr-fn/resources/function/ocr.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,9 @@
from dateutil import parser

from PIL import Image
from transformers import TrOCRProcessor, VisionEncoderDecoderModel

import pytesseract
import easyocr
import numpy as np


# EasyOCR: reader restricted to English ("en"), built once when the module is
# imported. Within this view it is referenced only from commented-out code.
reader = easyocr.Reader(["en"])
# TrOCR: the processor and the vision encoder-decoder model are both loaded
# from the same pretrained checkpoint name, once, at import time.
modelVersion = "microsoft/trocr-base-printed"
processor = TrOCRProcessor.from_pretrained(modelVersion)
model = VisionEncoderDecoderModel.from_pretrained(modelVersion)
Expand All @@ -20,18 +15,30 @@ def trOCR(image: Image.Image) -> str:
return processor.batch_decode(generatedIds, skip_special_tokens = True)[0] # type: ignore


def _parseDateOfBirth(text: str) -> "dict[str, int | None]":
    """Convert OCR text of a date into numeric ``year``/``month``/``day`` fields.

    Returns a dict with exactly those three keys. All three values are
    ``None`` when ``text`` cannot be interpreted as a date.
    """

    try:
        parsed = parser.parse(text)
    except (ValueError, OverflowError):
        # dateutil raises ValueError (ParserError in recent releases) for
        # unrecognised text and OverflowError for out-of-range numbers. OCR
        # noise can trigger either one, and neither should abort the whole
        # document.
        return {"year": None, "month": None, "day": None}

    # NOTE(review): parser.parse fills any field missing from the text using
    # its ``default`` (today's date), so a partial date such as "1990" yields
    # today's month and day - confirm this is acceptable for callers.
    return {"year": parsed.year, "month": parsed.month, "day": parsed.day}


def performOCR(images: list[Image.Image], classes: list[str]) -> "dict[str, str | dict[str, int | None]]":
    """Run TrOCR on every image and collect the recognised text per class.

    Args:
        images: Images to recognise, one per detected field.
        classes: Class name of each image; ``classes[i]`` labels ``images[i]``.
            Entries beyond ``len(images)`` are ignored.

    Returns:
        A dict mapping each class name to its recognised text. For the
        ``"date_of_birth"`` class the value is instead a dict with numeric
        ``year``, ``month`` and ``day`` fields (all ``None`` when the text is
        not a parseable date), and the untouched OCR text is stored under
        ``"date_of_birth_raw"``. When a class name occurs more than once, the
        last image with that class wins.

    Raises:
        IndexError: If there are fewer class names than images.
    """

    # Fail before running the (expensive) model instead of part-way through.
    if len(classes) < len(images):
        raise IndexError(
            f"Got {len(images)} images but only {len(classes)} class names"
        )

    detections: "dict[str, str | dict[str, int | None]]" = {}
    for image, className in zip(images, classes):
        text = trOCR(image)

        if className == "date_of_birth":
            # Keep the raw text next to the structured value so callers can
            # still inspect what was actually read from the document.
            detections["date_of_birth_raw"] = text
            detections[className] = _parseDateOfBirth(text)
        else:
            detections[className] = text

    return detections
6 changes: 2 additions & 4 deletions tasks/document-ocr-fn/resources/function/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,7 @@ seaborn>=0.11.0
coremltools==6.3.0
tensorflow==2.8
tensorflowjs>=3.9.0
transformers==4.36.2
ultralytics==8.0.226
thop
coretex
pytesseract
easyocr
transformers
ultralytics

0 comments on commit 2c9437f

Please sign in to comment.