Skip to content

Commit

Permalink
Removes EasyOCR dependency from mdconvert. (microsoft#3573)
Browse files — browse the repository at this point in the history
* Removes EasyOCR dependency from mdconvert.

* Update mdconvert.py

---------

Co-authored-by: Jack Gerrits <[email protected]>
  • Loading branch information
afourney and jackgerrits authored Sep 26, 2024
1 parent f958f17 commit d2b750d
Show file tree
Hide file tree
Showing 3 changed files with 2 additions and 207 deletions.
38 changes: 2 additions & 36 deletions autogen/browser_utils/mdconvert.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,20 +28,8 @@
# File-format detection
import puremagic
import requests
from binaryornot.check import is_binary
from bs4 import BeautifulSoup

# Optional OCR support
IS_OCR_CAPABLE = False
try:
import easyocr
import numpy as np
import PIL

IS_OCR_CAPABLE = True
except ModuleNotFoundError:
pass

# Optional Transcription support
IS_AUDIO_TRANSCRIPTION_CAPABLE = False
try:
Expand Down Expand Up @@ -155,10 +143,9 @@ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
# Guess the content type from any file extension that might be around
content_type, encoding = mimetypes.guess_type("__placeholder" + kwargs.get("file_extension", ""))

# Only work with text
if content_type is None:
# No content type, so peek at the file and see if it's binary
if is_binary(local_path):
return None
return None
elif "text/" not in content_type.lower():
return None

Expand Down Expand Up @@ -725,8 +712,6 @@ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
if extension.lower() not in [".jpg", ".jpeg", ".png"]:
return None

ocr_min_confidence = kwargs.get("ocr_min_confidence", 0.25)

md_content = ""

# Add metadata
Expand Down Expand Up @@ -756,25 +741,6 @@ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+ "\n"
)

if IS_OCR_CAPABLE:
image = PIL.Image.open(local_path)
# Remove transparency
if image.mode in ("RGBA", "P"):
image = image.convert("RGB")

reader = easyocr.Reader(["en"]) # specify the language(s)
output = reader.readtext(np.array(image)) # local_path)
# The output is a list of tuples, each containing the coordinates of the text and the text itself.
# We join all the text pieces together to get the final text.
ocr_text = " "
for item in output:
if item[2] >= ocr_min_confidence:
ocr_text += item[1] + " "
ocr_text = ocr_text.strip()

if len(ocr_text) > 0:
md_content += "\n# Text detected by OCR:\n" + ocr_text

return DocumentConverterResult(
title=None,
text_content=md_content,
Expand Down
1 change: 0 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,6 @@
"pathvalidate",
# for mdconvert
"puremagic", # File identification
"binaryornot", # More file identification
"pdfminer.six", # Pdf
"mammoth", # Docx
"python-pptx", # Ppts
Expand Down
170 changes: 0 additions & 170 deletions test/test_browser_utils.py

This file was deleted.

0 comments on commit d2b750d

Please sign in to comment.