Removes EasyOCR dependency from mdconvert. (microsoft#3573)

* Removes EasyOCR dependency from mdconvert.
* Update mdconvert.py

---------

Co-authored-by: Jack Gerrits <[email protected]>
luxzoli · Sep 26, 2024 · d2b750d
1 parent f958f17
commit d2b750d
Show file tree

Hide file tree

Showing 3 changed files with 2 additions and 207 deletions.
diff --git a/autogen/browser_utils/mdconvert.py b/autogen/browser_utils/mdconvert.py
@@ -28,20 +28,8 @@
 # File-format detection
 import puremagic
 import requests
-from binaryornot.check import is_binary
 from bs4 import BeautifulSoup
 
-# Optional OCR support
-IS_OCR_CAPABLE = False
-try:
-    import easyocr
-    import numpy as np
-    import PIL
-
-    IS_OCR_CAPABLE = True
-except ModuleNotFoundError:
-    pass
-
 # Optional Transcription support
 IS_AUDIO_TRANSCRIPTION_CAPABLE = False
 try:
@@ -155,10 +143,9 @@ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
         # Guess the content type from any file extension that might be around
         content_type, encoding = mimetypes.guess_type("__placeholder" + kwargs.get("file_extension", ""))
 
+        # Only work with text
         if content_type is None:
-            # No content type, so peek at the file and see if it's binary
-            if is_binary(local_path):
-                return None
+            return None
         elif "text/" not in content_type.lower():
             return None
 
@@ -725,8 +712,6 @@ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
         if extension.lower() not in [".jpg", ".jpeg", ".png"]:
             return None
 
-        ocr_min_confidence = kwargs.get("ocr_min_confidence", 0.25)
-
         md_content = ""
 
         # Add metadata
@@ -756,25 +741,6 @@ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
                 + "\n"
             )
 
-        if IS_OCR_CAPABLE:
-            image = PIL.Image.open(local_path)
-            # Remove transparency
-            if image.mode in ("RGBA", "P"):
-                image = image.convert("RGB")
-
-            reader = easyocr.Reader(["en"])  # specify the language(s)
-            output = reader.readtext(np.array(image))  # local_path)
-            # The output is a list of tuples, each containing the coordinates of the text and the text itself.
-            # We join all the text pieces together to get the final text.
-            ocr_text = " "
-            for item in output:
-                if item[2] >= ocr_min_confidence:
-                    ocr_text += item[1] + " "
-            ocr_text = ocr_text.strip()
-
-            if len(ocr_text) > 0:
-                md_content += "\n# Text detected by OCR:\n" + ocr_text
-
         return DocumentConverterResult(
             title=None,
             text_content=md_content,

diff --git a/setup.py b/setup.py
@@ -87,7 +87,6 @@
         "pathvalidate",
         # for mdconvert
         "puremagic",  # File identification
-        "binaryornot",  # More file identification
         "pdfminer.six",  # Pdf
         "mammoth",  # Docx
         "python-pptx",  # Ppts

diff --git a/test/test_browser_utils.py b/test/test_browser_utils.py