-
Notifications
You must be signed in to change notification settings - Fork 1
/
ocr.py
88 lines (75 loc) · 2.8 KB
/
ocr.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import io
import spacy
import sys
from markdown import markdown
import pytesseract
from PIL import Image
from cnocr import CnOcr
from pypdf import PdfReader
from prompts import Prompt
from core import OCR_LANG, OCR_TYPE, brain
from PyQt6.QtWidgets import QApplication, QMainWindow, QTextBrowser
class OCR():
    """Extract text from images and PDFs via Tesseract or CnOcr, and split
    the recognized text into sentences with a spaCy pipeline.

    Configuration comes from `core`: OCR_LANG picks the spaCy/CnOcr models,
    OCR_TYPE picks the OCR engine used by `ocr_img`.
    """

    def __init__(self) -> None:
        # Map the configured language onto the matching spaCy pipeline and
        # CnOcr recognition model. Previously an unrecognized OCR_LANG left
        # both attributes unset and crashed later at spacy.load(); fall back
        # to the English models instead.
        if OCR_LANG == "zh-CN":
            self.spacy_model = "zh_core_web_sm"
            self.rec_model = 'ch_PP-OCRv3'
        elif OCR_LANG == "en-US":
            self.spacy_model = "en_core_web_sm"
            self.rec_model = 'en_PP-OCRv3'
        else:
            self.spacy_model = "en_core_web_sm"
            self.rec_model = 'en_PP-OCRv3'
        self.nlp = spacy.load(self.spacy_model)
        self.img_file = None  # PIL.Image most recently handed to an OCR engine
        self.ocr_type = OCR_TYPE

    def ocr_pdf(self, pdf_file, page_number=None, scope=False):
        """OCR every embedded image of a PDF with CnOcr.

        Args:
            pdf_file: path or binary file object accepted by pypdf.PdfReader.
            page_number: None to process all pages, an int for a single page,
                or a list of page indices. With scope=True the list's first
                two entries are treated as a half-open [start, end) range.
            scope: interpret a list `page_number` as a range (see above).

        Returns:
            The concatenated OCR text of every image on the selected pages.
        """
        reader = PdfReader(pdf_file)
        pages = reader.pages
        if isinstance(page_number, int):
            pages = [reader.pages[page_number]]
        elif isinstance(page_number, list):
            if scope:
                page_number = range(page_number[0], page_number[1])
            pages = [reader.pages[p] for p in page_number]
        outs = []
        for page in pages:
            for image_file_object in page.images:
                self.img_file = Image.open(io.BytesIO(image_file_object.data))
                # BUG FIX: paddOCR used to be called without its required
                # rec_model_name argument (TypeError); it now defaults to
                # the model chosen in __init__.
                outs.append(self.paddOCR())
        return "".join(outs)

    def ocr_img(self, img_file):
        """OCR a single image and return its text split into sentences.

        Args:
            img_file: path or PIL.Image accepted by the selected OCR engine.

        Returns:
            List of sentence strings produced by the spaCy pipeline.

        Raises:
            ValueError: if OCR_TYPE is neither "tesseract" nor "paddle"
                (previously this fell through to an UnboundLocalError).
        """
        self.img_file = img_file
        if self.ocr_type == "tesseract":
            out = self.tessOCR()
        elif self.ocr_type == "paddle":
            out = self.paddOCR(self.rec_model)
        else:
            raise ValueError(f"unsupported OCR_TYPE: {self.ocr_type!r}")
        self.doc = self.nlp(out)
        return [sent.text for sent in self.doc.sents]

    def tessOCR(self, lang='eng+chi_sim'):
        """Run Tesseract on self.img_file; `lang` is a tesseract lang spec."""
        return pytesseract.image_to_string(self.img_file, lang)

    def paddOCR(self, rec_model_name=None):
        """Run CnOcr on self.img_file and join all recognized lines.

        Args:
            rec_model_name: CnOcr recognition model name; defaults to the
                model selected in __init__ (backward-compatible extension —
                callers passing a name explicitly are unaffected).
        """
        if rec_model_name is None:
            rec_model_name = self.rec_model
        ocr = CnOcr(rec_model_name)
        return "".join(i['text'] for i in ocr.ocr(self.img_file))
if __name__ == "__main__":
    # OCR the latest screenshot, ask the model about its contents, then show
    # the markdown-rendered answer in a tall panel at the screen's right edge.
    recognizer = OCR()
    screenshot_text = "".join(recognizer.ocr_img('.latest.screenshot.png'))
    conversation = [
        Prompt.screenshot.value,
        {
            "role": "user",
            "content": screenshot_text
        },
    ]
    answer = brain(conversation)

    app = QApplication(sys.argv)
    window = QMainWindow()
    browser = QTextBrowser(window)
    window.setCentralWidget(browser)
    screen_size = QApplication.primaryScreen().size()
    window.setGeometry(screen_size.width(), 0, 400, screen_size.height())
    browser.setText(markdown(answer))
    window.show()
    sys.exit(app.exec())