-
Notifications
You must be signed in to change notification settings - Fork 0
/
pdf_extract_aws.py
31 lines (24 loc) · 1.03 KB
/
pdf_extract_aws.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
import fitz # PyMuPDF
import re
def extract_options_from_pdf(pdf_path, question_number):
    """Extract the four answer options (A-D) of one question from a PDF.

    The text of every page is searched for a block shaped like::

        Question #<question_number>
        <question text>
        A. <option text>
        B. <option text>
        C. <option text>
        D. <option text>

    Matching is done page by page, so a question whose text is split across
    two pages is not found.

    Args:
        pdf_path: Path of the PDF file to read.
        question_number: Number of the question to look up. It is converted
            to a string and matched literally after ``Question #``.

    Returns:
        A list holding the texts of options A, B, C and D, in that order, or
        an empty list when no page contains the question. When more than one
        page matches, the options from the last matching page are returned.

    Raises:
        OSError: If ``pdf_path`` cannot be opened for reading (for example
            ``FileNotFoundError`` when it does not exist).
    """
    # Escape the number so regex metacharacters in it are matched literally,
    # and use a raw string so "\." and "\n" reach the regex engine untouched.
    number = re.escape(str(question_number))
    options_pattern = re.compile(
        rf"Question #{number}\n(.+?)\nA\. (.+?)\nB\. (.+?)\nC\. (.+?)\nD\. (.+?)\n",
        re.DOTALL,
    )

    # Read the raw bytes first so the file handle is released before parsing.
    with open(pdf_path, "rb") as file:
        pdf_bytes = file.read()

    extracted_options = []
    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    try:
        for page_num in range(doc.page_count):
            match = options_pattern.search(doc[page_num].get_text())
            if match:
                # Group 1 is the question text; groups 2-5 are options A-D.
                extracted_options = list(match.group(2, 3, 4, 5))
    finally:
        # Release the document even if a page fails to load or extract.
        doc.close()
    return extracted_options
def write_txt(extracted_options, question_number, output_path="options.txt"):
    """Append one question's options to a text file as a bulleted list.

    The appended entry has the form::

        <question_number>
        - <option 1>
        - <option 2>
        ...
        <blank line>

    Args:
        extracted_options: Iterable of option strings to write.
        question_number: Identifier written on the entry's first line.
        output_path: File to append to; created if it does not exist.
            Defaults to ``"options.txt"`` in the current working directory.

    Raises:
        OSError: If ``output_path`` cannot be opened for appending.
        TypeError: If ``extracted_options`` contains non-string items.
    """
    # Build the whole entry first so it is written with a single call.
    bullets = "\n- ".join(extracted_options)
    entry = f"{question_number}\n- {bullets}\n\n"
    with open(output_path, "a", encoding="utf-8") as file:
        file.write(entry)