-
Notifications
You must be signed in to change notification settings - Fork 1.4k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
ENH: consider images inside PDF made with onlyoffice #2637
base: main
Are you sure you want to change the base?
Changes from all commits
10f3b85
557d77d
524ffc3
46adbe1
76b545f
6edfa2d
0e81eee
b6ba820
07752a1
a5b980b
a23b272
d524672
24218c8
e5585d7
88d2223
cc1600d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -444,19 +444,78 @@ def _get_ids_image( | |
if ancest is None: | ||
ancest = [] | ||
lst: List[Union[str, List[str]]] = [] | ||
if PG.RESOURCES not in obj or RES.XOBJECT not in cast( | ||
DictionaryObject, obj[PG.RESOURCES] | ||
): | ||
return [] if self.inline_images is None else list(self.inline_images.keys()) | ||
|
||
x_object = obj[PG.RESOURCES][RES.XOBJECT].get_object() # type: ignore | ||
for o in x_object: | ||
if not isinstance(x_object[o], StreamObject): | ||
continue | ||
if x_object[o][IA.SUBTYPE] == "/Image": | ||
lst.append(o if len(ancest) == 0 else ancest + [o]) | ||
else: # is a form with possible images inside | ||
lst.extend(self._get_ids_image(x_object[o], ancest + [o], call_stack)) | ||
|
||
if PG.ANNOTS in obj: | ||
for annot in cast(DictionaryObject, obj[PG.ANNOTS]): | ||
if ( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. add:
and simplify your code below (specially the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. walrus can now be used as python 3.7 is over. if it helps you |
||
"/AP" in cast(DictionaryObject, annot.keys()) | ||
and "/N" in cast(DictionaryObject, annot["/AP"].keys()) | ||
and PG.RESOURCES in annot["/AP"]["/N"].get_object() | ||
and RES.XOBJECT | ||
in cast(DictionaryObject, annot["/AP"]["/N"][PG.RESOURCES]) | ||
and "/FRM" | ||
in cast( | ||
DictionaryObject, annot["/AP"]["/N"][PG.RESOURCES][RES.XOBJECT] | ||
) | ||
): | ||
frame = annot["/AP"]["/N"][PG.RESOURCES][RES.XOBJECT]["/FRM"] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I do not understand the |
||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The best would be to call recursively |
||
if PG.RESOURCES in frame.get_object() and RES.XOBJECT in cast( | ||
DictionaryObject, frame[PG.RESOURCES] | ||
): | ||
x_object = frame[PG.RESOURCES][RES.XOBJECT] | ||
for o in x_object: | ||
if ( | ||
isinstance(x_object[o], StreamObject) | ||
and x_object[o][IA.SUBTYPE] == "/Image" | ||
): | ||
lst.extend( | ||
[ | ||
( | ||
f"{PG.ANNOTS}/{annot['/T']}{o}" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. see comment about naming |
||
if len(ancest) == 0 | ||
else ancest + [o] | ||
) | ||
] | ||
) | ||
|
||
if PG.RESOURCES in obj: | ||
if RES.PATTERN in cast(DictionaryObject, obj[PG.RESOURCES]): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would rather add the pattern images after the other xobjects to minimize the impact on the order |
||
for pattern_name, pattern in cast( | ||
DictionaryObject, | ||
cast(DictionaryObject, obj[PG.RESOURCES])[RES.PATTERN], | ||
).items(): | ||
if PG.RESOURCES in pattern.get_object() and RES.XOBJECT in cast( | ||
DictionaryObject, pattern[PG.RESOURCES] | ||
): | ||
x_object = pattern[PG.RESOURCES][RES.XOBJECT].get_object() | ||
for o in x_object: | ||
if ( | ||
isinstance(x_object[o], StreamObject) | ||
and x_object[o][IA.SUBTYPE] == "/Image" | ||
): | ||
lst.extend( | ||
[ | ||
( | ||
f"{RES.PATTERN}{pattern_name}{o}" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. for the name as proposed above I would come to |
||
if len(ancest) == 0 | ||
else ancest + [o] | ||
) | ||
] | ||
) | ||
|
||
if RES.XOBJECT in cast(DictionaryObject, obj[PG.RESOURCES]): | ||
x_object = obj[PG.RESOURCES][RES.XOBJECT].get_object() # type: ignore | ||
for o in x_object: | ||
if not isinstance(x_object[o], StreamObject): | ||
continue | ||
if x_object[o][IA.SUBTYPE] == "/Image": | ||
lst.append(o if len(ancest) == 0 else ancest + [o]) | ||
else: # is a form with possible images inside | ||
lst.extend( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this is an example of a recursive call |
||
self._get_ids_image(x_object[o], ancest + [o], call_stack) | ||
) | ||
|
||
assert self.inline_images is not None | ||
lst.extend(list(self.inline_images.keys())) | ||
return lst | ||
|
@@ -473,9 +532,50 @@ def _get_image( | |
if isinstance(id, List) and len(id) == 1: | ||
id = id[0] | ||
try: | ||
xobjs = cast( | ||
DictionaryObject, cast(DictionaryObject, obj[PG.RESOURCES])[RES.XOBJECT] | ||
) | ||
if isinstance(id, str) and id.find(RES.PATTERN) == 0: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. with my naming proposal you may simplify the following section to make xobjs point to the proper node above the images; this would avoid changing the code at new line 589 |
||
pattern_name = id[len(RES.PATTERN) : id.find("/", len(RES.PATTERN) + 1)] | ||
|
||
patterns = cast( | ||
DictionaryObject, | ||
cast(DictionaryObject, obj[PG.RESOURCES])[RES.PATTERN], | ||
) | ||
|
||
xobjs = cast( | ||
DictionaryObject, | ||
cast( | ||
DictionaryObject, | ||
cast(DictionaryObject, patterns[pattern_name])[PG.RESOURCES], | ||
)[RES.XOBJECT], | ||
) | ||
elif isinstance(id, str) and id.find(PG.ANNOTS) == 0: | ||
annot_name = id[len(RES.PATTERN) : id.find("/", len(RES.PATTERN) + 1)] | ||
annots = cast(DictionaryObject, obj[PG.ANNOTS]) | ||
|
||
for temp_annot in annots: | ||
if temp_annot["/T"] == annot_name: | ||
annot = temp_annot | ||
break | ||
|
||
frame_xobjs = cast( | ||
DictionaryObject, | ||
cast( | ||
DictionaryObject, | ||
cast(DictionaryObject, annot["/AP"]["/N"])[PG.RESOURCES], | ||
)[RES.XOBJECT], | ||
) | ||
|
||
xobjs = cast( | ||
DictionaryObject, | ||
cast( | ||
DictionaryObject, | ||
cast(DictionaryObject, frame_xobjs["/FRM"])[PG.RESOURCES], | ||
)[RES.XOBJECT], | ||
) | ||
else: | ||
xobjs = cast( | ||
DictionaryObject, | ||
cast(DictionaryObject, obj[PG.RESOURCES])[RES.XOBJECT], | ||
) | ||
except KeyError: | ||
if not (id[0] == "~" and id[-1] == "~"): | ||
raise | ||
|
@@ -487,15 +587,25 @@ def _get_image( | |
raise KeyError("no inline image can be found") | ||
return self.inline_images[id] | ||
|
||
imgd = _xobj_to_image(cast(DictionaryObject, xobjs[id])) | ||
extension, byte_stream = imgd[:2] | ||
f = ImageFile( | ||
name=f"{id[1:]}{extension}", | ||
if id.find("/Pattern") == 0: | ||
image_identifier = id[id.rfind("/") :] | ||
image_name = pattern_name[1:] + "_" + image_identifier[1:] | ||
elif id.find("/Annot") == 0: | ||
image_identifier = id[id.rfind("/") :] | ||
image_name = annot_name + "_" + image_identifier[1:] | ||
else: | ||
image_identifier = str(id) | ||
image_name = id[1:] | ||
|
||
imgd = _xobj_to_image(cast(DictionaryObject, xobjs[image_identifier])) | ||
image_extension, byte_stream = imgd[:2] | ||
|
||
return ImageFile( | ||
name=image_name + str(image_extension), | ||
Comment on lines
+590
to
+604
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. see above changing this section |
||
data=byte_stream, | ||
image=imgd[2], | ||
indirect_reference=xobjs[id].indirect_reference, | ||
indirect_reference=xobjs[image_identifier].indirect_reference, | ||
) | ||
return f | ||
else: # in a sub object | ||
ids = id[1:] | ||
return self._get_image(ids, cast(DictionaryObject, xobjs[id[0]])) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I do not recommend adding files to sample-files: your examples are properly stored in the threads, and this is sufficient |
+ − | 027-onlyoffice-image/P1_X1.jpg | |
+ − | 027-onlyoffice-image/P2_X1.jpg | |
+ − | 027-onlyoffice-image/P3_X1.jpg | |
+ − | 027-onlyoffice-image/Patterns.pdf | |
+10 −0 | files.json |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -112,5 +112,7 @@ | |
url: https://github.com/py-pdf/pypdf/files/12050253/tt.pdf | ||
- local_filename: Pesquisa-de-Precos-Combustiveis-novembro-2023.pdf | ||
url: https://www.joinville.sc.gov.br/wp-content/uploads/2023/11/Pesquisa-de-Precos-Combustiveis-novembro-2023.pdf | ||
- local_filename: iss2138.pdf | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why did these lines disappear? |
||
url: https://github.com/py-pdf/pypdf/files/12483807/AEO.1172.pdf | ||
- local_filename: iss2613-onlyoffice-standardImages.pdf | ||
url: https://github.com/py-pdf/pypdf/files/15355445/iss2613-onlyoffice-standardImages.pdf | ||
- local_filename: iss2613-onlyoffice-form.pdf | ||
url: https://github.com/py-pdf/pypdf/files/15355444/iss2613-onlyoffice-form.pdf |
Original file line number | Diff line number | Diff line change | ||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
@@ -214,6 +214,66 @@ def test_image_extraction(src, page_index, image_key, expected): | |||||||||||||
assert image_similarity(BytesIO(actual_image.data), expected) >= 0.99 | ||||||||||||||
|
||||||||||||||
|
||||||||||||||
@pytest.mark.enable_socket() | ||||||||||||||
def test_onlyoffice_standard_images_extraction(): | ||||||||||||||
reader = PdfReader( | ||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. a reference to the issue numbers would be appreciated to ease later analysis |
||||||||||||||
BytesIO(get_data_from_url(name="iss2613-onlyoffice-standardImages.pdf")) | ||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. as you have url/name defined locally, I recommend doing the same for the pdf and not changing the yaml file above |
||||||||||||||
) | ||||||||||||||
|
||||||||||||||
assert ( | ||||||||||||||
str(reader.pages[0].images) | ||||||||||||||
== "[Image_0=/Pattern/P1/X1, Image_1=/Pattern/P2/X1, Image_2=/Pattern/P3/X1]" | ||||||||||||||
) | ||||||||||||||
Comment on lines
+224
to
+226
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||||||||
|
||||||||||||||
url = "https://github.com/py-pdf/pypdf/assets/67143274/cc28b39b-2e96-4bd3-b33c-c545c5cec2d9" | ||||||||||||||
name = "iss2613-P1_X1.jpg" | ||||||||||||||
P1_X1 = Image.open(BytesIO(get_data_from_url(url, name=name))) | ||||||||||||||
|
||||||||||||||
assert image_similarity(reader.pages[0].images[0].image, P1_X1) >= 0.99 | ||||||||||||||
|
||||||||||||||
url = "https://github.com/py-pdf/pypdf/assets/67143274/827c9066-546a-4502-a613-579ec25c598e" | ||||||||||||||
name = "iss2613-P2_X1.jpg" | ||||||||||||||
P2_X1 = Image.open(BytesIO(get_data_from_url(url, name=name))) | ||||||||||||||
|
||||||||||||||
assert image_similarity(reader.pages[0].images[1].image, P2_X1) >= 0.99 | ||||||||||||||
|
||||||||||||||
url = "https://github.com/py-pdf/pypdf/assets/67143274/df9cb9e9-e589-4d2e-a537-ae0fe3240bbd" | ||||||||||||||
name = "iss2613-P3_X1.jpg" | ||||||||||||||
P3_X1 = Image.open(BytesIO(get_data_from_url(url, name=name))) | ||||||||||||||
|
||||||||||||||
assert image_similarity(reader.pages[0].images[2].image, P3_X1) >= 0.99 | ||||||||||||||
|
||||||||||||||
|
||||||||||||||
@pytest.mark.enable_socket() | ||||||||||||||
def test_onlyoffice_form_images_extraction(): | ||||||||||||||
reader = PdfReader(BytesIO(get_data_from_url(name="iss2613-onlyoffice-form.pdf"))) | ||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. see above |
||||||||||||||
|
||||||||||||||
assert ( | ||||||||||||||
str(reader.pages[0].images) | ||||||||||||||
== "[Image_0=/Annots/Image2_af_image/Img, Image_1=/Annots/Image3_af_image/Img]" | ||||||||||||||
) | ||||||||||||||
|
||||||||||||||
assert str(reader.pages[1].images) == "[Image_0=/Annots/Image4_af_image/Img]" | ||||||||||||||
|
||||||||||||||
url = "https://github.com/py-pdf/pypdf/assets/67143274/cc28b39b-2e96-4bd3-b33c-c545c5cec2d9" | ||||||||||||||
name = "iss2613-P1_X1.jpg" | ||||||||||||||
P1_X1 = Image.open(BytesIO(get_data_from_url(url, name=name))) | ||||||||||||||
|
||||||||||||||
assert image_similarity(reader.pages[0].images[0].image, P1_X1) >= 0.99 | ||||||||||||||
|
||||||||||||||
url = "https://github.com/py-pdf/pypdf/assets/67143274/827c9066-546a-4502-a613-579ec25c598e" | ||||||||||||||
name = "iss2613-P2_X1.jpg" | ||||||||||||||
P2_X1 = Image.open(BytesIO(get_data_from_url(url, name=name))) | ||||||||||||||
|
||||||||||||||
assert image_similarity(reader.pages[0].images[1].image, P2_X1) >= 0.99 | ||||||||||||||
|
||||||||||||||
url = "https://github.com/py-pdf/pypdf/assets/67143274/df9cb9e9-e589-4d2e-a537-ae0fe3240bbd" | ||||||||||||||
name = "iss2613-P3_X1.jpg" | ||||||||||||||
P3_X1 = Image.open(BytesIO(get_data_from_url(url, name=name))) | ||||||||||||||
|
||||||||||||||
assert image_similarity(reader.pages[1].images[0].image, P3_X1) >= 0.99 | ||||||||||||||
|
||||||||||||||
|
||||||||||||||
@pytest.mark.enable_socket() | ||||||||||||||
@pytest.mark.timeout(30) | ||||||||||||||
def test_loop_in_image_keys(): | ||||||||||||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I would more likely propose
annot_idx should be used to name the image instead of
["/T"]
Also, I would prefer to parse annotations after processing the page's original resources, in order to keep "direct" images and inline images before those from annotations.