Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: consider images inside PDF made with onlyoffice #2637

Open
wants to merge 16 commits into
base: main
Choose a base branch
from
154 changes: 132 additions & 22 deletions pypdf/_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -444,19 +444,78 @@ def _get_ids_image(
if ancest is None:
ancest = []
lst: List[Union[str, List[str]]] = []
if PG.RESOURCES not in obj or RES.XOBJECT not in cast(
DictionaryObject, obj[PG.RESOURCES]
):
return [] if self.inline_images is None else list(self.inline_images.keys())

x_object = obj[PG.RESOURCES][RES.XOBJECT].get_object() # type: ignore
for o in x_object:
if not isinstance(x_object[o], StreamObject):
continue
if x_object[o][IA.SUBTYPE] == "/Image":
lst.append(o if len(ancest) == 0 else ancest + [o])
else: # is a form with possible images inside
lst.extend(self._get_ids_image(x_object[o], ancest + [o], call_stack))

if PG.ANNOTS in obj:
for annot in cast(DictionaryObject, obj[PG.ANNOTS]):
Comment on lines +448 to +449
Copy link
Collaborator

@pubpub-zz pubpub-zz Aug 4, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would more likely propose

Suggested change
if PG.ANNOTS in obj:
for annot in cast(DictionaryObject, obj[PG.ANNOTS]):
for annot_idx,annot in enumerate(cast(DictionaryObject, obj.get(PG.ANNOTS, {}))):

annot_idx should be used to name the image instead of ["/T"].
Also, I would prefer to parse annotations after processing the page's original resources, in order to keep "direct" images and inline images before those of the annotations.

if (
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

add:

annot = annot.get_object()

and simplify your code below (especially the .keys() calls, which should not be required).
I agree that the walrus operator would ease readability here 😉

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The walrus operator can now be used, since Python 3.7 support has ended, if it helps you.

"/AP" in cast(DictionaryObject, annot.keys())
and "/N" in cast(DictionaryObject, annot["/AP"].keys())
and PG.RESOURCES in annot["/AP"]["/N"].get_object()
and RES.XOBJECT
in cast(DictionaryObject, annot["/AP"]["/N"][PG.RESOURCES])
and "/FRM"
in cast(
DictionaryObject, annot["/AP"]["/N"][PG.RESOURCES][RES.XOBJECT]
)
):
frame = annot["/AP"]["/N"][PG.RESOURCES][RES.XOBJECT]["/FRM"]
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I do not understand the ["/FRM"].
I've produced a test with a stamp, and the image is called ["/Icon"]:
tttt.pdf


Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The best would be to call _get_ids_image() recursively.

if PG.RESOURCES in frame.get_object() and RES.XOBJECT in cast(
DictionaryObject, frame[PG.RESOURCES]
):
x_object = frame[PG.RESOURCES][RES.XOBJECT]
for o in x_object:
if (
isinstance(x_object[o], StreamObject)
and x_object[o][IA.SUBTYPE] == "/Image"
):
lst.extend(
[
(
f"{PG.ANNOTS}/{annot['/T']}{o}"
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

see comment about naming

if len(ancest) == 0
else ancest + [o]
)
]
)

if PG.RESOURCES in obj:
if RES.PATTERN in cast(DictionaryObject, obj[PG.RESOURCES]):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would rather add the pattern images after the other XObjects, to minimize the impact on the order.

for pattern_name, pattern in cast(
DictionaryObject,
cast(DictionaryObject, obj[PG.RESOURCES])[RES.PATTERN],
).items():
if PG.RESOURCES in pattern.get_object() and RES.XOBJECT in cast(
DictionaryObject, pattern[PG.RESOURCES]
):
x_object = pattern[PG.RESOURCES][RES.XOBJECT].get_object()
for o in x_object:
if (
isinstance(x_object[o], StreamObject)
and x_object[o][IA.SUBTYPE] == "/Image"
):
lst.extend(
[
(
f"{RES.PATTERN}{pattern_name}{o}"
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For the name, as proposed above, I would come to
reader.pages[0].images['Pattern', '/Img']

if len(ancest) == 0
else ancest + [o]
)
]
)

if RES.XOBJECT in cast(DictionaryObject, obj[PG.RESOURCES]):
x_object = obj[PG.RESOURCES][RES.XOBJECT].get_object() # type: ignore
for o in x_object:
if not isinstance(x_object[o], StreamObject):
continue
if x_object[o][IA.SUBTYPE] == "/Image":
lst.append(o if len(ancest) == 0 else ancest + [o])
else: # is a form with possible images inside
lst.extend(
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is an example of a recursive call.

self._get_ids_image(x_object[o], ancest + [o], call_stack)
)

assert self.inline_images is not None
lst.extend(list(self.inline_images.keys()))
return lst
Expand All @@ -473,9 +532,50 @@ def _get_image(
if isinstance(id, List) and len(id) == 1:
id = id[0]
try:
xobjs = cast(
DictionaryObject, cast(DictionaryObject, obj[PG.RESOURCES])[RES.XOBJECT]
)
if isinstance(id, str) and id.find(RES.PATTERN) == 0:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

With my naming proposal, you may simplify the following section to make xobjs point to the proper node above the images; this would avoid changing the code at new line 589.

pattern_name = id[len(RES.PATTERN) : id.find("/", len(RES.PATTERN) + 1)]

patterns = cast(
DictionaryObject,
cast(DictionaryObject, obj[PG.RESOURCES])[RES.PATTERN],
)

xobjs = cast(
DictionaryObject,
cast(
DictionaryObject,
cast(DictionaryObject, patterns[pattern_name])[PG.RESOURCES],
)[RES.XOBJECT],
)
elif isinstance(id, str) and id.find(PG.ANNOTS) == 0:
annot_name = id[len(RES.PATTERN) : id.find("/", len(RES.PATTERN) + 1)]
annots = cast(DictionaryObject, obj[PG.ANNOTS])

for temp_annot in annots:
if temp_annot["/T"] == annot_name:
annot = temp_annot
break

frame_xobjs = cast(
DictionaryObject,
cast(
DictionaryObject,
cast(DictionaryObject, annot["/AP"]["/N"])[PG.RESOURCES],
)[RES.XOBJECT],
)

xobjs = cast(
DictionaryObject,
cast(
DictionaryObject,
cast(DictionaryObject, frame_xobjs["/FRM"])[PG.RESOURCES],
)[RES.XOBJECT],
)
else:
xobjs = cast(
DictionaryObject,
cast(DictionaryObject, obj[PG.RESOURCES])[RES.XOBJECT],
)
except KeyError:
if not (id[0] == "~" and id[-1] == "~"):
raise
Expand All @@ -487,15 +587,25 @@ def _get_image(
raise KeyError("no inline image can be found")
return self.inline_images[id]

imgd = _xobj_to_image(cast(DictionaryObject, xobjs[id]))
extension, byte_stream = imgd[:2]
f = ImageFile(
name=f"{id[1:]}{extension}",
if id.find("/Pattern") == 0:
image_identifier = id[id.rfind("/") :]
image_name = pattern_name[1:] + "_" + image_identifier[1:]
elif id.find("/Annot") == 0:
image_identifier = id[id.rfind("/") :]
image_name = annot_name + "_" + image_identifier[1:]
else:
image_identifier = str(id)
image_name = id[1:]

imgd = _xobj_to_image(cast(DictionaryObject, xobjs[image_identifier]))
image_extension, byte_stream = imgd[:2]

return ImageFile(
name=image_name + str(image_extension),
Comment on lines +590 to +604
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

see above changing this section

data=byte_stream,
image=imgd[2],
indirect_reference=xobjs[id].indirect_reference,
indirect_reference=xobjs[image_identifier].indirect_reference,
)
return f
else: # in a sub object
ids = id[1:]
return self._get_image(ids, cast(DictionaryObject, xobjs[id[0]]))
Expand Down
2 changes: 1 addition & 1 deletion sample-files
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I do not recommend adding files to sample-files: your examples are properly stored in the threads, which is sufficient.

6 changes: 4 additions & 2 deletions tests/example_files.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -112,5 +112,7 @@
url: https://github.com/py-pdf/pypdf/files/12050253/tt.pdf
- local_filename: Pesquisa-de-Precos-Combustiveis-novembro-2023.pdf
url: https://www.joinville.sc.gov.br/wp-content/uploads/2023/11/Pesquisa-de-Precos-Combustiveis-novembro-2023.pdf
- local_filename: iss2138.pdf
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why did these lines disappear?

url: https://github.com/py-pdf/pypdf/files/12483807/AEO.1172.pdf
- local_filename: iss2613-onlyoffice-standardImages.pdf
url: https://github.com/py-pdf/pypdf/files/15355445/iss2613-onlyoffice-standardImages.pdf
- local_filename: iss2613-onlyoffice-form.pdf
url: https://github.com/py-pdf/pypdf/files/15355444/iss2613-onlyoffice-form.pdf
60 changes: 60 additions & 0 deletions tests/test_images.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,66 @@ def test_image_extraction(src, page_index, image_key, expected):
assert image_similarity(BytesIO(actual_image.data), expected) >= 0.99


@pytest.mark.enable_socket()
def test_onlyoffice_standard_images_extraction():
reader = PdfReader(
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A reference to the issue numbers would be appreciated, to ease later analysis.

BytesIO(get_data_from_url(name="iss2613-onlyoffice-standardImages.pdf"))
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As you have url/name defined locally, I recommend doing the same for the PDF and not changing the YAML file above.

)

assert (
str(reader.pages[0].images)
== "[Image_0=/Pattern/P1/X1, Image_1=/Pattern/P2/X1, Image_2=/Pattern/P3/X1]"
)
Comment on lines +224 to +226
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
str(reader.pages[0].images)
== "[Image_0=/Pattern/P1/X1, Image_1=/Pattern/P2/X1, Image_2=/Pattern/P3/X1]"
)
reader.pages[0].images
== ["Image_0":"/Pattern/P1/X1", "Image_1":"/Pattern/P2/X1", "Image_2"="/Pattern/P3/X1"]
)


url = "https://github.com/py-pdf/pypdf/assets/67143274/cc28b39b-2e96-4bd3-b33c-c545c5cec2d9"
name = "iss2613-P1_X1.jpg"
P1_X1 = Image.open(BytesIO(get_data_from_url(url, name=name)))

assert image_similarity(reader.pages[0].images[0].image, P1_X1) >= 0.99

url = "https://github.com/py-pdf/pypdf/assets/67143274/827c9066-546a-4502-a613-579ec25c598e"
name = "iss2613-P2_X1.jpg"
P2_X1 = Image.open(BytesIO(get_data_from_url(url, name=name)))

assert image_similarity(reader.pages[0].images[1].image, P2_X1) >= 0.99

url = "https://github.com/py-pdf/pypdf/assets/67143274/df9cb9e9-e589-4d2e-a537-ae0fe3240bbd"
name = "iss2613-P3_X1.jpg"
P3_X1 = Image.open(BytesIO(get_data_from_url(url, name=name)))

assert image_similarity(reader.pages[0].images[2].image, P3_X1) >= 0.99


@pytest.mark.enable_socket()
def test_onlyoffice_form_images_extraction():
reader = PdfReader(BytesIO(get_data_from_url(name="iss2613-onlyoffice-form.pdf")))
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

see above


assert (
str(reader.pages[0].images)
== "[Image_0=/Annots/Image2_af_image/Img, Image_1=/Annots/Image3_af_image/Img]"
)

assert str(reader.pages[1].images) == "[Image_0=/Annots/Image4_af_image/Img]"

url = "https://github.com/py-pdf/pypdf/assets/67143274/cc28b39b-2e96-4bd3-b33c-c545c5cec2d9"
name = "iss2613-P1_X1.jpg"
P1_X1 = Image.open(BytesIO(get_data_from_url(url, name=name)))

assert image_similarity(reader.pages[0].images[0].image, P1_X1) >= 0.99

url = "https://github.com/py-pdf/pypdf/assets/67143274/827c9066-546a-4502-a613-579ec25c598e"
name = "iss2613-P2_X1.jpg"
P2_X1 = Image.open(BytesIO(get_data_from_url(url, name=name)))

assert image_similarity(reader.pages[0].images[1].image, P2_X1) >= 0.99

url = "https://github.com/py-pdf/pypdf/assets/67143274/df9cb9e9-e589-4d2e-a537-ae0fe3240bbd"
name = "iss2613-P3_X1.jpg"
P3_X1 = Image.open(BytesIO(get_data_from_url(url, name=name)))

assert image_similarity(reader.pages[1].images[0].image, P3_X1) >= 0.99


@pytest.mark.enable_socket()
@pytest.mark.timeout(30)
def test_loop_in_image_keys():
Expand Down
Loading