py-pdf · 0xNath · May 11, 2024 · May 12, 2024 · May 12, 2024 · May 12, 2024
diff --git a/pypdf/_page.py b/pypdf/_page.py
@@ -444,19 +444,78 @@ def _get_ids_image(
         if ancest is None:
             ancest = []
         lst: List[Union[str, List[str]]] = []
-        if PG.RESOURCES not in obj or RES.XOBJECT not in cast(
-            DictionaryObject, obj[PG.RESOURCES]
-        ):
-            return [] if self.inline_images is None else list(self.inline_images.keys())
-
-        x_object = obj[PG.RESOURCES][RES.XOBJECT].get_object()  # type: ignore
-        for o in x_object:
-            if not isinstance(x_object[o], StreamObject):
-                continue
-            if x_object[o][IA.SUBTYPE] == "/Image":
-                lst.append(o if len(ancest) == 0 else ancest + [o])
-            else:  # is a form with possible images inside
-                lst.extend(self._get_ids_image(x_object[o], ancest + [o], call_stack))
+
+        if PG.ANNOTS in obj:
+            for annot in cast(DictionaryObject, obj[PG.ANNOTS]):
-        if PG.ANNOTS in obj:
-            for annot in cast(DictionaryObject, obj[PG.ANNOTS]):
+        for annot_idx,annot in enumerate(cast(DictionaryObject, obj.get(PG.ANNOTS, {}))):
-        if PG.ANNOTS in obj:
-            for annot in cast(DictionaryObject, obj[PG.ANNOTS]):
+        for annot_idx,annot in enumerate(cast(DictionaryObject, obj.get(PG.ANNOTS, {}))):
+                if (
+                    "/AP" in cast(DictionaryObject, annot.keys())
+                    and "/N" in cast(DictionaryObject, annot["/AP"].keys())
+                    and PG.RESOURCES in annot["/AP"]["/N"].get_object()
+                    and RES.XOBJECT
+                    in cast(DictionaryObject, annot["/AP"]["/N"][PG.RESOURCES])
+                    and "/FRM"
+                    in cast(
+                        DictionaryObject, annot["/AP"]["/N"][PG.RESOURCES][RES.XOBJECT]
+                    )
+                ):
+                    frame = annot["/AP"]["/N"][PG.RESOURCES][RES.XOBJECT]["/FRM"]
+
+                    if PG.RESOURCES in frame.get_object() and RES.XOBJECT in cast(
+                        DictionaryObject, frame[PG.RESOURCES]
+                    ):
+                        x_object = frame[PG.RESOURCES][RES.XOBJECT]
+                        for o in x_object:
+                            if (
+                                isinstance(x_object[o], StreamObject)
+                                and x_object[o][IA.SUBTYPE] == "/Image"
+                            ):
+                                lst.extend(
+                                    [
+                                        (
+                                            f"{PG.ANNOTS}/{annot['/T']}{o}"
+                                            if len(ancest) == 0
+                                            else ancest + [o]
+                                        )
+                                    ]
+                                )
+
+        if PG.RESOURCES in obj:
+            if RES.PATTERN in cast(DictionaryObject, obj[PG.RESOURCES]):
+                for pattern_name, pattern in cast(
+                    DictionaryObject,
+                    cast(DictionaryObject, obj[PG.RESOURCES])[RES.PATTERN],
+                ).items():
+                    if PG.RESOURCES in pattern.get_object() and RES.XOBJECT in cast(
+                        DictionaryObject, pattern[PG.RESOURCES]
+                    ):
+                        x_object = pattern[PG.RESOURCES][RES.XOBJECT].get_object()
+                        for o in x_object:
+                            if (
+                                isinstance(x_object[o], StreamObject)
+                                and x_object[o][IA.SUBTYPE] == "/Image"
+                            ):
+                                lst.extend(
+                                    [
+                                        (
+                                            f"{RES.PATTERN}{pattern_name}{o}"
+                                            if len(ancest) == 0
+                                            else ancest + [o]
+                                        )
+                                    ]
+                                )
+
+            if RES.XOBJECT in cast(DictionaryObject, obj[PG.RESOURCES]):
+                x_object = obj[PG.RESOURCES][RES.XOBJECT].get_object()  # type: ignore
+                for o in x_object:
+                    if not isinstance(x_object[o], StreamObject):
+                        continue
+                    if x_object[o][IA.SUBTYPE] == "/Image":
+                        lst.append(o if len(ancest) == 0 else ancest + [o])
+                    else:  # is a form with possible images inside
+                        lst.extend(
+                            self._get_ids_image(x_object[o], ancest + [o], call_stack)
+                        )
+
         assert self.inline_images is not None
         lst.extend(list(self.inline_images.keys()))
         return lst
@@ -473,9 +532,50 @@ def _get_image(
         if isinstance(id, List) and len(id) == 1:
             id = id[0]
         try:
-            xobjs = cast(
-                DictionaryObject, cast(DictionaryObject, obj[PG.RESOURCES])[RES.XOBJECT]
-            )
+            if isinstance(id, str) and id.find(RES.PATTERN) == 0:
+                pattern_name = id[len(RES.PATTERN) : id.find("/", len(RES.PATTERN) + 1)]
+
+                patterns = cast(
+                    DictionaryObject,
+                    cast(DictionaryObject, obj[PG.RESOURCES])[RES.PATTERN],
+                )
+
+                xobjs = cast(
+                    DictionaryObject,
+                    cast(
+                        DictionaryObject,
+                        cast(DictionaryObject, patterns[pattern_name])[PG.RESOURCES],
+                    )[RES.XOBJECT],
+                )
+            elif isinstance(id, str) and id.find(PG.ANNOTS) == 0:
+                annot_name = id[len(RES.PATTERN) : id.find("/", len(RES.PATTERN) + 1)]
+                annots = cast(DictionaryObject, obj[PG.ANNOTS])
+
+                for temp_annot in annots:
+                    if temp_annot["/T"] == annot_name:
+                        annot = temp_annot
+                        break
+
+                frame_xobjs = cast(
+                    DictionaryObject,
+                    cast(
+                        DictionaryObject,
+                        cast(DictionaryObject, annot["/AP"]["/N"])[PG.RESOURCES],
+                    )[RES.XOBJECT],
+                )
+
+                xobjs = cast(
+                    DictionaryObject,
+                    cast(
+                        DictionaryObject,
+                        cast(DictionaryObject, frame_xobjs["/FRM"])[PG.RESOURCES],
+                    )[RES.XOBJECT],
+                )
+            else:
+                xobjs = cast(
+                    DictionaryObject,
+                    cast(DictionaryObject, obj[PG.RESOURCES])[RES.XOBJECT],
+                )
         except KeyError:
             if not (id[0] == "~" and id[-1] == "~"):
                 raise
@@ -487,15 +587,25 @@ def _get_image(
                     raise KeyError("no inline image can be found")
                 return self.inline_images[id]
 
-            imgd = _xobj_to_image(cast(DictionaryObject, xobjs[id]))
-            extension, byte_stream = imgd[:2]
-            f = ImageFile(
-                name=f"{id[1:]}{extension}",
+            if id.find("/Pattern") == 0:
+                image_identifier = id[id.rfind("/") :]
+                image_name = pattern_name[1:] + "_" + image_identifier[1:]
+            elif id.find("/Annot") == 0:
+                image_identifier = id[id.rfind("/") :]
+                image_name = annot_name + "_" + image_identifier[1:]
+            else:
+                image_identifier = str(id)
+                image_name = id[1:]
+
+            imgd = _xobj_to_image(cast(DictionaryObject, xobjs[image_identifier]))
+            image_extension, byte_stream = imgd[:2]
+
+            return ImageFile(
+                name=image_name + str(image_extension),
                 data=byte_stream,
                 image=imgd[2],
-                indirect_reference=xobjs[id].indirect_reference,
+                indirect_reference=xobjs[image_identifier].indirect_reference,
             )
-            return f
         else:  # in a sub object
             ids = id[1:]
             return self._get_image(ids, cast(DictionaryObject, xobjs[id[0]]))

diff --git a/sample-files b/sample-files
diff --git a/tests/example_files.yaml b/tests/example_files.yaml
@@ -112,5 +112,7 @@
   url: https://github.com/py-pdf/pypdf/files/12050253/tt.pdf
 - local_filename: Pesquisa-de-Precos-Combustiveis-novembro-2023.pdf
   url: https://www.joinville.sc.gov.br/wp-content/uploads/2023/11/Pesquisa-de-Precos-Combustiveis-novembro-2023.pdf
-- local_filename: iss2138.pdf
-  url: https://github.com/py-pdf/pypdf/files/12483807/AEO.1172.pdf
+- local_filename: iss2613-onlyoffice-standardImages.pdf
+  url: https://github.com/py-pdf/pypdf/files/15355445/iss2613-onlyoffice-standardImages.pdf
+- local_filename: iss2613-onlyoffice-form.pdf
+  url: https://github.com/py-pdf/pypdf/files/15355444/iss2613-onlyoffice-form.pdf
diff --git a/tests/test_images.py b/tests/test_images.py
@@ -214,6 +214,66 @@ def test_image_extraction(src, page_index, image_key, expected):
     assert image_similarity(BytesIO(actual_image.data), expected) >= 0.99
 
 
+@pytest.mark.enable_socket()
+def test_onlyoffice_standard_images_extraction():
+    """Images reached through page pattern resources are listed and extracted."""
+    reader = PdfReader(
+        BytesIO(get_data_from_url(name="iss2613-onlyoffice-standardImages.pdf"))
+    )
+    assert (
+        str(reader.pages[0].images)
+        == "[Image_0=/Pattern/P1/X1, Image_1=/Pattern/P2/X1, Image_2=/Pattern/P3/X1]"
+    )
+    # Each extracted image is compared with a reference picture fetched by URL.
+    url = "https://github.com/py-pdf/pypdf/assets/67143274/cc28b39b-2e96-4bd3-b33c-c545c5cec2d9"
+    name = "iss2613-P1_X1.jpg"
+    P1_X1 = Image.open(BytesIO(get_data_from_url(url, name=name)))
+
+    assert image_similarity(reader.pages[0].images[0].image, P1_X1) >= 0.99
+
+    url = "https://github.com/py-pdf/pypdf/assets/67143274/827c9066-546a-4502-a613-579ec25c598e"
+    name = "iss2613-P2_X1.jpg"
+    P2_X1 = Image.open(BytesIO(get_data_from_url(url, name=name)))
+
+    assert image_similarity(reader.pages[0].images[1].image, P2_X1) >= 0.99
+
+    url = "https://github.com/py-pdf/pypdf/assets/67143274/df9cb9e9-e589-4d2e-a537-ae0fe3240bbd"
+    name = "iss2613-P3_X1.jpg"
+    P3_X1 = Image.open(BytesIO(get_data_from_url(url, name=name)))
+
+    assert image_similarity(reader.pages[0].images[2].image, P3_X1) >= 0.99
+
+
+@pytest.mark.enable_socket()
+def test_onlyoffice_form_images_extraction():
+    """Check listing and extraction of images referenced under /Annots ids."""
+    pdf_bytes = get_data_from_url(name="iss2613-onlyoffice-form.pdf")
+    reader = PdfReader(BytesIO(pdf_bytes))
+
+    # Listing: the string form of each page's image collection is pinned.
+    listing_page_0 = str(reader.pages[0].images)
+    assert listing_page_0 == (
+        "[Image_0=/Annots/Image2_af_image/Img, Image_1=/Annots/Image3_af_image/Img]"
+    )
+    assert str(reader.pages[1].images) == "[Image_0=/Annots/Image4_af_image/Img]"
+
+    # Extraction: (page index, image index, `name` argument, reference URL).
+    references = [
+        (0, 0, "iss2613-P1_X1.jpg",
+         "https://github.com/py-pdf/pypdf/assets/67143274/cc28b39b-2e96-4bd3-b33c-c545c5cec2d9"),
+        (0, 1, "iss2613-P2_X1.jpg",
+         "https://github.com/py-pdf/pypdf/assets/67143274/827c9066-546a-4502-a613-579ec25c598e"),
+        (1, 0, "iss2613-P3_X1.jpg",
+         "https://github.com/py-pdf/pypdf/assets/67143274/df9cb9e9-e589-4d2e-a537-ae0fe3240bbd"),
+    ]
+
+    for page_index, image_index, name, url in references:
+        # Fetch the reference picture first, then decode the image from the PDF.
+        expected = Image.open(BytesIO(get_data_from_url(url, name=name)))
+        extracted = reader.pages[page_index].images[image_index].image
+        assert image_similarity(extracted, expected) >= 0.99
+
+
 @pytest.mark.enable_socket()
 @pytest.mark.timeout(30)
 def test_loop_in_image_keys():