From c86d88613ea06feb5ecf6ac10d05b5814d6be70f Mon Sep 17 00:00:00 2001 From: IgorSusmelj Date: Sat, 14 Dec 2024 12:21:32 +0100 Subject: [PATCH 1/5] Add methods for fast image size estimation --- src/labelformat/utils.py | 121 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 119 insertions(+), 2 deletions(-) diff --git a/src/labelformat/utils.py b/src/labelformat/utils.py index fd5d757..04b6ca9 100644 --- a/src/labelformat/utils.py +++ b/src/labelformat/utils.py @@ -21,6 +21,124 @@ } +class ImageDimensionError(Exception): + """Raised when unable to extract image dimensions using fast methods.""" + + pass + + +def get_jpeg_dimensions(file_path: Path) -> tuple[int, int]: + """Try to efficiently get JPEG dimensions from file headers without decoding the image. + + This method reads only the JPEG file headers looking for the Start Of Frame (SOFn) + marker which contains the dimensions. This is much faster than decoding the entire + image as it: + - Only reads the file headers (typically a few KB) instead of the entire file + - Doesn't perform any image decompression + - Doesn't load the pixel data into memory + + This works for most standard JPEG files (including progressive JPEGs) but may fail + for some unusual formats or corrupted files. In those cases, an ImageDimensionError + is raised and a full image decode may be needed as fallback. + + Args: + file_path: Path to the JPEG file + + Returns: + Tuple of (width, height) + + Raises: + ImageDimensionError: If dimensions cannot be extracted from headers + """ + try: + with open(file_path, "rb") as img_file: + # Skip SOI marker + img_file.seek(2) + while True: + marker = img_file.read(2) + if len(marker) < 2: + raise ImageDimensionError("Invalid JPEG format") + # Find SOFn marker + if 0xFF == marker[0] and marker[1] in range(0xC0, 0xCF): + # Skip marker length + img_file.seek(3, 1) + h = int.from_bytes(img_file.read(2), "big") + w = int.from_bytes(img_file.read(2), "big") + return w, h + # Skip to next marker + length = int.from_bytes(img_file.read(2), "big") + img_file.seek(length - 2, 1) + except Exception as e: + raise ImageDimensionError(f"Failed to read JPEG dimensions: {str(e)}") + + +def get_png_dimensions(file_path: Path) -> tuple[int, int]: + """Try to efficiently get PNG dimensions from file headers without decoding the image. + + This method reads only the PNG IHDR (Image Header) chunk which is always the first + chunk after the PNG signature. This is much faster than decoding the entire image as it: + - Only reads the first ~30 bytes of the file + - Doesn't perform any image decompression + - Doesn't load the pixel data into memory + + This works for all valid PNG files since the IHDR chunk is mandatory and must appear + first according to the PNG specification. However, it may fail for corrupted files + or files that don't follow the PNG spec. In those cases, an ImageDimensionError is + raised and a full image decode may be needed as fallback. + + Args: + file_path: Path to the PNG file + + Returns: + Tuple of (width, height) + + Raises: + ImageDimensionError: If dimensions cannot be extracted from headers + """ + try: + with open(file_path, "rb") as img_file: + # Skip PNG signature + img_file.seek(8) + # Read IHDR chunk + chunk_length = int.from_bytes(img_file.read(4), "big") + chunk_type = img_file.read(4) + if chunk_type == b"IHDR": + w = int.from_bytes(img_file.read(4), "big") + h = int.from_bytes(img_file.read(4), "big") + return w, h + raise ImageDimensionError("Invalid PNG format") + except Exception as e: + raise ImageDimensionError(f"Failed to read PNG dimensions: {str(e)}") + + +def get_image_dimensions(image_path: Path) -> tuple[int, int]: + """Get image dimensions using the most efficient method available. + + Args: + image_path: Path to the image file + + Returns: + Tuple of (width, height) + + Raises: + Exception: If image dimensions cannot be extracted using any method + """ + suffix = image_path.suffix.lower() + if suffix in {".jpg", ".jpeg"}: + try: + return get_jpeg_dimensions(image_path) + except ImageDimensionError: + pass + elif suffix == ".png": + try: + return get_png_dimensions(image_path) + except ImageDimensionError: + pass + + with PIL.Image.open(image_path) as img: + return img.size + + def get_images_from_folder(folder: Path) -> Iterable[Image]: """Yields an Image structure for all images in the given folder. @@ -36,8 +154,7 @@ def get_images_from_folder(folder: Path) -> Iterable[Image]: logger.debug(f"Skipping non-image file '{image_path}'") continue image_filename = str(image_path.relative_to(folder)) - with PIL.Image.open(image_path) as img: - image_width, image_height = img.size + image_width, image_height = get_image_dimensions(image_path) yield Image( id=image_id, filename=image_filename, From 9111bca559ee03926c40fda4eddcb12709617af1 Mon Sep 17 00:00:00 2001 From: IgorSusmelj Date: Sat, 14 Dec 2024 12:21:44 +0100 Subject: [PATCH 2/5] Add new tests for image size estimation --- tests/fixtures/image_file_loading/0001.png | Bin 0 -> 7006 bytes tests/unit/test_utils.py | 72 +++++++++++++++++++++ 2 files changed, 72 insertions(+) create mode 100644 tests/fixtures/image_file_loading/0001.png create mode 100644 tests/unit/test_utils.py diff --git a/tests/fixtures/image_file_loading/0001.png b/tests/fixtures/image_file_loading/0001.png new file mode 100644 index 0000000000000000000000000000000000000000..08c375ab777b4f9d12052cb7e35b01e2ad9acd08 GIT binary patch literal 7006 zcmV-k8=>ThP)!4&|)8vw&75W^-6zyJWkA^^b`0KpId!z2g8BL)8X`2YO-{Pgqh%*f@f zr}yUM^V!z$(9HVn>&H1J-II*>j*hvA=@%~M6oOg+g$GS+!- z&|gx=J1Nw0XY$t4;-sGIzq;Coe$i)J=d`caeRIrFL&!ic&RR{;WLE`N8?XQX8iPqh zK~#90+Ig|*Y4QhaC%Ft^^e2z$Nj~8 z|Ja;o{hs&ktN9%`%Hz&FaJjn9e0+jjupYsI`6)PZ-+no#7+4m1)9G5F1{Co#TtHvPf|Gl zkXsimmS5%*26y}JoB1aAvo-n-K4Gx_1+DAP8tGUAL1>8COaJS&OEmk0>aqiDXx(O~ zAy^8@3D94iv5OXA13ocy9PUrV3P=7J3Etse-hF4y5gnW8SQA;{e3&!C!xg-#y?^6< zeFcO5X9FE;I{Io>Qm5Wt#4YCXO}F(WM6+cR*+KYz98|w+c-Q&2+M9RW)|U_*S55c? z;oX9xNrLVt|7FIm^;Z!lTC5xJNlcT^V^sH@2Z0!&*`K_tuVVZvA1kDZwXUa00#opv zPW$i3cL$5en?aRC8>Iqw{0%}h=U0)f$}=M!grl&ql!JiUQ6`PsYr8V0ASg=|wZS3P*Yh+Hob0K%)0LUv%z zYh8Wxb2s&McwTpJt=1SvJ~X6E5QufGHuIGGuDtJdR;wEh09OXqsAA@7#h z&+Da0PH7=yF%IW--Uel^MQ@3$D)rIun{QGP-uQtXS2f--kh7A}Co!M2%dKISFjMPS zjWp`NQHs3NUc@J_XrfzdNqw^BT7E1sgMHYQ-w5ATb@j%v?6j5nM29;m-#%8?f)YKj zgtr@*oD{vK3CJ7oAyk$fpySm6uOhW2HT1xN%CyQ z{2&?ccs9={ZHTvZnl!2^mp`59j`ay{AUle2@u_;7ZD?M`_4cZYP+^i(Jr@e`l z`6Sk{;Y=-hyB4EM;6iIZejXg`t5;=cf2g|HnQRbL%(4+3dsE>PieAPoz=q%%pHR#v z)c`wu(E^6~^i$ z<$pvspU>&wADtbE`2?!#*o05cU+};eB+LPtoaOTG6P_%F@pIXz-X>I+9Ye?V{P6NQ zk1yxi2kO)456fk%`$Ny-F9ZEd|8i$|Mp6y;RX$mtWuDW~Y9p)GjIM|j$w$Im(nZts zpJ-nmo_CFCqRIMfNtVPg>dYvUB3(A$?P=V#8av6)O2eOP>doaK*0ED*FRDY>zBYD= zs8m4dn$MAE7yPYZbR4Yb}a0sDJwbkfk)Kz4BWF;VXl*pM8F`>4AXDil>)&02?7&}6S*rOXP%9mK<r|-g>WAWTil}`VxC*X| zce?26eA0$^ud)C`&^61xNvDJekd!Sd!q6>kR71v-K<8>uR*Jmq9Vr(xlHB25E+>@X zR!~~N0J@n)6gqQ8XzSQ3kFG!y_`IMZn46Xxjp$4^4&;hWuV|9!SQ%UFoz5S}jCR?m z0??h?&0K8H>$JDPq7R(eqS`##x~J54 z-SGoW-bGOu^Th@GMS`{CLYFx#8XAy<;DVHDE1hS?(YJT8(QotmCQE#&UPsQYq%rinM@JcFCm zXA~l>OIE-_X*!lWnzTyk z4xI7^HJCB?1i5NBFzZ8e46Lf^)G<2E(aHL%zUJP!u1?@o@1;A#sudp7fe#sd(w&TE zrMDG)udKVAW}1v!>67%jS%!girqoK&k!2SoJtp6AJ~_##N>Q-lGc!-&I`A%KcY2#0 zM2=l_txkCX3z+P06Q(*!W`w#kDQiET+oA!>bjwLmt2O%QafP&s#_l1-1}^6zr(-)y zi}atEcHto&3YUW!y*gSdMFmiwE3HBp=WQY2e1^3E;J;FfcAeW<2rO*oZg)k`Wb zsE{l53S`mKG6%IqMyWrNdU>GDJ|>r40J6F+V3J!zQHuKf=)`lXtYus73!zF;JZP8& zC>@wBVjNv+1f(Vl=X{P&mc7adG4UT8AThI3w=6=smj#uJ%XSG9`+13?GO;I7<$9WZz z=nV5gZaTL`REom<(=0s9%BZrL?ox)->o0Xj$KrYiKTV%%>_9Z~S%rb%HW?hfuI`%; zU!F_5yZp-w9gY3ue(gE2zZo4^T$C?uIxvlBXz5>GNf9pHS*zGeZwdLO&)MHp*DX1< zQ-$eRYc`1P1)aJ;ED+|wib^fNTw;yg{h7R&dURdAEJcen2+8F=&2-n0_jksH zR(hkR*S&@;qZjjDX5ya|?)WIK^!jVd5-mmS74(*wYFApP0pSoApAYl(sSb{2x|7Ow zUetIM7aen3+yM`Y7cc3nk}oKo&pcEPz$_GYz}9ts<-KC0O$N|t^7bgk*2&&n`uXKaW!nAg)T9=NTmrDK^biif*Y#CwNHhGH09! zKh)QYM0;8YYMeZtEHQ6(MA=&4wD*eb-knULL$33;)49gis`tqQNJ9YPUb+v)AK6MT zhiCm|Szg9W|2&V2cof~CA~+#Ji!Aw5f{hl+9WdS8&fzMT1tiI?�+p_v0tb9`kjt zY>O()8*BV(V_4t{J-XhUjUDlh2!+^fNj;$=S&l71sHCAwlL~cC7J(RTwM1VGs2-s{ z1@XEQks^jLjRi|mWaw};*0wd?nQV1?(~u_QS)^86qyXA(G9!wO zk2V(*(FaL0vxIeGtxx&GxG~9P*#-QhOgf-xt!*qj?NZ@}IeC$WG#5Cx z&1x1u#lU}}6TBOzppz@%)NtNr#o=z3v{P*TN)z#?`;q$k5e-#UXrs$I)e!T&a6u-mYD#OHxg|G_^7Eyfp z#B>0cy|pLGbeGW!#d{dIlu9f^z1jRGinqym8iMZtNy>R!B?}e_h~J;uLVa@*F%k(#Vufk?BE1#*OZt-`+%XcgG8qW&YYYOHi;s@A=V_4N z<)|)HbtdFQP_LSCqAGyVYCXt%LA&&Xa&zNS*61>KG|9kJY*8`0AVwEgb7}^{YuRbO zpnrc-ImLWZ+rU?q^*t}I9SDX`K(~b&xz7^XO73qU!0H6b<^BV__g`FHMv21v^jMn3&7OZDJEdrs+H_ z(vAbKoRFpGKP!L5)yVA;ZJ6uV`d`=@)G@}0?wbfoZyTI%WcyfcsI9f7kbDZ>a z@MLei-0=YEHS{^Z#p1q>@oP4_m4u{e9eCrV1aI4w#H6v^dMQ)6!~W=hB|pi;!%Dj`LZ7@jrhb~jM)wfI=EY8WY5 z$}vh3q#B?cf{|D;9q8FoY8O{JoO3~;En>zRWTZQnY*FX-(iX`ec4VP^3m_;4avt}& z>cz0>$+F=NG+aH%hB(Y=cL=Mp6i+W(?PaW-f^dIzI?@|gb1F21Ub6qON#jEEM(VUBgesOuNYu+-Ot%qzRf>Y+UAXC{ z9%iLih);c1FNR15uwPvYu_jsCq{R{$1;fUqDEY&n&{$4$yA@lc7kk4LG3c+bY6~E$ zHl_^fYOon-L(ufj=PR|xVD^7(7oK0Zlm5W|(i&|b*DPXTxq$o&2p1RTwkIwi4loxXu zItHU!t8E`YTaq1sBHkuuNTc&{rjKLjpzHofOn!UGpBR;6et>S_S4qZi*7u&U|dT#Z38 z7U&T7mN{)%%7j^E7EmiKP41j7y7F=%knCTRm_OzgvT|wnjkpZ}zQEKW)CB>Aqq#;F<$;MW0$0tW+%gy)jrvg;hc6Y)WHr2!Y=?cd4Gx9Mp zT5J*)Vdf@FZ!w~IJ7i+%Q@45>t;uYhBR3nDfR_WQBzJ$tjbfuXmKIy_Y9R%x|19bBo+^za8z;rCeh-tNcKfQri zfzbaSA-Qt4@@5pVkClRvKeff=<`5@7E4ftf?5CNy0PD zH_+0o^zd;jF;5#ueKyWS&b_a`g@o_@;iM==G!HF^+NNO`#T7!Ce%|L9t(Z^nmOh+S zsajtyta`Wjc^I|ZB&z66K0t#7ZpeCaXwg$}oiau>yFf7*k1n6<-7W+uPw=EYX_@~M zTkd?&$Y;Vw>dq{Om!%)PrGtTFJ6i9}yWpMU%71$dKkU zncr%|N9n|d5e+U_E`=0_K(43#*NgaCNkaAv)b)w#f>{oH>Qdq$mSE35<*=qTLYDTX zDichh(@tetkOxZ2Z#dnl^GQq-=~81IJT9WBX<{vd6+W0#OxyAK1S~6_GV7KE018W< zJH2H2VT0vDE2+iiFThKY)IltCAe*TQs<;IcaZ^e?TA1tJu`Yd{dGCX3?RgDDw}AA* z>gNzhkxbe>ofji=Lt{ljwYE!JV^WKAJxeVU><*`PqveMU77^D$AVp8xky21gO}%-# zoU2lUC7}7e`t~yc4)X?lk}S#m%7R%Q{JC0u+T9y+oJqqeIxiCt-D2DX>^q9-qRVd> z@<%%*J|Q_dU_B$hTn}g z2Ac-U4`0FK!bMJs?r!WIRqlv(8B&g zE((kmyH|^dfyS*7ccFRtgM2~}a4|^Lk@rH;ow%ywYGn5N1)mu1$_Uqm3rg~!7nLQM z-v&WkR57-`V08z;N`=5^A+w0sSYJald1UW%9i*JN|1Jn@QMDVj$CX8ZKXMDMZyVa#H6)Q)x9aQsUale&NWAuJLVc?FCtJ!%X5y@YP zvwPe;oAj!s$}fW;N7wXVjg2)4E}^R{FE)q2<P zK6b|rES*!rZ)-{5&sOR=YT~o~J+hO-A3kh!DlK;5wvx13!+EG)-${J7^{@5WUSMT8 z=v1!M>z~CKztfw#>iulAq3!qE;yUzCSI{=gr_{8yG}`a>rVV<_>8a0uFHH!#Thkft zN^}9RJ-_f8v>9c=O2kYxBo`qbxNjY+Irw2#(IcFX-! zn!xbe8`|E5gC8a4LUcHo3X-*yXTA;bFGhFgA6kzu@`?Fx(Im!TwLR}$@zOgvKH2>< zvQxrowso=UAT8q)jLyS9)0Fe);a`!zQKn16C-o~`{{aMN!{^2iU(&>!{5Gj8{}cumCL26=A18i?Y{n;H?f&n76r<_1o$TPzr)+a6Pm})_pHOrrdkcVY zjKe9vd?L}Y|2IBK5sk5Kz~$w$Qk({MY>WSYe3D~uwv+Wv*FqUK<`eTLhMN8^bWYQ7 wGBh;AfVGeRJxwS&!Jv0s924~W|Gez{2ij!7&rP)j`Tzg`07*qoM6N<$f(Dql761SM literal 0 HcmV?d00001 diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py new file mode 100644 index 0000000..30c0913 --- /dev/null +++ b/tests/unit/test_utils.py @@ -0,0 +1,72 @@ +import pytest +from pathlib import Path + +from labelformat.utils import ( + get_jpeg_dimensions, + get_png_dimensions, + get_image_dimensions, + ImageDimensionError, +) + +FIXTURES_DIR = Path(__file__).parent.parent / "fixtures" + + +class TestImageDimensions: + def test_jpeg_dimensions_valid_file(self) -> None: + image_path = ( + FIXTURES_DIR / "instance_segmentation/YOLOv8/images/000000109005.jpg" + ) + width, height = get_jpeg_dimensions(image_path) + assert width == 640 + assert height == 428 + + def test_jpeg_dimensions_nonexistent_file(self) -> None: + with pytest.raises(ImageDimensionError): + get_jpeg_dimensions(Path("nonexistent.jpg")) + + def test_jpeg_dimensions_invalid_format(self) -> None: + yaml_file = FIXTURES_DIR / "object_detection/YOLOv8/example.yaml" + with pytest.raises(ImageDimensionError): + get_jpeg_dimensions(yaml_file) + + def test_png_dimensions_valid_file(self) -> None: + png_path = FIXTURES_DIR / "image_file_loading/0001.png" + width, height = get_png_dimensions(png_path) + assert width == 278 + assert height == 181 + + def test_png_dimensions_nonexistent_file(self) -> None: + with pytest.raises(ImageDimensionError): + get_png_dimensions(Path("nonexistent.png")) + + def test_png_dimensions_invalid_format(self) -> None: + yaml_file = FIXTURES_DIR / "object_detection/YOLOv8/example.yaml" + with pytest.raises(ImageDimensionError): + get_png_dimensions(yaml_file) + + def test_get_image_dimensions_jpeg_first_file(self) -> None: + jpeg_path = ( + FIXTURES_DIR / "instance_segmentation/YOLOv8/images/000000109005.jpg" + ) + width, height = get_image_dimensions(jpeg_path) + assert width == 640 + assert height == 428 + + def test_get_image_dimensions_jpeg_second_file(self) -> None: + jpeg_path = ( + FIXTURES_DIR / "instance_segmentation/YOLOv8/images/000000036086.jpg" + ) + width, height = get_image_dimensions(jpeg_path) + assert width == 482 + assert height == 640 + + def test_get_image_dimensions_png(self) -> None: + png_path = FIXTURES_DIR / "image_file_loading/0001.png" + width, height = get_image_dimensions(png_path) + assert width == 278 + assert height == 181 + + def test_get_image_dimensions_unsupported_format(self) -> None: + yaml_file = FIXTURES_DIR / "object_detection/YOLOv8/example.yaml" + with pytest.raises(Exception): + get_image_dimensions(yaml_file) From 4a9fda0cbed658bf0986a86263586f7fb09b63a0 Mon Sep 17 00:00:00 2001 From: IgorSusmelj Date: Sat, 14 Dec 2024 16:03:08 +0100 Subject: [PATCH 3/5] Use Python 3.7+ compatible type hints --- src/labelformat/utils.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/labelformat/utils.py b/src/labelformat/utils.py index 04b6ca9..2eab1ca 100644 --- a/src/labelformat/utils.py +++ b/src/labelformat/utils.py @@ -1,6 +1,6 @@ import logging from pathlib import Path -from typing import Iterable +from typing import Iterable, Tuple import PIL.Image @@ -27,7 +27,7 @@ class ImageDimensionError(Exception): pass -def get_jpeg_dimensions(file_path: Path) -> tuple[int, int]: +def get_jpeg_dimensions(file_path: Path) -> Tuple[int, int]: """Try to efficiently get JPEG dimensions from file headers without decoding the image. This method reads only the JPEG file headers looking for the Start Of Frame (SOFn) @@ -72,7 +72,7 @@ def get_jpeg_dimensions(file_path: Path) -> tuple[int, int]: raise ImageDimensionError(f"Failed to read JPEG dimensions: {str(e)}") -def get_png_dimensions(file_path: Path) -> tuple[int, int]: +def get_png_dimensions(file_path: Path) -> Tuple[int, int]: """Try to efficiently get PNG dimensions from file headers without decoding the image. This method reads only the PNG IHDR (Image Header) chunk which is always the first @@ -111,7 +111,7 @@ def get_png_dimensions(file_path: Path) -> tuple[int, int]: raise ImageDimensionError(f"Failed to read PNG dimensions: {str(e)}") -def get_image_dimensions(image_path: Path) -> tuple[int, int]: +def get_image_dimensions(image_path: Path) -> Tuple[int, int]: """Get image dimensions using the most efficient method available. Args: From f1d5126a838b9b539990154807db335af399e304 Mon Sep 17 00:00:00 2001 From: IgorSusmelj Date: Sat, 14 Dec 2024 16:04:47 +0100 Subject: [PATCH 4/5] Format file --- tests/unit/test_utils.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py index 30c0913..a2c2a0f 100644 --- a/tests/unit/test_utils.py +++ b/tests/unit/test_utils.py @@ -1,11 +1,12 @@ -import pytest from pathlib import Path +import pytest + from labelformat.utils import ( + ImageDimensionError, + get_image_dimensions, get_jpeg_dimensions, get_png_dimensions, - get_image_dimensions, - ImageDimensionError, ) FIXTURES_DIR = Path(__file__).parent.parent / "fixtures" From db36cc85cafac9bf106a989e86ca912ad0ced464 Mon Sep 17 00:00:00 2001 From: IgorSusmelj Date: Thu, 19 Dec 2024 22:34:21 +0100 Subject: [PATCH 5/5] Pin ubuntu version to 22 --- .github/workflows/run-tests.yml | 34 ++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml index 8d20dbf..6dba284 100644 --- a/.github/workflows/run-tests.yml +++ b/.github/workflows/run-tests.yml @@ -11,29 +11,29 @@ on: jobs: test: name: Tests - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 strategy: matrix: python: ["3.7", "3.10"] steps: - - name: Checkout code - uses: actions/checkout@v3 + - name: Checkout code + uses: actions/checkout@v3 - - name: Set up Python - uses: actions/setup-python@v4 - with: - python-version: ${{ matrix.python }} + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python }} - - name: Install Poetry - uses: snok/install-poetry@v1 - with: - version: 1.4.2 + - name: Install Poetry + uses: snok/install-poetry@v1 + with: + version: 1.4.2 - - name: Install the package and dependencies - run: | - poetry install + - name: Install the package and dependencies + run: | + poetry install - - name: Run tests - run: | - poetry run make all-checks + - name: Run tests + run: | + poetry run make all-checks