From 78bbef084efb7de59b8477f35bd4e00ca521fa24 Mon Sep 17 00:00:00 2001 From: Georgios Papametis Date: Fri, 16 Jan 2026 19:52:07 +0200 Subject: [PATCH 1/4] recovered lost work --- RECOVERED_FILE.tmp | Bin 0 -> 13216 bytes 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 RECOVERED_FILE.tmp diff --git a/RECOVERED_FILE.tmp b/RECOVERED_FILE.tmp new file mode 100644 index 0000000000000000000000000000000000000000..bd865075ffafd95c58db83018615bae53eb72602 GIT binary patch literal 13216 zcmeHO-EJF26duWUNW8;hKscm!61QoZh@zIJfugplnnnmkmFwTe)Nz9CHbkKhQE|-$ zH_cTqcnRRCfbToU$1}U$b=)?E2rMf*v$Hd2&i|a5Gx_V!y4!bq__f`@MXv5Hxw5;0 za@n=qqHDSuS}wZ=eiu=$qOFF%hFikhGTv9wuZ|IAS4EF9zBVwTiuS5oaqqkL+#Nva zxyN{}$qXy#RmZ3%{v!Ml{t{r8QDgt>?kRBD1ztTF@g;h){uns!yFSL;1MCC*?cvJ+ zt!(>DTH>wOFi#)&l+agUmH?&aK9_mc(K;An6#>EqzO+$(jrJ}|Qpo2juqcDJHQ;v< zEoH%#-{z&Zh$r#pAK~!#jk^UbBj8I&dw|$REsg#=p!b%$>mJD0ZBWq|;uWE+3w?yr zcMEvi1-^Cn1aL`DyhntVeY8CjybJC_!Ssro8)4S~7A@g=(G>xwj@}K7A(s{au@C4y z)cUB`(5jk2ZZy!p>3+Zry&+abz#?6gYablmLmzU0bT*`K1g%&FoF?8c0@{VddkLf1 zqk(4)kd|dUF)IpX3;1%jxXnXi^MG9gjb{z}n&U45?#rN*IuftX2kqAPW{jhhpYoVi z(G%`d!qoS5XeV3Su7f_NX;waR{Mp+_=t(VVq3;puljKGmi=tH>j3ET&{x)Q)vZc;W z4{gV70^0}B=uPPKmV1cNialW~_v5gsho|8<87l3~XpdZyr=gaHd^MgPf~V?j)S`Oe z7ir%E)g961$AC}&Ls@i0M)VV1Xntulub=-qI8X0HSvD}nOY%^lJV9Gc(*fvI4tJ%mYQcG?XgWGf?`Scv zawoapa<6hPb1!lcJUUmar_iA$I4}T@BJ|+5+4FpG;3W5Z&h(ZT{RF9f3MuJN{hhtB z+<-^B4d2504(yt-3u6lU1A552Vq;D5cROMiw7<608qyjNX=)(?7&LKPrS4!Q{aP?p;pm7+Z4@TA+ zD{&nzq5k&2nQuA2n)Ggb3*#?J{pVS{GK^)UP69? zd91KLKWtyZ*J#vx(z(CSRWRqqEKo_#i46zBk8sLdgy-2q$*(*!x9?VZTp5G0m1Xi2 z-@FYoAU*gQ=DuuRp3XCq_y``1`67Srsu;EKCBA$1SuEXes~_{(EW*~aZ9TtLFUKc$ z-G-x;aSB47)xjBJnuS+{(|GgMMvdWY(5(#NCeQ7&WYHSon#_x6Zej_uhxw7UpntaX zLOFTb({hU*XXY-MO(AthGE0~zWG0!sT7mX8p`Gz}<_Vdz;F)U^eJ;y*t`U|eoHP%y zwVdCie%?aL$oZBfTS?i3Z}X!!&F|S5Jr7Q1u_`g%QjWGZq?M{~``{8YIu+4wYW-EL z*gfbp`H}@=F6G<9VVhE~GltPZWw_q+^!hq1vkTi{R!Q?#JAo#|vArjKb}{Nl{Mos; z;ug+jqXv-o0kH2uiea={$eh7O=0(CG&KBY1Q)shMR&w;do2I8_RV(QK1k@dY=Nl+H z@H58sG|FryS|d;FnCD4KYtp!BB|ZtpA^1wKR7Fmanc>%&pYa>BzE{Dee<~83BoEW# zkY47Gua4;%IX{QLB5rbc9aQh7UO{E!?Y#tA!`lmS+vM#9&B7MI4`$Bw&P0gGo5Q*Q z978QO%hNfCsoAxa*MKeahK#}K3HBjlzFT&e*zzp6fG2a6lzTowVG;e6=DhGK!hH~~ zc502Z`ZiW$T0Qk}2T-WPb>ymu#cAGrGB?wGd>!-fhR`B>IemQiP9MJ_{=bGE4b+H5 zmXDuqc1kbJ^G`S0e0x#uCS8yKu+>>wWOCitlzqAgMv*nypAly{6pFgJ)R~xi-!CEWHlrh_4b^Gpe=f zliFz1sqNnrUW8N^#M}0u|7NY@_R5=1k?1#hoo0Vwv=d^G-rI0di56gQv@4UIjgRXV zqpY~6Q7Ah@E3;87TcQ^}s^vJlNkvXmml$R9UJ<`A+UO2rUEax}wHL&ejeCr>bW9(Z zDz)K^R=1gbBv=79deew&V$FRA*gO@mnGG(ZdOxZ^oFnwWHiwlQNsCZEw7|pT5tbpv zYoPNIxc3~od>qV?g;@!_P*30ZI6$ zSp@H99ts$%TuqiixAx7Vn>#vPMOB(NWAT9QhmIwNB8N}I zdJdYY-q|c{O=OTYyV;mgXO?u@Jg3VsNf-A-YE!f`TEn<+G#ROnWaLU;nq~)c?gNRx z^3aj|P^%_eR-DUQfUOY*GuP_3j#Fpyn5M-Z^llnqF94#}D+wvZJ#G3`@45i(jil@424V5euZ* z*!WI3dNYfCWTy%kpp8kweYW=LtK6kJlR*vs?{JPbLukw`Ih^*55AM+zy*a>eV!oDH( z3Tm79Jpfl359FQxFTxkH((;t%uKNmqH?R+=;Epy|lx<}wCcwB32 ajY`sy^a#eAIF5`G(^tGC*SXqFGuPi=>q?*i literal 0 HcmV?d00001 From 0db05d0062711958081de4c1e22b6b4f0f3acb82 Mon Sep 17 00:00:00 2001 From: Georgios Papametis Date: Fri, 16 Jan 2026 20:56:37 +0200 Subject: [PATCH 2/4] ENH: extract-images now has option for output folder --- CHANGELOG.md | 1 + RECOVERED_FILE.tmp | Bin 13216 -> 0 bytes docs/user/subcommand-extract-images.md | 34 ++++++++++++++++--------- pdfly/cli.py | 16 ++++++++++-- pdfly/extract_images.py | 8 ++++-- tests/test_extract_images.py | 17 +++++++++++++ 6 files changed, 60 insertions(+), 16 deletions(-) delete mode 100644 RECOVERED_FILE.tmp diff --git a/CHANGELOG.md b/CHANGELOG.md index 0feb20b..1d584ca 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ ### New Features (ENH) - `pagemeta` now displays the name of a known page format that is close to the page dimensions +- `extract-images`: added optional `--output-dir` argument to specify the folder where the extracted images are stored ## Version 0.5.1, 2025-10-13 diff --git a/RECOVERED_FILE.tmp b/RECOVERED_FILE.tmp deleted file mode 100644 index bd865075ffafd95c58db83018615bae53eb72602..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 13216 zcmeHO-EJF26duWUNW8;hKscm!61QoZh@zIJfugplnnnmkmFwTe)Nz9CHbkKhQE|-$ zH_cTqcnRRCfbToU$1}U$b=)?E2rMf*v$Hd2&i|a5Gx_V!y4!bq__f`@MXv5Hxw5;0 za@n=qqHDSuS}wZ=eiu=$qOFF%hFikhGTv9wuZ|IAS4EF9zBVwTiuS5oaqqkL+#Nva zxyN{}$qXy#RmZ3%{v!Ml{t{r8QDgt>?kRBD1ztTF@g;h){uns!yFSL;1MCC*?cvJ+ zt!(>DTH>wOFi#)&l+agUmH?&aK9_mc(K;An6#>EqzO+$(jrJ}|Qpo2juqcDJHQ;v< zEoH%#-{z&Zh$r#pAK~!#jk^UbBj8I&dw|$REsg#=p!b%$>mJD0ZBWq|;uWE+3w?yr zcMEvi1-^Cn1aL`DyhntVeY8CjybJC_!Ssro8)4S~7A@g=(G>xwj@}K7A(s{au@C4y z)cUB`(5jk2ZZy!p>3+Zry&+abz#?6gYablmLmzU0bT*`K1g%&FoF?8c0@{VddkLf1 zqk(4)kd|dUF)IpX3;1%jxXnXi^MG9gjb{z}n&U45?#rN*IuftX2kqAPW{jhhpYoVi z(G%`d!qoS5XeV3Su7f_NX;waR{Mp+_=t(VVq3;puljKGmi=tH>j3ET&{x)Q)vZc;W z4{gV70^0}B=uPPKmV1cNialW~_v5gsho|8<87l3~XpdZyr=gaHd^MgPf~V?j)S`Oe z7ir%E)g961$AC}&Ls@i0M)VV1Xntulub=-qI8X0HSvD}nOY%^lJV9Gc(*fvI4tJ%mYQcG?XgWGf?`Scv zawoapa<6hPb1!lcJUUmar_iA$I4}T@BJ|+5+4FpG;3W5Z&h(ZT{RF9f3MuJN{hhtB z+<-^B4d2504(yt-3u6lU1A552Vq;D5cROMiw7<608qyjNX=)(?7&LKPrS4!Q{aP?p;pm7+Z4@TA+ zD{&nzq5k&2nQuA2n)Ggb3*#?J{pVS{GK^)UP69? zd91KLKWtyZ*J#vx(z(CSRWRqqEKo_#i46zBk8sLdgy-2q$*(*!x9?VZTp5G0m1Xi2 z-@FYoAU*gQ=DuuRp3XCq_y``1`67Srsu;EKCBA$1SuEXes~_{(EW*~aZ9TtLFUKc$ z-G-x;aSB47)xjBJnuS+{(|GgMMvdWY(5(#NCeQ7&WYHSon#_x6Zej_uhxw7UpntaX zLOFTb({hU*XXY-MO(AthGE0~zWG0!sT7mX8p`Gz}<_Vdz;F)U^eJ;y*t`U|eoHP%y zwVdCie%?aL$oZBfTS?i3Z}X!!&F|S5Jr7Q1u_`g%QjWGZq?M{~``{8YIu+4wYW-EL z*gfbp`H}@=F6G<9VVhE~GltPZWw_q+^!hq1vkTi{R!Q?#JAo#|vArjKb}{Nl{Mos; z;ug+jqXv-o0kH2uiea={$eh7O=0(CG&KBY1Q)shMR&w;do2I8_RV(QK1k@dY=Nl+H z@H58sG|FryS|d;FnCD4KYtp!BB|ZtpA^1wKR7Fmanc>%&pYa>BzE{Dee<~83BoEW# zkY47Gua4;%IX{QLB5rbc9aQh7UO{E!?Y#tA!`lmS+vM#9&B7MI4`$Bw&P0gGo5Q*Q z978QO%hNfCsoAxa*MKeahK#}K3HBjlzFT&e*zzp6fG2a6lzTowVG;e6=DhGK!hH~~ zc502Z`ZiW$T0Qk}2T-WPb>ymu#cAGrGB?wGd>!-fhR`B>IemQiP9MJ_{=bGE4b+H5 zmXDuqc1kbJ^G`S0e0x#uCS8yKu+>>wWOCitlzqAgMv*nypAly{6pFgJ)R~xi-!CEWHlrh_4b^Gpe=f zliFz1sqNnrUW8N^#M}0u|7NY@_R5=1k?1#hoo0Vwv=d^G-rI0di56gQv@4UIjgRXV zqpY~6Q7Ah@E3;87TcQ^}s^vJlNkvXmml$R9UJ<`A+UO2rUEax}wHL&ejeCr>bW9(Z zDz)K^R=1gbBv=79deew&V$FRA*gO@mnGG(ZdOxZ^oFnwWHiwlQNsCZEw7|pT5tbpv zYoPNIxc3~od>qV?g;@!_P*30ZI6$ zSp@H99ts$%TuqiixAx7Vn>#vPMOB(NWAT9QhmIwNB8N}I zdJdYY-q|c{O=OTYyV;mgXO?u@Jg3VsNf-A-YE!f`TEn<+G#ROnWaLU;nq~)c?gNRx z^3aj|P^%_eR-DUQfUOY*GuP_3j#Fpyn5M-Z^llnqF94#}D+wvZJ#G3`@45i(jil@424V5euZ* z*!WI3dNYfCWTy%kpp8kweYW=LtK6kJlR*vs?{JPbLukw`Ih^*55AM+zy*a>eV!oDH( z3Tm79Jpfl359FQxFTxkH((;t%uKNmqH?R+=;Epy|lx<}wCcwB32 ajY`sy^a#eAIF5`G(^tGC*SXqFGuPi=>q?*i diff --git a/docs/user/subcommand-extract-images.md b/docs/user/subcommand-extract-images.md index 89bed3c..b767b87 100644 --- a/docs/user/subcommand-extract-images.md +++ b/docs/user/subcommand-extract-images.md @@ -10,27 +10,37 @@ $ pdfly extract-images --help Extract images from PDF without resampling or altering. Adapted from work by Sylvain Pelissier - http://stackoverflow.com/questions/2693820/extract-images-from-pdf-without-res - ampling-in-python + http://stackoverflow.com/questions/2693820/extract-images-from-pdf-without-resampling-in-python -╭─ Arguments ──────────────────────────────────────────────────────────────────╮ -│ * pdf FILE [default: None] [required] │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ Options ────────────────────────────────────────────────────────────────────╮ -│ --help Show this message and exit. │ -╰──────────────────────────────────────────────────────────────────────────────╯ +┌─ Arguments ───────────────────────────────────────────────────────────────────────────────────────┐ +│ * pdf FILE [required] │ +└───────────────────────────────────────────────────────────────────────────────────────────────────┘ +┌─ Options ─────────────────────────────────────────────────────────────────────────────────────────┐ +│ --output-dir -o DIRECTORY Output directory. Defaults to the input's directory. │ +│ --help Show this message and exit. │ +└───────────────────────────────────────────────────────────────────────────────────────────────────┘ -``` + ``` ## Examples Extract the first page of `document.pdf` and extract the images present in it. ``` -pdfly cat document.pdf 9 -o page.pdf +pdfly cat document.pdf 0 -o page.pdf + +pdfly extract-images page.pdf + Extracted 1 images: + - 0-Image0.png -pdfly extract-text page.pdf +``` + +Extract the images of `document.pdf` in its directory's parent directory. + +``` +pdfly extract-images document.pdf -o .. Extracted 1 images: - - 0-Im0.png + - /0-Image0.png + Stored in ``` diff --git a/pdfly/cli.py b/pdfly/cli.py index 548251e..9245869 100644 --- a/pdfly/cli.py +++ b/pdfly/cli.py @@ -214,8 +214,20 @@ def extract_images( resolve_path=True, ), ], -) -> None: - pdfly.extract_images.main(pdf) + output_dir: Annotated[ + Path | None, + typer.Option( + "--output-dir", + "-o", + file_okay=False, + exists=True, + resolve_path=True, + writable=True, + help="Output directory. Defaults to the input's directory.", + ), + ] = None, + ) -> None: + pdfly.extract_images.main(pdf, output_dir) @entry_point.command(name="extract-text") # type: ignore[misc] diff --git a/pdfly/extract_images.py b/pdfly/extract_images.py index 6ce3208..774b2d7 100644 --- a/pdfly/extract_images.py +++ b/pdfly/extract_images.py @@ -10,12 +10,14 @@ from pypdf import PdfReader -def main(pdf: Path) -> None: +def main(pdf: Path, output_dir: Path | None) -> None: reader = PdfReader(str(pdf)) + if not output_dir: + output_dir = Path("") extracted_images = [] for page_index, page0 in enumerate(reader.pages): for image_file_object in page0.images: - path = f"{page_index:04d}-{image_file_object.name}" + path = output_dir / Path(f"{page_index:04d}-{image_file_object.name}") with open(path, "wb") as fp: fp.write(image_file_object.data) extracted_images.append(path) @@ -26,3 +28,5 @@ def main(pdf: Path) -> None: print(f"Extracted {len(extracted_images)} images:") for path in extracted_images: print(f"- {path}") + if str(output_dir)!= ".": + print(f"Stored in {output_dir}") diff --git a/tests/test_extract_images.py b/tests/test_extract_images.py index 346269d..d9f669c 100644 --- a/tests/test_extract_images.py +++ b/tests/test_extract_images.py @@ -29,3 +29,20 @@ def test_extract_images_monochrome( captured = capsys.readouterr() assert not captured.err assert "Extracted 1 images" in captured.out + +def test_extract_images_specific_output_dir( + capsys: pytest.CaptureFixture, tmp_path: Path, +) -> None: + with chdir(tmp_path): + run_cli( + [ + "extract-images", + str(RESOURCES_ROOT / "GeoBase_NHNC1_Data_Model_UML_EN.pdf"), + "--output-dir", + str(tmp_path) + ] + ) + captured = capsys.readouterr() + assert not captured.err + assert "Extracted 3 images" in captured.out + assert f"Stored in {tmp_path}" in captured.out From 6dad685ddeec001a1e014b2bd933ce9a4a1a7add Mon Sep 17 00:00:00 2001 From: Georgios Papametis Date: Fri, 16 Jan 2026 21:28:46 +0200 Subject: [PATCH 3/4] small fixes --- docs/user/subcommand-extract-images.md | 2 +- pdfly/cli.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/user/subcommand-extract-images.md b/docs/user/subcommand-extract-images.md index b767b87..719fdd4 100644 --- a/docs/user/subcommand-extract-images.md +++ b/docs/user/subcommand-extract-images.md @@ -20,7 +20,7 @@ $ pdfly extract-images --help │ --help Show this message and exit. │ └───────────────────────────────────────────────────────────────────────────────────────────────────┘ - ``` +``` ## Examples diff --git a/pdfly/cli.py b/pdfly/cli.py index 9245869..8a732a8 100644 --- a/pdfly/cli.py +++ b/pdfly/cli.py @@ -226,7 +226,7 @@ def extract_images( help="Output directory. Defaults to the input's directory.", ), ] = None, - ) -> None: +) -> None: pdfly.extract_images.main(pdf, output_dir) From 9d11b306450b4343248b75c99ec43434668bf562 Mon Sep 17 00:00:00 2001 From: Georgios Papametis Date: Thu, 22 Jan 2026 21:49:53 +0200 Subject: [PATCH 4/4] black corrections --- pdfly/extract_images.py | 6 ++++-- tests/test_extract_images.py | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/pdfly/extract_images.py b/pdfly/extract_images.py index 774b2d7..b6be9c2 100644 --- a/pdfly/extract_images.py +++ b/pdfly/extract_images.py @@ -17,7 +17,9 @@ def main(pdf: Path, output_dir: Path | None) -> None: extracted_images = [] for page_index, page0 in enumerate(reader.pages): for image_file_object in page0.images: - path = output_dir / Path(f"{page_index:04d}-{image_file_object.name}") + path = output_dir / Path( + f"{page_index:04d}-{image_file_object.name}" + ) with open(path, "wb") as fp: fp.write(image_file_object.data) extracted_images.append(path) @@ -28,5 +30,5 @@ def main(pdf: Path, output_dir: Path | None) -> None: print(f"Extracted {len(extracted_images)} images:") for path in extracted_images: print(f"- {path}") - if str(output_dir)!= ".": + if str(output_dir) != ".": print(f"Stored in {output_dir}") diff --git a/tests/test_extract_images.py b/tests/test_extract_images.py index d9f669c..f079d19 100644 --- a/tests/test_extract_images.py +++ b/tests/test_extract_images.py @@ -30,8 +30,10 @@ def test_extract_images_monochrome( assert not captured.err assert "Extracted 1 images" in captured.out + def test_extract_images_specific_output_dir( - capsys: pytest.CaptureFixture, tmp_path: Path, + capsys: pytest.CaptureFixture, + tmp_path: Path, ) -> None: with chdir(tmp_path): run_cli( @@ -39,7 +41,7 @@ def test_extract_images_specific_output_dir( "extract-images", str(RESOURCES_ROOT / "GeoBase_NHNC1_Data_Model_UML_EN.pdf"), "--output-dir", - str(tmp_path) + str(tmp_path), ] ) captured = capsys.readouterr()