diff --git a/CHANGELOG.md b/CHANGELOG.md index 0feb20b..1d584ca 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ ### New Features (ENH) - `pagemeta` now displays the name of a known page format that is close to the page dimensions +- `extract-images`: added optional `--output-dir` argument to specify the folder where the extracted images are stored ## Version 0.5.1, 2025-10-13 diff --git a/docs/user/subcommand-extract-images.md b/docs/user/subcommand-extract-images.md index 89bed3c..719fdd4 100644 --- a/docs/user/subcommand-extract-images.md +++ b/docs/user/subcommand-extract-images.md @@ -10,15 +10,15 @@ $ pdfly extract-images --help Extract images from PDF without resampling or altering. Adapted from work by Sylvain Pelissier - http://stackoverflow.com/questions/2693820/extract-images-from-pdf-without-res - ampling-in-python + http://stackoverflow.com/questions/2693820/extract-images-from-pdf-without-resampling-in-python -╭─ Arguments ──────────────────────────────────────────────────────────────────╮ -│ * pdf FILE [default: None] [required] │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ Options ────────────────────────────────────────────────────────────────────╮ -│ --help Show this message and exit. │ -╰──────────────────────────────────────────────────────────────────────────────╯ +┌─ Arguments ───────────────────────────────────────────────────────────────────────────────────────┐ +│ * pdf FILE [required] │ +└───────────────────────────────────────────────────────────────────────────────────────────────────┘ +┌─ Options ─────────────────────────────────────────────────────────────────────────────────────────┐ +│ --output-dir -o DIRECTORY Output directory. Defaults to the input's directory. │ +│ --help Show this message and exit. 
│ +└───────────────────────────────────────────────────────────────────────────────────────────────────┘ ``` @@ -27,10 +27,20 @@ $ pdfly extract-images --help Extract the first page of `document.pdf` and extract the images present in it. ``` -pdfly cat document.pdf 9 -o page.pdf +pdfly cat document.pdf 0 -o page.pdf -pdfly extract-text page.pdf +pdfly extract-images page.pdf Extracted 1 images: - - 0-Im0.png + - 0000-Image0.png + +``` + +Extract the images of `document.pdf` into the parent of the current directory. + +``` +pdfly extract-images document.pdf -o .. + Extracted 1 images: + - /path/to/parent/0000-Image0.png + Stored in /path/to/parent ``` diff --git a/pdfly/cli.py b/pdfly/cli.py index 548251e..8a732a8 100644 --- a/pdfly/cli.py +++ b/pdfly/cli.py @@ -214,8 +214,20 @@ def extract_images( resolve_path=True, ), ], + output_dir: Annotated[ + Path | None, + typer.Option( + "--output-dir", + "-o", + file_okay=False, + exists=True, + resolve_path=True, + writable=True, + help="Output directory. Defaults to the input's directory.", + ), + ] = None, ) -> None: - pdfly.extract_images.main(pdf) + pdfly.extract_images.main(pdf, output_dir) @entry_point.command(name="extract-text") # type: ignore[misc] diff --git a/pdfly/extract_images.py b/pdfly/extract_images.py index 6ce3208..b6be9c2 100644 --- a/pdfly/extract_images.py +++ b/pdfly/extract_images.py @@ -10,12 +10,16 @@ from pypdf import PdfReader -def main(pdf: Path) -> None: +def main(pdf: Path, output_dir: Path | None) -> None: reader = PdfReader(str(pdf)) + if not output_dir: + output_dir = Path("") extracted_images = [] for page_index, page0 in enumerate(reader.pages): for image_file_object in page0.images: - path = f"{page_index:04d}-{image_file_object.name}" + path = output_dir / Path( + f"{page_index:04d}-{image_file_object.name}" + ) with open(path, "wb") as fp: fp.write(image_file_object.data) extracted_images.append(path) @@ -26,3 +30,5 @@ def main(pdf: Path) -> None: print(f"Extracted {len(extracted_images)} images:") for path in 
extracted_images: print(f"- {path}") + if str(output_dir) != ".": + print(f"Stored in {output_dir}") diff --git a/tests/test_extract_images.py b/tests/test_extract_images.py index 346269d..f079d19 100644 --- a/tests/test_extract_images.py +++ b/tests/test_extract_images.py @@ -29,3 +29,22 @@ def test_extract_images_monochrome( captured = capsys.readouterr() assert not captured.err assert "Extracted 1 images" in captured.out + + +def test_extract_images_specific_output_dir( + capsys: pytest.CaptureFixture, + tmp_path: Path, +) -> None: + with chdir(tmp_path): + run_cli( + [ + "extract-images", + str(RESOURCES_ROOT / "GeoBase_NHNC1_Data_Model_UML_EN.pdf"), + "--output-dir", + str(tmp_path), + ] + ) + captured = capsys.readouterr() + assert not captured.err + assert "Extracted 3 images" in captured.out + assert f"Stored in {tmp_path}" in captured.out