Skip to content
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

### New Features (ENH)
- `pagemeta` now displays the name of a known page format that is close to the page dimensions
- `extract-images`: added optional `--output-dir` argument to specify the folder where the extracted images are stored


## Version 0.5.1, 2025-10-13
Expand Down
32 changes: 21 additions & 11 deletions docs/user/subcommand-extract-images.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,15 @@ $ pdfly extract-images --help
Extract images from PDF without resampling or altering.

Adapted from work by Sylvain Pelissier
http://stackoverflow.com/questions/2693820/extract-images-from-pdf-without-res
ampling-in-python
http://stackoverflow.com/questions/2693820/extract-images-from-pdf-without-resampling-in-python

╭─ Arguments ──────────────────────────────────────────────────────────────────╮
│ * pdf FILE [default: None] [required] │
╰──────────────────────────────────────────────────────────────────────────────╯
╭─ Options ────────────────────────────────────────────────────────────────────╮
│ --help Show this message and exit. │
╰──────────────────────────────────────────────────────────────────────────────╯
┌─ Arguments ───────────────────────────────────────────────────────────────────────────────────────┐
│ * pdf FILE [required] │
└───────────────────────────────────────────────────────────────────────────────────────────────────┘
┌─ Options ─────────────────────────────────────────────────────────────────────────────────────────┐
│ --output-dir -o DIRECTORY Output directory. Defaults to the input's directory. │
│ --help Show this message and exit. │
└───────────────────────────────────────────────────────────────────────────────────────────────────┘

```

Expand All @@ -27,10 +27,20 @@ $ pdfly extract-images --help
Extract the first page of `document.pdf`, then extract the images it contains.

```
pdfly cat document.pdf 9 -o page.pdf
pdfly cat document.pdf 0 -o page.pdf

pdfly extract-text page.pdf
pdfly extract-images page.pdf
Extracted 1 images:
- 0-Im0.png
- 0-Image0.png

```

Extract the images of `document.pdf` into the parent of the current working directory (a relative `--output-dir` is resolved against the directory the command is run from).

```
pdfly extract-images document.pdf -o ..
Extracted 1 images:
- <parent_directory>/0-Image0.png
Stored in <parent_directory>

```
14 changes: 13 additions & 1 deletion pdfly/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,8 +214,20 @@ def extract_images(
resolve_path=True,
),
],
output_dir: Annotated[
Path | None,
typer.Option(
"--output-dir",
"-o",
file_okay=False,
exists=True,
resolve_path=True,
writable=True,
help="Output directory. Defaults to the input's directory.",
),
] = None,
) -> None:
pdfly.extract_images.main(pdf)
pdfly.extract_images.main(pdf, output_dir)


@entry_point.command(name="extract-text") # type: ignore[misc]
Expand Down
10 changes: 8 additions & 2 deletions pdfly/extract_images.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,16 @@
from pypdf import PdfReader


def main(pdf: Path) -> None:
def main(pdf: Path, output_dir: Path | None) -> None:
reader = PdfReader(str(pdf))
if not output_dir:
output_dir = Path("")
extracted_images = []
for page_index, page0 in enumerate(reader.pages):
for image_file_object in page0.images:
path = f"{page_index:04d}-{image_file_object.name}"
path = output_dir / Path(
f"{page_index:04d}-{image_file_object.name}"
)
with open(path, "wb") as fp:
fp.write(image_file_object.data)
extracted_images.append(path)
Expand All @@ -26,3 +30,5 @@ def main(pdf: Path) -> None:
print(f"Extracted {len(extracted_images)} images:")
for path in extracted_images:
print(f"- {path}")
if str(output_dir) != ".":
print(f"Stored in {output_dir}")
19 changes: 19 additions & 0 deletions tests/test_extract_images.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,3 +29,22 @@ def test_extract_images_monochrome(
captured = capsys.readouterr()
assert not captured.err
assert "Extracted 1 images" in captured.out


def test_extract_images_specific_output_dir(
    capsys: pytest.CaptureFixture,
    tmp_path: Path,
) -> None:
    """Check that ``--output-dir`` sends extracted images to the given folder.

    The command runs with ``tmp_path`` as the working directory while the
    images are sent to a separate sub-directory. That way the test fails if
    the option is ignored and the images land in the working directory.
    """
    # The option is declared with ``exists=True``, so the folder has to be
    # created before the CLI is invoked.
    output_dir = tmp_path / "images"
    output_dir.mkdir()
    with chdir(tmp_path):
        run_cli(
            [
                "extract-images",
                str(RESOURCES_ROOT / "GeoBase_NHNC1_Data_Model_UML_EN.pdf"),
                "--output-dir",
                str(output_dir),
            ]
        )
    captured = capsys.readouterr()
    assert not captured.err
    assert "Extracted 3 images" in captured.out
    assert f"Stored in {output_dir}" in captured.out
    # Images must be written to the requested folder and nowhere else.
    assert any(entry.is_file() for entry in output_dir.iterdir())
    assert not any(entry.is_file() for entry in tmp_path.iterdir())
Loading