Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,14 @@ All notable changes to this project are documented in this file. This changelog

## [Unreleased]

### Added

- Added typed LibreOffice workbook handles and session-scoped workbook lifecycle tracking so rich extraction can reuse cached bridge payloads safely and reject foreign or closed workbook handles.

### Fixed

- Fixed LibreOffice rich backend workbook lifecycle integration so custom `session_factory` implementations that only support legacy path-based `extract_chart_geometries()` and `extract_draw_page_shapes()` continue to work without `load_workbook()` and `close_workbook()` hooks.

## [0.7.1] - 2026-03-21

### Added
Expand Down
96 changes: 91 additions & 5 deletions src/exstruct/core/backends/libreoffice_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,18 @@
from __future__ import annotations

from collections.abc import Callable, Sequence
from contextlib import AbstractContextManager
import logging
import math
from pathlib import Path
from typing import Protocol, TypeGuard

from ...models import Arrow, Chart, Shape, SmartArt
from ..libreoffice import (
LibreOfficeChartGeometry,
LibreOfficeDrawPageShape,
LibreOfficeSession,
LibreOfficeWorkbookHandle,
)
from ..ooxml_drawing import OoxmlConnectorInfo, OoxmlShapeInfo, read_sheet_drawings
from ..shapes import angle_to_compass, compute_line_angle_deg
Expand All @@ -20,14 +23,63 @@
logger = logging.getLogger(__name__)


class _LibreOfficePathSessionProtocol(Protocol):
"""Structural contract for legacy path-only rich-extraction sessions."""

def extract_chart_geometries(
self, file_path: Path
) -> dict[str, list[LibreOfficeChartGeometry]]:
"""Extract chart geometries for a workbook path."""

def extract_draw_page_shapes(
self, file_path: Path
) -> dict[str, list[LibreOfficeDrawPageShape]]:
"""Extract draw-page shapes for a workbook path."""


class _LibreOfficeWorkbookLifecycleSessionProtocol(Protocol):
"""Structural contract for sessions that support typed workbook handles."""

def load_workbook(self, file_path: Path) -> LibreOfficeWorkbookHandle:
"""Open a workbook lifecycle handle."""

def close_workbook(self, workbook: LibreOfficeWorkbookHandle) -> None:
"""Close a workbook lifecycle handle."""

def extract_chart_geometries(
self, workbook: Path | LibreOfficeWorkbookHandle
) -> dict[str, list[LibreOfficeChartGeometry]]:
"""Extract chart geometries for a path or workbook handle."""

def extract_draw_page_shapes(
self, workbook: Path | LibreOfficeWorkbookHandle
) -> dict[str, list[LibreOfficeDrawPageShape]]:
"""Extract draw-page shapes for a path or workbook handle."""


_LibreOfficeRichSession = (
_LibreOfficePathSessionProtocol | _LibreOfficeWorkbookLifecycleSessionProtocol
)


def _supports_workbook_lifecycle(
session: _LibreOfficeRichSession,
) -> TypeGuard[_LibreOfficeWorkbookLifecycleSessionProtocol]:
"""Return whether a session exposes the typed workbook lifecycle hooks."""

return hasattr(session, "load_workbook") and hasattr(session, "close_workbook")


class LibreOfficeRichBackend(RichBackend):
"""Best-effort rich extraction backend gated by LibreOffice runtime availability."""

def __init__(
self,
file_path: Path,
*,
session_factory: Callable[[], LibreOfficeSession] = LibreOfficeSession.from_env,
session_factory: Callable[
[], AbstractContextManager[_LibreOfficeRichSession]
] = (LibreOfficeSession.from_env),
) -> None:
"""Store the workbook path and session factory used for lazy LibreOffice extraction."""

Expand Down Expand Up @@ -150,8 +202,10 @@ def _read_chart_geometries(self) -> dict[str, list[LibreOfficeChartGeometry]]:
return self._chart_geometries
with self._session_factory() as session:
if hasattr(session, "extract_chart_geometries"):
self._chart_geometries = session.extract_chart_geometries(
self.file_path
self._chart_geometries = (
self._extract_chart_geometries_with_optional_workbook_lifecycle(
session=session,
)
)
else:
self._chart_geometries = {}
Expand All @@ -164,13 +218,45 @@ def _read_draw_page_shapes(self) -> dict[str, list[LibreOfficeDrawPageShape]]:
return self._draw_page_shapes
with self._session_factory() as session:
if hasattr(session, "extract_draw_page_shapes"):
self._draw_page_shapes = session.extract_draw_page_shapes(
self.file_path
self._draw_page_shapes = (
self._extract_draw_page_shapes_with_optional_workbook_lifecycle(
session=session,
)
)
else:
self._draw_page_shapes = {}
return self._draw_page_shapes

def _extract_chart_geometries_with_optional_workbook_lifecycle(
self,
*,
session: _LibreOfficeRichSession,
) -> dict[str, list[LibreOfficeChartGeometry]]:
"""Call chart extraction while preserving legacy path-only sessions."""

if not _supports_workbook_lifecycle(session):
return session.extract_chart_geometries(self.file_path)
workbook = session.load_workbook(self.file_path)
try:
return session.extract_chart_geometries(workbook)
finally:
session.close_workbook(workbook)

def _extract_draw_page_shapes_with_optional_workbook_lifecycle(
self,
*,
session: _LibreOfficeRichSession,
) -> dict[str, list[LibreOfficeDrawPageShape]]:
"""Call draw-page extraction while preserving legacy path-only sessions."""

if not _supports_workbook_lifecycle(session):
return session.extract_draw_page_shapes(self.file_path)
workbook = session.load_workbook(self.file_path)
try:
return session.extract_draw_page_shapes(workbook)
finally:
session.close_workbook(workbook)
Comment thread
coderabbitai[bot] marked this conversation as resolved.


def _build_shapes_from_ooxml(
shapes: Sequence[OoxmlShapeInfo],
Expand Down
122 changes: 108 additions & 14 deletions src/exstruct/core/libreoffice.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
import sys
from tempfile import NamedTemporaryFile, mkdtemp
import time
from typing import Literal, TextIO, cast
from typing import Literal, TextIO, cast, overload

_DEFAULT_STARTUP_TIMEOUT_SEC = 15.0
_DEFAULT_EXEC_TIMEOUT_SEC = 30.0
Expand Down Expand Up @@ -81,6 +81,15 @@ class LibreOfficeSessionConfig:
profile_root: Path | None


@dataclass(frozen=True)
class LibreOfficeWorkbookHandle:
"""Typed workbook token scoped to one ``LibreOfficeSession`` instance."""

file_path: Path
owner_session_id: int
workbook_id: int


@dataclass(frozen=True)
class _LibreOfficeStartupAttempt:
"""Configuration for one LibreOffice startup strategy."""
Expand Down Expand Up @@ -129,7 +138,9 @@ def __init__(self, config: LibreOfficeSessionConfig) -> None:
self._soffice_process: subprocess.Popen[str] | None = None
self._accept_port: int | None = None
self._python_path = _resolve_python_path(config.soffice_path)
self._bridge_payload_cache: dict[str, object] = {}
self._bridge_payload_cache: dict[tuple[str, Path], object] = {}
self._open_workbooks: dict[int, Path] = {}
self._next_workbook_id = 0
self._soffice_stderr_sink: TextIO | None = None
self._soffice_stderr_path: Path | None = None

Expand Down Expand Up @@ -215,32 +226,114 @@ def __exit__(self, exc_type: object, exc: object, tb: object) -> None:
self._soffice_stderr_sink = None
self._soffice_stderr_path = None
self._accept_port = None
self._bridge_payload_cache.clear()
self._open_workbooks.clear()
if self._temp_profile_dir is not None:
_cleanup_profile_dir(self._temp_profile_dir)
self._temp_profile_dir = None

def load_workbook(self, file_path: Path) -> object:
"""Return a lightweight workbook token for future subprocess integration."""
return {"file_path": str(file_path.resolve())}
def load_workbook(self, file_path: Path) -> LibreOfficeWorkbookHandle:
"""Register a workbook path and return a typed session-local handle."""

def close_workbook(self, workbook: object) -> None:
"""Close a workbook token returned by ``load_workbook``."""
_ = workbook
resolved = file_path.resolve()
self._next_workbook_id += 1
handle = LibreOfficeWorkbookHandle(
file_path=resolved,
owner_session_id=id(self),
workbook_id=self._next_workbook_id,
)
Comment thread
harumiWeb marked this conversation as resolved.
self._open_workbooks[handle.workbook_id] = resolved
return handle

def close_workbook(self, workbook: LibreOfficeWorkbookHandle) -> None:
"""Release a typed workbook handle and clear any stale workbook cache."""

file_path = self._require_handle_path(workbook, allow_closed=True)
if file_path is None:
return
self._open_workbooks.pop(workbook.workbook_id, None)
if file_path not in self._open_workbooks.values():
self._clear_workbook_cache(file_path)

def extract_draw_page_shapes(
self, file_path: Path
self, workbook: Path | LibreOfficeWorkbookHandle
) -> dict[str, list[LibreOfficeDrawPageShape]]:
"""Extract best-effort draw-page shapes from the workbook."""
payload = self._run_bridge(file_path, kind="draw-page")
payload = self._run_bridge(
self._resolve_workbook_path(workbook),
kind="draw-page",
)
return _parse_draw_page_payload(payload)

def extract_chart_geometries(
self, file_path: Path
self, workbook: Path | LibreOfficeWorkbookHandle
) -> dict[str, list[LibreOfficeChartGeometry]]:
"""Extract chart geometry candidates from the workbook draw pages."""
payload = self._run_bridge(file_path, kind="charts")
payload = self._run_bridge(
self._resolve_workbook_path(workbook),
kind="charts",
)
return _parse_chart_payload(payload)

def _resolve_workbook_path(
self, workbook: Path | LibreOfficeWorkbookHandle
) -> Path:
"""Resolve either a direct path or a typed handle into a workbook path."""

if isinstance(workbook, LibreOfficeWorkbookHandle):
return self._require_handle_path(workbook)
return workbook.resolve()

@overload
def _require_handle_path(
self,
workbook: LibreOfficeWorkbookHandle,
*,
allow_closed: Literal[False] = False,
) -> Path: ...

@overload
def _require_handle_path(
self,
workbook: LibreOfficeWorkbookHandle,
*,
allow_closed: Literal[True],
) -> Path | None: ...

def _require_handle_path(
self,
workbook: LibreOfficeWorkbookHandle,
*,
allow_closed: bool = False,
) -> Path | None:
"""Validate workbook ownership and return the registered file path."""

if not isinstance(workbook, LibreOfficeWorkbookHandle):
raise TypeError(
"LibreOfficeSession workbook lifecycle expects a "
"LibreOfficeWorkbookHandle."
)
if workbook.owner_session_id != id(self):
raise ValueError(
"LibreOffice workbook handle belongs to a different LibreOfficeSession."
)
file_path = self._open_workbooks.get(workbook.workbook_id)
if file_path is None and not allow_closed:
raise RuntimeError("LibreOffice workbook handle is closed.")
if file_path is not None and file_path != workbook.file_path.resolve():
raise ValueError(
"LibreOffice workbook handle does not match the registered "
"workbook path."
)
return file_path
Comment thread
coderabbitai[bot] marked this conversation as resolved.

def _clear_workbook_cache(self, file_path: Path) -> None:
"""Drop cached bridge payloads for one workbook path."""

stale_keys = [key for key in self._bridge_payload_cache if key[1] == file_path]
for key in stale_keys:
self._bridge_payload_cache.pop(key, None)

def _run_bridge(
self,
file_path: Path,
Expand All @@ -263,15 +356,16 @@ def _run_bridge(

if self._accept_port is None or self._python_path is None:
raise RuntimeError("LibreOfficeSession must be entered before extraction.")
cache_key = f"{kind}:{file_path.resolve()}"
resolved = file_path.resolve()
cache_key = (kind, resolved)
if cache_key in self._bridge_payload_cache:
return self._bridge_payload_cache[cache_key]
try:
completed = _run_bridge_extract_subprocess(
python_path=self._python_path,
host="127.0.0.1",
port=self._accept_port,
file_path=file_path.resolve(),
file_path=resolved,
kind=kind,
timeout_sec=self.config.exec_timeout_sec,
)
Expand Down
43 changes: 43 additions & 0 deletions tasks/feature_spec.md
Original file line number Diff line number Diff line change
Expand Up @@ -208,3 +208,46 @@

- `not-needed`
- rationale: this adds a single public repository policy document without changing architecture, public API design, or long-lived internal tradeoff policy.

## 2026-04-16 issue #77 LibreOffice typed workbook handle

### Goal

- Replace the raw `dict` token returned by `LibreOfficeSession.load_workbook()` with a typed workbook handle.
- Give `LibreOfficeSession.close_workbook()` meaningful session-local cleanup instead of a no-op.
- Keep the current LibreOffice extraction mode, fallback behavior, and bridge subprocess lifecycle unchanged.

### Contract summary

- `LibreOfficeSession.load_workbook()` returns a frozen typed handle that stores the resolved workbook path and the owning session identity.
- `LibreOfficeSession.close_workbook()` validates that the handle belongs to the current session, rejects rehydrated handles whose `file_path` no longer matches the registered workbook id, becomes idempotent for repeated close attempts, and clears any session-local bridge cache entries for that workbook.
- `LibreOfficeSession.extract_draw_page_shapes()` and `extract_chart_geometries()` continue to support path-based extraction, but may also consume the typed workbook handle so callers can follow a typed lifecycle.
- `LibreOfficeRichBackend.session_factory` accepts the structural rich-extraction session contract, including legacy path-only sessions and lifecycle-aware sessions, rather than only the concrete built-in `LibreOfficeSession`.
- No public CLI, MCP, extraction-mode, fallback, or serialization contracts change in this issue.

### Permanent destinations

- `src/exstruct/core/libreoffice.py`
- Canonical implementation for the typed LibreOffice workbook handle and close semantics.
- `src/exstruct/core/backends/libreoffice_backend.py`
- Updated to consume the typed session lifecycle without changing backend policy.
- `tests/core/test_libreoffice_backend.py`
- Regression coverage for typed handle behavior, ownership checks, idempotent close, and cache invalidation.
- `tasks/feature_spec.md` and `tasks/todo.md`
- Retain the compact planning and verification record for this issue.

### Constraints

- Do not change the bridge subprocess contract in `src/exstruct/core/_libreoffice_bridge.py`; workbook documents are still opened and closed per bridge invocation.
- Do not change backend fallback policy or session startup/shutdown behavior.
- Keep backward compatibility for current path-based extraction helpers while introducing the typed handle.

### Verification

- `uv run pytest tests/core/test_libreoffice_backend.py -q`
- `uv run task precommit-run`

### ADR verdict

- `not-needed`
- rationale: this is an internal contract hardening change that preserves existing extraction policy and runtime behavior; the durable rationale can stay in the task record.
5 changes: 5 additions & 0 deletions tasks/lessons.md
Original file line number Diff line number Diff line change
Expand Up @@ -102,3 +102,8 @@
## 2026-03-21 skill installation docs lessons

- When a repo-owned Skill is intended to be publicly installable from the repository, document the `npx skills add <repo>/<skill-root> --skill <name>` path as the primary install route instead of only manual copy steps.

## 2026-04-16 session_factory compatibility lessons

- When tightening a public or semi-public extension point such as `session_factory`, preserve the previously accepted minimal method surface unless the compatibility contract is explicitly changed.
- If a new lifecycle hook is optional for the default implementation, add a regression test for a legacy custom implementation that lacks that hook before reporting completion.
Loading
Loading