Skip to content

Commit a8d2b93

Browse files
committed
Normalize OCR targets and expose repair packing
1 parent 3a3a401 commit a8d2b93

8 files changed

Lines changed: 198 additions & 4 deletions

src/glossapi/_naming.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
from __future__ import annotations
44

55
from pathlib import Path
6+
import re
67
from typing import Union
78

89
_KNOWN_SUFFIXES = (
@@ -19,6 +20,8 @@
1920
".htm",
2021
)
2122

23+
_PAGE_CHUNK_SUFFIX_RE = re.compile(r"__p\d{4,5}-\d{4,5}$")
24+
2225

2326
def canonical_stem(value: Union[str, Path]) -> str:
2427
"""Return a normalised stem for any pipeline artefact."""
@@ -33,6 +36,7 @@ def canonical_stem(value: Union[str, Path]) -> str:
3336
working = working[: -len(suffix)]
3437
stripped = True
3538
break
39+
working = _PAGE_CHUNK_SUFFIX_RE.sub("", working)
3640
if working:
3741
return working
3842
fallback = Path(name).stem

src/glossapi/corpus/phase_ocr_math.py

Lines changed: 46 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -84,11 +84,14 @@ def _apply_ocr_success_updates(
8484
if column not in df_meta.columns:
8585
df_meta[column] = None
8686

87+
filename_series = df_meta["filename"].astype(str)
88+
stem_series = filename_series.map(canonical_stem)
89+
8790
for fname in filenames:
88-
mask = df_meta["filename"].astype(str) == str(fname)
91+
stem = canonical_stem(fname)
92+
mask = stem_series == stem
8993
if not bool(mask.any()):
9094
continue
91-
stem = canonical_stem(fname)
9295
artifact_update = _build_ocr_stage_artifact_update(
9396
markdown_dir=markdown_dir,
9497
metrics_dir=metrics_dir,
@@ -107,6 +110,27 @@ def _apply_ocr_success_updates(
107110
return df_meta
108111

109112

113+
def _normalize_ocr_target_filenames(*, filenames: List[str], input_dir: Path) -> List[str]:
114+
"""Collapse chunk-like metadata rows back to real OCR source files when possible."""
115+
116+
source_by_stem: Dict[str, str] = {}
117+
try:
118+
for path in sorted(Path(input_dir).glob("*.pdf")):
119+
source_by_stem.setdefault(canonical_stem(path.name), path.name)
120+
except Exception:
121+
source_by_stem = {}
122+
123+
normalized: List[str] = []
124+
seen: Set[str] = set()
125+
for fname in filenames:
126+
resolved = source_by_stem.get(canonical_stem(fname), str(fname))
127+
if resolved in seen:
128+
continue
129+
normalized.append(resolved)
130+
seen.add(resolved)
131+
return normalized
132+
133+
110134
class OcrMathPhaseMixin:
111135
def ocr(
112136
self,
@@ -137,6 +161,8 @@ def ocr(
137161
gpu_memory_utilization: Optional[float] = None,
138162
disable_fp8_kv: bool = False,
139163
repair_mode: str = "auto",
164+
repair_exec_batch_target_pages: Optional[int] = None,
165+
repair_exec_batch_target_items: Optional[int] = None,
140166
scheduler: str = "auto",
141167
target_batch_pages: int = 160,
142168
shard_pages: int = 0,
@@ -196,8 +222,11 @@ def ocr(
196222
- vllm_batch_size/gpu_memory_utilization/disable_fp8_kv/repair_mode:
197223
Optional vLLM controls. ``repair_mode='auto'`` enables the markdown-first
198224
repair pipeline (plain fallback for garbage pages, tiled fallback for
199-
short coverage failures). These are ignored by the transformers runtime
200-
except for ``prompt_override``.
225+
short coverage failures). ``repair_exec_batch_target_pages`` and
226+
``repair_exec_batch_target_items`` control how many pending repair rows
227+
a worker tries to execute together once the global repair phase begins.
228+
These are ignored by the transformers runtime except for
229+
``prompt_override``.
201230
- force: [DEPRECATED] alias for fix_bad retained for backward compatibility.
202231
- reprocess_completed: when False, skip documents already flagged as successfully
203232
OCRed or math-enriched in metadata. Set True to force reprocessing. Defaults to False
@@ -357,6 +386,17 @@ def ocr(
357386
removed,
358387
)
359388
try:
389+
normalized_bad_files = _normalize_ocr_target_filenames(
390+
filenames=bad_files,
391+
input_dir=Path(self.input_dir),
392+
)
393+
if len(normalized_bad_files) != len(bad_files):
394+
self.logger.info(
395+
"OCR: collapsed %d metadata-selected row(s) onto %d real source PDF(s) by canonical stem.",
396+
len(bad_files),
397+
len(normalized_bad_files),
398+
)
399+
bad_files = normalized_bad_files
360400
self.logger.info(
361401
"OCR targets: total=%d kept=%d skipped_completed=%d skipped_skiplist=%d",
362402
ocr_candidates_initial,
@@ -727,6 +767,8 @@ def _run_math(stems: List[str]) -> None:
727767
gpu_memory_utilization=gpu_memory_utilization,
728768
disable_fp8_kv=disable_fp8_kv,
729769
repair_mode=repair_mode,
770+
repair_exec_batch_target_pages=repair_exec_batch_target_pages,
771+
repair_exec_batch_target_items=repair_exec_batch_target_items,
730772
scheduler=scheduler,
731773
target_batch_pages=int(max(1, target_batch_pages)),
732774
shard_pages=int(max(0, shard_pages)),

src/glossapi/scripts/full_pipeline_checkpoint.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,8 @@ def _parse_args(argv: Optional[List[str]] = None) -> argparse.Namespace:
5757
p.add_argument("--ocr-devices", nargs="*", type=int, default=None)
5858
p.add_argument("--ocr-workers-per-gpu", type=int, default=1)
5959
p.add_argument("--ocr-vllm-batch-size", type=int, default=None)
60+
p.add_argument("--ocr-repair-exec-batch-target-pages", type=int, default=None)
61+
p.add_argument("--ocr-repair-exec-batch-target-items", type=int, default=None)
6062
p.add_argument("--ocr-target-batch-pages", type=int, default=160)
6163
p.add_argument("--ocr-render-dpi", type=int, default=None)
6264
p.add_argument("--ocr-scheduler", default="auto")
@@ -165,6 +167,8 @@ def main(argv: Optional[List[str]] = None) -> int:
165167
devices=_parse_int_list(args.ocr_devices),
166168
workers_per_gpu=int(args.ocr_workers_per_gpu),
167169
vllm_batch_size=args.ocr_vllm_batch_size,
170+
repair_exec_batch_target_pages=args.ocr_repair_exec_batch_target_pages,
171+
repair_exec_batch_target_items=args.ocr_repair_exec_batch_target_items,
168172
target_batch_pages=int(args.ocr_target_batch_pages),
169173
render_dpi=args.ocr_render_dpi,
170174
scheduler=str(args.ocr_scheduler),

src/glossapi/scripts/openarchives_ocr_run_node.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,8 @@ def _parse_args(argv: Optional[List[str]] = None) -> argparse.Namespace:
5353
p.add_argument("--max-new-tokens", type=int, default=2048)
5454
p.add_argument("--render-dpi", type=int, default=144)
5555
p.add_argument("--repair-mode", default="auto")
56+
p.add_argument("--repair-exec-batch-target-pages", type=int, default=None)
57+
p.add_argument("--repair-exec-batch-target-items", type=int, default=None)
5658
p.add_argument("--gpu-memory-utilization", type=float, default=0.9)
5759
return p.parse_args(argv)
5860

@@ -348,6 +350,8 @@ def main(argv: Optional[List[str]] = None) -> int:
348350
render_dpi=int(args.render_dpi),
349351
max_new_tokens=int(args.max_new_tokens),
350352
repair_mode=str(args.repair_mode),
353+
repair_exec_batch_target_pages=args.repair_exec_batch_target_pages,
354+
repair_exec_batch_target_items=args.repair_exec_batch_target_items,
351355
scheduler=str(args.scheduler),
352356
target_batch_pages=int(args.target_batch_pages),
353357
shard_pages=int(args.shard_pages),

tests/test_full_pipeline_checkpoint.py

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,3 +148,63 @@ def jsonl(self, output_path, **kwargs):
148148
assert report["post_extract_counts"]["needs_ocr_true"] == 1
149149
assert report["post_ocr_counts"]["ocr_success_true"] == 1
150150
assert report["export_records"] == 1
151+
152+
153+
def test_full_pipeline_checkpoint_forwards_repair_exec_batch_controls(tmp_path, monkeypatch):
154+
captured = {}
155+
156+
class DummyCorpus:
157+
def __init__(self, input_dir, output_dir):
158+
self.input_dir = input_dir
159+
self.output_dir = output_dir
160+
161+
def _metadata_path(self):
162+
path = self.output_dir / "download_results" / "download_results.parquet"
163+
path.parent.mkdir(parents=True, exist_ok=True)
164+
return path
165+
166+
def extract(self, **kwargs):
167+
pd.DataFrame(
168+
[{"filename": "doc.pdf", "needs_ocr": True, "ocr_success": False, "text": ""}]
169+
).to_parquet(self._metadata_path(), index=False)
170+
171+
def clean(self, **kwargs):
172+
return None
173+
174+
def ocr(self, **kwargs):
175+
captured.update(kwargs)
176+
pd.DataFrame(
177+
[{"filename": "doc.pdf", "needs_ocr": False, "ocr_success": True, "text": "fixed text"}]
178+
).to_parquet(self._metadata_path(), index=False)
179+
180+
def jsonl(self, output_path, **kwargs):
181+
output_path.write_text(json.dumps({"text": "fixed text"}) + "\n", encoding="utf-8")
182+
183+
monkeypatch.setattr(checkpoint, "Corpus", DummyCorpus)
184+
185+
input_dir = tmp_path / "in"
186+
input_dir.mkdir()
187+
output_dir = tmp_path / "out"
188+
export_path = tmp_path / "export.jsonl"
189+
report_path = tmp_path / "report.json"
190+
191+
rc = checkpoint.main(
192+
[
193+
"--input-dir",
194+
str(input_dir),
195+
"--output-dir",
196+
str(output_dir),
197+
"--export-path",
198+
str(export_path),
199+
"--report-path",
200+
str(report_path),
201+
"--ocr-repair-exec-batch-target-pages",
202+
"64",
203+
"--ocr-repair-exec-batch-target-items",
204+
"24",
205+
]
206+
)
207+
208+
assert rc == 0
209+
assert captured["repair_exec_batch_target_pages"] == 64
210+
assert captured["repair_exec_batch_target_items"] == 24

tests/test_metadata_fallback.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -210,6 +210,8 @@ def test_canonical_stem_variants():
210210
"beta.metrics.json": "beta",
211211
"gamma.per_page.metrics.json": "gamma",
212212
"delta.with.dots.pdf": "delta.with.dots",
213+
"needs__p0001-0002.pdf": "needs",
214+
"needs__p00001-00096.md": "needs",
213215
}
214216
for source, expected in cases.items():
215217
assert canonical_stem(source) == expected

tests/test_ocr_backends_smoke.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,3 +73,44 @@ def fake_enrich(files=None, **kwargs):
7373
== hashlib.sha256(b"ds md\n").hexdigest()
7474
)
7575
assert captured.get("files") == ["clean"], "Math-only should run for non-OCR stem only"
76+
77+
78+
def test_deepseek_ocr_normalizes_chunk_rows_to_real_source_pdf(tmp_path, monkeypatch):
79+
corpus = _mk_corpus(tmp_path)
80+
81+
(corpus.input_dir / "needs.pdf").write_bytes(b"%PDF-1.4\n%stub\n")
82+
83+
dl_dir = corpus.output_dir / "download_results"
84+
dl_dir.mkdir(parents=True, exist_ok=True)
85+
parquet_path = dl_dir / "download_results.parquet"
86+
pd.DataFrame(
87+
[
88+
{"filename": "needs.pdf", corpus.url_column: "", "needs_ocr": True, "ocr_success": False},
89+
{"filename": "needs__p0001-0002.pdf", corpus.url_column: "", "needs_ocr": True, "ocr_success": False},
90+
]
91+
).to_parquet(parquet_path, index=False)
92+
93+
from glossapi.ocr.deepseek import runner
94+
95+
captured = {}
96+
97+
def fake_run_for_files(self_ref, files, **kwargs):
98+
captured["files"] = list(files)
99+
markdown_dir = corpus.output_dir / "markdown"
100+
metrics_dir = corpus.output_dir / "json" / "metrics"
101+
markdown_dir.mkdir(parents=True, exist_ok=True)
102+
metrics_dir.mkdir(parents=True, exist_ok=True)
103+
(markdown_dir / "needs.md").write_text("normalized md\n", encoding="utf-8")
104+
(metrics_dir / "needs.metrics.json").write_text("{\n \"page_count\": 1\n}\n", encoding="utf-8")
105+
return {"needs": {"page_count": 1}}
106+
107+
monkeypatch.setattr(runner, "run_for_files", fake_run_for_files)
108+
109+
corpus.ocr(backend="deepseek", fix_bad=True, math_enhance=False, mode="ocr_bad")
110+
111+
assert captured["files"] == ["needs.pdf"]
112+
updated = pd.read_parquet(parquet_path).set_index("filename")
113+
assert bool(updated.loc["needs.pdf", "ocr_success"]) is True
114+
assert bool(updated.loc["needs__p0001-0002.pdf", "ocr_success"]) is True
115+
assert updated.loc["needs.pdf", "text"] == "normalized md\n"
116+
assert updated.loc["needs__p0001-0002.pdf", "text"] == "normalized md\n"

tests/test_ocr_dispatch_backends.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,43 @@ def fail_math(*args, **kwargs):
5151
assert calls.get("files") == [fname]
5252

5353

54+
def test_deepseek_backend_forwards_repair_exec_batch_controls(tmp_path, monkeypatch):
55+
corpus = _mk_corpus(tmp_path)
56+
57+
dl_dir = corpus.output_dir / "download_results"
58+
dl_dir.mkdir(parents=True, exist_ok=True)
59+
fname = "doc.pdf"
60+
df = pd.DataFrame([
61+
{"filename": fname, corpus.url_column: "", "needs_ocr": True, "ocr_success": False}
62+
])
63+
df.to_parquet(dl_dir / "download_results.parquet", index=False)
64+
(corpus.input_dir / fname).write_bytes(b"%PDF-1.4\n%stub\n")
65+
66+
from glossapi.ocr.deepseek import runner
67+
68+
calls = {}
69+
70+
def fake_run_for_files(self_ref, files, **kwargs):
71+
calls["files"] = list(files)
72+
calls["kwargs"] = dict(kwargs)
73+
return {"doc": {"page_count": 1}}
74+
75+
monkeypatch.setattr(runner, "run_for_files", fake_run_for_files)
76+
77+
corpus.ocr(
78+
backend="deepseek",
79+
fix_bad=True,
80+
math_enhance=False,
81+
mode="ocr_bad",
82+
repair_exec_batch_target_pages=64,
83+
repair_exec_batch_target_items=24,
84+
)
85+
86+
assert calls.get("files") == [fname]
87+
assert calls["kwargs"]["repair_exec_batch_target_pages"] == 64
88+
assert calls["kwargs"]["repair_exec_batch_target_items"] == 24
89+
90+
5491
def test_invalid_backend_is_rejected(tmp_path):
5592
corpus = _mk_corpus(tmp_path)
5693
with pytest.raises(ValueError, match="backend must be 'deepseek'"):

0 commit comments

Comments (0)