@@ -84,11 +84,14 @@ def _apply_ocr_success_updates(
8484 if column not in df_meta .columns :
8585 df_meta [column ] = None
8686
87+ filename_series = df_meta ["filename" ].astype (str )
88+ stem_series = filename_series .map (canonical_stem )
89+
8790 for fname in filenames :
88- mask = df_meta ["filename" ].astype (str ) == str (fname )
91+ stem = canonical_stem (fname )
92+ mask = stem_series == stem
8993 if not bool (mask .any ()):
9094 continue
91- stem = canonical_stem (fname )
9295 artifact_update = _build_ocr_stage_artifact_update (
9396 markdown_dir = markdown_dir ,
9497 metrics_dir = metrics_dir ,
@@ -107,6 +110,27 @@ def _apply_ocr_success_updates(
107110 return df_meta
108111
109112
def _normalize_ocr_target_filenames(*, filenames: List[str], input_dir: Path) -> List[str]:
    """Resolve OCR target names to the PDF files present in ``input_dir``.

    Every ``*.pdf`` directly under ``input_dir`` is indexed by
    ``canonical_stem(name)``; when several PDFs share a stem, the first one in
    sorted path order wins. Each entry of ``filenames`` whose canonical stem is
    in that index is replaced by the indexed PDF name; entries without a match
    are kept as ``str(fname)``. Duplicate results are dropped and the order of
    first occurrence is preserved.

    Building the index is best-effort: if listing the directory or computing a
    stem raises, the index is discarded entirely and every name passes through
    unresolved (deduplication still applies).
    """

    pdf_name_by_stem: Dict[str, str] = {}
    try:
        for pdf_path in sorted(Path(input_dir).glob("*.pdf")):
            stem_key = canonical_stem(pdf_path.name)
            # First PDF (in sorted order) claiming a stem keeps it.
            if stem_key not in pdf_name_by_stem:
                pdf_name_by_stem[stem_key] = pdf_path.name
    except Exception:
        # A partially built index is not trusted; fall back to no mapping.
        pdf_name_by_stem = {}

    # A dict keeps insertion order, giving an order-preserving dedupe.
    unique_targets: Dict[str, None] = dict.fromkeys(
        pdf_name_by_stem.get(canonical_stem(fname), str(fname))
        for fname in filenames
    )
    return list(unique_targets)
132+
133+
110134class OcrMathPhaseMixin :
111135 def ocr (
112136 self ,
@@ -137,6 +161,8 @@ def ocr(
137161 gpu_memory_utilization : Optional [float ] = None ,
138162 disable_fp8_kv : bool = False ,
139163 repair_mode : str = "auto" ,
164+ repair_exec_batch_target_pages : Optional [int ] = None ,
165+ repair_exec_batch_target_items : Optional [int ] = None ,
140166 scheduler : str = "auto" ,
141167 target_batch_pages : int = 160 ,
142168 shard_pages : int = 0 ,
@@ -196,8 +222,11 @@ def ocr(
196222 - vllm_batch_size/gpu_memory_utilization/disable_fp8_kv/repair_mode:
197223 Optional vLLM controls. ``repair_mode='auto'`` enables the markdown-first
198224 repair pipeline (plain fallback for garbage pages, tiled fallback for
199- short coverage failures). These are ignored by the transformers runtime
200- except for ``prompt_override``.
225+ short coverage failures). ``repair_exec_batch_target_pages`` and
226+ ``repair_exec_batch_target_items`` control how many pending repair rows
227+ a worker tries to execute together once the global repair phase begins.
228+ These are ignored by the transformers runtime except for
229+ ``prompt_override``.
201230 - force: [DEPRECATED] alias for fix_bad retained for backward compatibility.
202231 - reprocess_completed: when False, skip documents already flagged as successfully
203232 OCRed or math-enriched in metadata. Set True to force reprocessing. Defaults to False
@@ -357,6 +386,17 @@ def ocr(
357386 removed ,
358387 )
359388 try :
389+ normalized_bad_files = _normalize_ocr_target_filenames (
390+ filenames = bad_files ,
391+ input_dir = Path (self .input_dir ),
392+ )
393+ if len (normalized_bad_files ) != len (bad_files ):
394+ self .logger .info (
395+ "OCR: collapsed %d metadata-selected row(s) onto %d real source PDF(s) by canonical stem." ,
396+ len (bad_files ),
397+ len (normalized_bad_files ),
398+ )
399+ bad_files = normalized_bad_files
360400 self .logger .info (
361401 "OCR targets: total=%d kept=%d skipped_completed=%d skipped_skiplist=%d" ,
362402 ocr_candidates_initial ,
@@ -727,6 +767,8 @@ def _run_math(stems: List[str]) -> None:
727767 gpu_memory_utilization = gpu_memory_utilization ,
728768 disable_fp8_kv = disable_fp8_kv ,
729769 repair_mode = repair_mode ,
770+ repair_exec_batch_target_pages = repair_exec_batch_target_pages ,
771+ repair_exec_batch_target_items = repair_exec_batch_target_items ,
730772 scheduler = scheduler ,
731773 target_batch_pages = int (max (1 , target_batch_pages )),
732774 shard_pages = int (max (0 , shard_pages )),
0 commit comments