infiniflow · wangq8 · Apr 24, 2026 · Apr 19, 2026 · Apr 19, 2026 · Apr 19, 2026
diff --git a/deepdoc/parser/docling_parser.py b/deepdoc/parser/docling_parser.py
@@ -44,6 +44,7 @@ class RAGFlowPdfParser:
 from deepdoc.parser.utils import extract_pdf_outlines
 
 
+
 class DoclingContentType(str, Enum):
     IMAGE = "image"
     TABLE = "table"
@@ -372,37 +373,43 @@ def _parse_pdf_remote(
 
         filename = Path(filepath).name or "input.pdf"
         b64 = base64.b64encode(pdf_bytes).decode("ascii")
+
+        # Standard payloads
         v1_payload = {
-            "options": {
-                "from_formats": ["pdf"],
-                "to_formats": ["json", "md", "text"],
-            },
-            "sources": [
-                {
-                    "kind": "file",
-                    "filename": filename,
-                    "base64_string": b64,
-                }
-            ],
+            "options": {"from_formats": ["pdf"], "to_formats": ["json", "md", "text"]},
+            "sources": [{"kind": "file", "filename": filename, "base64_string": b64}],
         }
         v1alpha_payload = {
+            "options": {"from_formats": ["pdf"], "to_formats": ["json", "md", "text"]},
+            "file_sources": [{"filename": filename, "base64_string": b64}],
+        }
+
+        # Chunking payload
+        # Request body for docling-serve's native chunking endpoints
+        chunking_payload = {
+            "sources": [{"kind": "file", "filename": filename, "base64_string": b64}],
             "options": {
-                "from_formats": ["pdf"],
-                "to_formats": ["json", "md", "text"],
-            },
-            "file_sources": [
-                {
-                    "filename": filename,
-                    "base64_string": b64,
-                }
-            ],
+                "max_tokens": 512, # A safe default to prevent embedding overflows
+                "overlap_tokens": 50
+            }
         }
+
         errors = []
         response_json = None
-        for endpoint, payload in (
-            ("/v1/convert/source", v1_payload),
-            ("/v1alpha/convert/source", v1alpha_payload),
+        is_chunked_response = False
+        chunking_failed_hard = False  # Track if chunking failed for a real reason
+
+        # Try the native chunking endpoints first, then fall back to the convert endpoints.
+        for endpoint, payload, chunk_flag in (
+            ("/v1/chunk/source", chunking_payload, True),          # New stable chunking
+            ("/v1alpha/chunk/source", chunking_payload, True),     # New alpha chunking
+            ("/v1/convert/source", v1_payload, False),             # Fallback to standard
+            ("/v1alpha/convert/source", v1alpha_payload, False),   # Fallback to alpha standard
         ):
+            # If native chunking failed for any reason other than a 404 (including request errors), skip the convert fallbacks
+            if not chunk_flag and chunking_failed_hard:
+                break
+
             try:
                 resp = requests.post(
                     f"{server_url}{endpoint}",
@@ -411,20 +418,61 @@ def _parse_pdf_remote(
                 )
                 if resp.status_code < 300:
                     response_json = resp.json()
+                    is_chunked_response = chunk_flag
+
+                    # Log which endpoint served the request so server admins can tell the chunking and convert paths apart
+                    if chunk_flag:
+                        self.logger.info(f"[Docling] Successfully routed to native chunking endpoint: {endpoint}")
+                    else:
+                        self.logger.info(f"[Docling] Native chunking unavailable, fell back to standard convert: {endpoint}")
                     break
+
+                if chunk_flag and resp.status_code != 404:
+                    self.logger.error(f"[Docling] Chunking failed with {resp.status_code}: {resp.text[:300]}")
+                    errors.append(f"{endpoint}: HTTP {resp.status_code}")
+                    chunking_failed_hard = True
+                    continue  # Let it try the alpha chunking endpoint before giving up!
+
                 errors.append(f"{endpoint}: HTTP {resp.status_code} {resp.text[:300]}")
+
             except Exception as exc:
+                self.logger.error(f"[Docling] Request error on {endpoint}: {exc}")
                 errors.append(f"{endpoint}: {exc}")
+                if chunk_flag:
+                    chunking_failed_hard = True
+                    continue
 
         if response_json is None:
             raise RuntimeError("[Docling] remote convert failed: " + " | ".join(errors))
 
+        sections: list[tuple[str, ...]] = []
+        tables = []
+
+        # Handle the native chunked response
+        if is_chunked_response:
+            # Chunks arrive either as a bare list or wrapped in a dict under "results"
+            chunks = response_json if isinstance(response_json, list) else response_json.get("results", [])
+            for chunk_data in chunks:
+                if not isinstance(chunk_data, dict):
+                    continue
+                # Depending on the exact docling-serve spec, the text is either top-level or nested under "chunk"
+                chunk_text = chunk_data.get("text", "")
+                if not chunk_text and isinstance(chunk_data.get("chunk"), dict):
+                    chunk_text = chunk_data["chunk"].get("text", "")
+
+                if isinstance(chunk_text, str) and chunk_text.strip():
+                    # Feed the pre-sliced chunks directly into RAGFlow's expected format
+                    sections.extend(self._sections_from_remote_text(chunk_text, parse_method=parse_method))
+
+            if callback:
+                callback(0.95, f"[Docling] Native chunks received: {len(sections)}")
+            return sections, tables
+
+        # --- FALLBACK: Standard RAGFlow parsing for older docling servers ---
         docs = self._extract_remote_document_entries(response_json)
         if not docs:
             raise RuntimeError("[Docling] remote response does not contain parsed documents.")
 
-        sections: list[tuple[str, ...]] = []
-        tables = []
         for doc in docs:
             md = doc.get("md_content")
             txt = doc.get("text_content")