Skip to content
98 changes: 73 additions & 25 deletions deepdoc/parser/docling_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ class RAGFlowPdfParser:
from deepdoc.parser.utils import extract_pdf_outlines



class DoclingContentType(str, Enum):
IMAGE = "image"
TABLE = "table"
Expand Down Expand Up @@ -372,37 +373,43 @@ def _parse_pdf_remote(

filename = Path(filepath).name or "input.pdf"
b64 = base64.b64encode(pdf_bytes).decode("ascii")

# Standard payloads
v1_payload = {
"options": {
"from_formats": ["pdf"],
"to_formats": ["json", "md", "text"],
},
"sources": [
{
"kind": "file",
"filename": filename,
"base64_string": b64,
}
],
"options": {"from_formats": ["pdf"], "to_formats": ["json", "md", "text"]},
"sources": [{"kind": "file", "filename": filename, "base64_string": b64}],
}
v1alpha_payload = {
"options": {"from_formats": ["pdf"], "to_formats": ["json", "md", "text"]},
"file_sources": [{"filename": filename, "base64_string": b64}],
}

# --- NEW: Chunking Payload ---
# Docling-serve's native chunking endpoint parameters
chunking_payload = {
"sources": [{"kind": "file", "filename": filename, "base64_string": b64}],
"options": {
"from_formats": ["pdf"],
"to_formats": ["json", "md", "text"],
},
"file_sources": [
{
"filename": filename,
"base64_string": b64,
}
],
"max_tokens": 512, # A safe default to prevent embedding overflows
"overlap_tokens": 50
}
}
Comment thread
coderabbitai[bot] marked this conversation as resolved.
Outdated

errors = []
response_json = None
for endpoint, payload in (
("/v1/convert/source", v1_payload),
("/v1alpha/convert/source", v1alpha_payload),
is_chunked_response = False
chunking_failed_hard = False # Track if chunking failed for a real reason

# We prioritize the new chunking endpoints first!
for endpoint, payload, chunk_flag in (
("/v1/chunk/source", chunking_payload, True), # New stable chunking
("/v1alpha/chunk/source", chunking_payload, True), # New alpha chunking
("/v1/convert/source", v1_payload, False), # Fallback to standard
("/v1alpha/convert/source", v1alpha_payload, False), # Fallback to alpha standard
):
# If native chunking had a real error (not 404), skip the fallback endpoints
if not chunk_flag and chunking_failed_hard:
break

try:
resp = requests.post(
f"{server_url}{endpoint}",
Expand All @@ -411,20 +418,61 @@ def _parse_pdf_remote(
)
if resp.status_code < 300:
response_json = resp.json()
is_chunked_response = chunk_flag

# --- ADDED: Explicit logging for the server admins ---
if chunk_flag:
self.logger.info(f"[Docling] Successfully routed to native chunking endpoint: {endpoint}")
else:
self.logger.info(f"[Docling] Native chunking unavailable, fell back to standard convert: {endpoint}")
break

if chunk_flag and resp.status_code != 404:
self.logger.error(f"[Docling] Chunking failed with {resp.status_code}: {resp.text[:300]}")
errors.append(f"{endpoint}: HTTP {resp.status_code}")
chunking_failed_hard = True
continue # Let it try the alpha chunking endpoint before giving up!

errors.append(f"{endpoint}: HTTP {resp.status_code} {resp.text[:300]}")

except Exception as exc:
self.logger.error(f"[Docling] Request error on {endpoint}: {exc}")
errors.append(f"{endpoint}: {exc}")
if chunk_flag:
chunking_failed_hard = True
continue

if response_json is None:
raise RuntimeError("[Docling] remote convert failed: " + " | ".join(errors))

sections: list[tuple[str, ...]] = []
tables = []

# --- NEW: Handle Native Chunked Response ---
if is_chunked_response:
# The chunking endpoint returns an array of chunk items
chunks = response_json if isinstance(response_json, list) else response_json.get("results", [])
for chunk_data in chunks:
if not isinstance(chunk_data, dict):
continue
# Depending on the exact docling-serve spec, the text might be nested
chunk_text = chunk_data.get("text", "")
if not chunk_text and isinstance(chunk_data.get("chunk"), dict):
chunk_text = chunk_data["chunk"].get("text", "")
Comment thread
coderabbitai[bot] marked this conversation as resolved.

if isinstance(chunk_text, str) and chunk_text.strip():
# Feed the pre-sliced chunks directly into RAGFlow's expected format
sections.extend(self._sections_from_remote_text(chunk_text, parse_method=parse_method))
Comment thread
coderabbitai[bot] marked this conversation as resolved.

if callback:
callback(0.95, f"[Docling] Native chunks received: {len(sections)}")
return sections, tables
Comment thread
ParasSondhi marked this conversation as resolved.

# --- FALLBACK: Standard RAGFlow parsing for older docling servers ---
docs = self._extract_remote_document_entries(response_json)
if not docs:
raise RuntimeError("[Docling] remote response does not contain parsed documents.")

sections: list[tuple[str, ...]] = []
tables = []
for doc in docs:
md = doc.get("md_content")
txt = doc.get("text_content")
Expand Down