Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
135 changes: 5 additions & 130 deletions api/apps/document_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,16 +15,14 @@
#
import os.path
import re
from pathlib import Path, PurePosixPath, PureWindowsPath
from pathlib import PurePosixPath, PureWindowsPath

from quart import make_response, request

from api.apps import current_user, login_required
from api.common.check_team_permission import check_kb_team_permission
from api.constants import FILE_NAME_LEN_LIMIT, IMG_BASE64_PREFIX
from api.constants import IMG_BASE64_PREFIX
from api.db import FileType
from api.db.db_models import Task
from api.db.services import duplicate_name
from api.db.services.document_service import DocumentService, doc_upload_and_parse
from api.db.services.file2document_service import File2DocumentService
from api.db.services.file_service import FileService
Expand All @@ -37,12 +35,11 @@
server_error_response,
validate_request,
)
from api.utils.file_utils import filename_type, thumbnail
from api.utils.web_utils import CONTENT_TYPE_MAP, apply_safe_file_response_headers, html2pdf, is_valid_url
from api.utils.web_utils import CONTENT_TYPE_MAP, apply_safe_file_response_headers, is_valid_url
from common import settings
from common.constants import SANDBOX_ARTIFACT_BUCKET, ParserType, RetCode, TaskStatus
from common.constants import SANDBOX_ARTIFACT_BUCKET, RetCode, TaskStatus
from common.file_utils import get_project_base_directory
from common.misc_utils import get_uuid, thread_pool_exec
from common.misc_utils import thread_pool_exec
from common.ssrf_guard import assert_url_is_safe
from deepdoc.parser.html_parser import RAGFlowHtmlParser
from rag.nlp import search
Expand All @@ -60,128 +57,6 @@ def _is_safe_download_filename(name: str) -> bool:
return True


@manager.route("/web_crawl", methods=["POST"])  # noqa: F821
@login_required
@validate_request("kb_id", "name", "url")
async def web_crawl():
    """Crawl a web page, render it to a PDF, and store it as a document.

    Form fields:
        kb_id: id of the target knowledge base (dataset).
        name: document name; ".pdf" is appended automatically.
        url: page to crawl; must pass ``is_valid_url``.

    Returns:
        JSON result with ``data=True`` on success, or an error payload
        (argument / authentication / server error) otherwise.
    """
    form = await request.form
    kb_id = form.get("kb_id")
    if not kb_id:
        return get_json_result(data=False, message='Lack of "KB ID"', code=RetCode.ARGUMENT_ERROR)
    name = form.get("name")
    url = form.get("url")
    if not is_valid_url(url):
        return get_json_result(data=False, message="The URL format is invalid", code=RetCode.ARGUMENT_ERROR)
    e, kb = KnowledgebaseService.get_by_id(kb_id)
    if not e:
        raise LookupError("Can't find this dataset!")
    if not check_kb_team_permission(kb, current_user.id):
        return get_json_result(data=False, message="No authorization.", code=RetCode.AUTHENTICATION_ERROR)

    # html2pdf drives a headless browser and can take many seconds; run it on
    # the thread pool so the Quart event loop is not blocked for other
    # concurrent requests (same offloading pattern as the local-upload path).
    blob = await thread_pool_exec(html2pdf, url)
    if not blob:
        return server_error_response(ValueError("Download failure."))

    root_folder = FileService.get_root_folder(current_user.id)
    pf_id = root_folder["id"]
    FileService.init_knowledgebase_docs(pf_id, current_user.id)
    kb_root_folder = FileService.get_kb_folder(current_user.id)
    kb_folder = FileService.new_a_file_from_kb(kb.tenant_id, kb.name, kb_root_folder["id"])

    try:
        filename = duplicate_name(DocumentService.query, name=name + ".pdf", kb_id=kb.id)
        filetype = filename_type(filename)
        # Defensive guard; in practice filename always ends with ".pdf".
        if filetype == FileType.OTHER.value:
            raise RuntimeError("This type of file has not been supported yet!")

        # Pick a storage key that does not collide with an existing object.
        location = filename
        while settings.STORAGE_IMPL.obj_exist(kb_id, location):
            location += "_"
        settings.STORAGE_IMPL.put(kb_id, location, blob)
        # NOTE(review): if the DB insert below fails, the blob just written is
        # orphaned in object storage — consider a compensating delete; confirm
        # the storage API before adding one.
        doc = {
            "id": get_uuid(),
            "kb_id": kb.id,
            "parser_id": kb.parser_id,
            "parser_config": kb.parser_config,
            "created_by": current_user.id,
            "type": filetype,
            "name": filename,
            "location": location,
            "size": len(blob),
            "thumbnail": thumbnail(filename, blob),
            "suffix": Path(filename).suffix.lstrip("."),
        }
        # The crawled file is always "<name>.pdf", so filename_type() always
        # reports PDF here; the former VISUAL/AURAL/presentation/email
        # parser_id overrides were unreachable dead code and were removed.
        DocumentService.insert(doc)
        FileService.add_file_from_kb(doc, kb_folder["id"], kb.tenant_id)
    except Exception as e:
        return server_error_response(e)
    return get_json_result(data=True)


@manager.route("/create", methods=["POST"])  # noqa: F821
@login_required
@validate_request("name", "kb_id")
async def create():
    """Create an empty (virtual) document record inside a dataset.

    JSON body:
        kb_id: id of the target knowledge base (dataset).
        name: document name; must be non-blank and at most
            ``FILE_NAME_LEN_LIMIT`` bytes when UTF-8 encoded.

    Returns:
        JSON result carrying the created document on success, or an
        argument / data / server error payload otherwise.
    """
    req = await get_request_json()
    kb_id = req["kb_id"]
    if not kb_id:
        return get_json_result(data=False, message='Lack of "KB ID"', code=RetCode.ARGUMENT_ERROR)
    if len(req["name"].encode("utf-8")) > FILE_NAME_LEN_LIMIT:
        return get_json_result(data=False, message=f"File name must be {FILE_NAME_LEN_LIMIT} bytes or less.", code=RetCode.ARGUMENT_ERROR)

    if req["name"].strip() == "":
        return get_json_result(data=False, message="File name can't be empty.", code=RetCode.ARGUMENT_ERROR)
    req["name"] = req["name"].strip()

    try:
        e, kb = KnowledgebaseService.get_by_id(kb_id)
        if not e:
            return get_data_error_result(message="Can't find this dataset!")

        if DocumentService.query(name=req["name"], kb_id=kb_id):
            return get_data_error_result(message="Duplicated document name in the same dataset.")

        kb_root_folder = FileService.get_kb_folder(kb.tenant_id)
        if not kb_root_folder:
            return get_data_error_result(message="Cannot find the root folder.")
        kb_folder = FileService.new_a_file_from_kb(
            kb.tenant_id,
            kb.name,
            kb_root_folder["id"],
        )
        if not kb_folder:
            return get_data_error_result(message="Cannot find the kb folder for this file.")

        doc = DocumentService.insert(
            {
                "id": get_uuid(),
                "kb_id": kb.id,
                "parser_id": kb.parser_id,
                "pipeline_id": kb.pipeline_id,
                "parser_config": kb.parser_config,
                "created_by": current_user.id,
                # Store the string value, not the enum member, to match how
                # every other insert/comparison in the codebase handles `type`.
                "type": FileType.VIRTUAL.value,
                "name": req["name"],
                "suffix": Path(req["name"]).suffix.lstrip("."),
                "location": "",
                "size": 0,
            }
        )

        FileService.add_file_from_kb(doc.to_dict(), kb_folder["id"], kb.tenant_id)

        return get_json_result(data=doc.to_json())
    except Exception as e:
        return server_error_response(e)


@manager.route("/thumbnails", methods=["GET"]) # noqa: F821
# @login_required
Expand Down
166 changes: 144 additions & 22 deletions api/apps/restful_apis/document_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
#
import logging
import json
import re
from pathlib import Path

from quart import request
from peewee import OperationalError
Expand All @@ -23,8 +25,9 @@
from api.apps import login_required
from api.apps.services.document_api_service import validate_document_update_fields, map_doc_keys, \
map_doc_keys_with_run_status, update_document_name_only, update_chunk_method_only, update_document_status_only
from api.constants import IMG_BASE64_PREFIX
from api.db import VALID_FILE_TYPES
from api.constants import FILE_NAME_LEN_LIMIT, IMG_BASE64_PREFIX
from api.db import FileType, VALID_FILE_TYPES
from api.db.services import duplicate_name
from api.db.services.doc_metadata_service import DocMetadataService
from api.db.db_models import Task
from api.db.services.document_service import DocumentService
Expand All @@ -38,9 +41,11 @@
UpdateDocumentReq, format_validation_error_message, validate_and_parse_json_request, DeleteDocumentReq,
)
from common import settings
from common.constants import RetCode, TaskStatus
from common.constants import ParserType, RetCode, TaskStatus
from common.metadata_utils import convert_conditions, meta_filter, turn2jsonschema
from common.misc_utils import thread_pool_exec
from common.misc_utils import get_uuid, thread_pool_exec
from api.utils.file_utils import filename_type, thumbnail
from api.utils.web_utils import html2pdf, is_valid_url
from rag.nlp import search

@manager.route("/datasets/<dataset_id>/documents/<document_id>", methods=["PATCH"]) # noqa: F821
Expand Down Expand Up @@ -348,13 +353,144 @@ async def upload_document(dataset_id, tenant_id):
type: string
description: Processing status.
"""
from api.constants import FILE_NAME_LEN_LIMIT
from api.db.services.file_service import FileService
upload_type = (request.args.get("type") or "local").lower()
e, kb = KnowledgebaseService.get_by_id(dataset_id)
if not e:
logging.error(f"Can't find the dataset with ID {dataset_id}!")
return get_error_data_result(message=f"Can't find the dataset with ID {dataset_id}!", code=RetCode.DATA_ERROR)

if not check_kb_team_permission(kb, tenant_id):
logging.error("No authorization.")
return get_error_data_result(message="No authorization.", code=RetCode.AUTHENTICATION_ERROR)

if upload_type == "web":
return await _upload_web_document(dataset_id, kb, tenant_id)

if upload_type == "empty":
return await _upload_empty_document(dataset_id, kb, tenant_id)

if upload_type != "local":
return get_error_data_result(
message='`type` must be one of "local", "web", or "empty".',
code=RetCode.ARGUMENT_ERROR,
)

return await _upload_local_documents(kb, tenant_id)


async def _upload_web_document(dataset_id, kb, tenant_id):
form = await request.form
files = await request.files
name = (form.get("name") or "").strip()
url = form.get("url")

if not name:
return get_error_data_result(message='Lack of "name"', code=RetCode.ARGUMENT_ERROR)
if not url:
return get_error_data_result(message='Lack of "url"', code=RetCode.ARGUMENT_ERROR)
if len(name.encode("utf-8")) > FILE_NAME_LEN_LIMIT:
return get_error_data_result(
message=f"File name must be {FILE_NAME_LEN_LIMIT} bytes or less.",
code=RetCode.ARGUMENT_ERROR,
)
if not is_valid_url(url):
return get_error_data_result(message="The URL format is invalid", code=RetCode.ARGUMENT_ERROR)

blob = html2pdf(url)
if not blob:
return server_error_response(ValueError("Download failure."))
Comment on lines +398 to +400
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Blocking Selenium call on the async event loop.

html2pdf(url) launches headless Chrome via Selenium (see api/utils/web_utils.py:157-188) and can take many seconds. Running it synchronously inside this async handler blocks the Quart event loop for every concurrent request. _upload_local_documents already offloads heavy work via thread_pool_exec — do the same here (and ideally also for the subsequent STORAGE_IMPL.obj_exist / STORAGE_IMPL.put which are blocking I/O).

🛠️ Suggested change
-    blob = html2pdf(url)
+    blob = await thread_pool_exec(html2pdf, url)
     if not blob:
         return server_error_response(ValueError("Download failure."))
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
blob = html2pdf(url)
if not blob:
return server_error_response(ValueError("Download failure."))
blob = await thread_pool_exec(html2pdf, url)
if not blob:
return server_error_response(ValueError("Download failure."))
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@api/apps/restful_apis/document_api.py` around lines 313 - 315, The handler
currently calls html2pdf(url) synchronously which blocks the async event loop;
update the async handler to run html2pdf via the existing thread_pool_exec
helper (same pattern as _upload_local_documents) and await its result, and
likewise offload the blocking STORAGE_IMPL.obj_exist and STORAGE_IMPL.put calls
to thread_pool_exec so all CPU/IO-heavy work is executed on the thread pool;
locate the synchronous calls to html2pdf, STORAGE_IMPL.obj_exist and
STORAGE_IMPL.put in the async function and wrap them with thread_pool_exec
(await the returned futures) to avoid blocking the Quart event loop.


root_folder = FileService.get_root_folder(tenant_id)
FileService.init_knowledgebase_docs(root_folder["id"], tenant_id)
kb_root_folder = FileService.get_kb_folder(tenant_id)
kb_folder = FileService.new_a_file_from_kb(kb.tenant_id, kb.name, kb_root_folder["id"])
Comment thread
buua436 marked this conversation as resolved.

try:
filename = duplicate_name(DocumentService.query, name=f"{name}.pdf", kb_id=kb.id)
filetype = filename_type(filename)
if filetype == FileType.OTHER.value:
raise RuntimeError("This type of file has not been supported yet!")

location = filename
while settings.STORAGE_IMPL.obj_exist(dataset_id, location):
location += "_"
settings.STORAGE_IMPL.put(dataset_id, location, blob)

doc = {
"id": get_uuid(),
"kb_id": kb.id,
"parser_id": kb.parser_id,
"pipeline_id": kb.pipeline_id,
"parser_config": kb.parser_config,
"created_by": tenant_id,
"type": filetype,
"name": filename,
"location": location,
"size": len(blob),
"thumbnail": thumbnail(filename, blob),
"suffix": Path(filename).suffix.lstrip("."),
}
if doc["type"] == FileType.VISUAL:
doc["parser_id"] = ParserType.PICTURE.value
if doc["type"] == FileType.AURAL:
doc["parser_id"] = ParserType.AUDIO.value
if re.search(r"\.(ppt|pptx|pages)$", filename):
doc["parser_id"] = ParserType.PRESENTATION.value
if re.search(r"\.(eml)$", filename):
doc["parser_id"] = ParserType.EMAIL.value
Comment on lines +432 to +439
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
# Inspect FileType enum definition and filename_type return type
ast-grep --pattern $'class FileType($_):
  $$$'
rg -nP --type=py -C3 '\bdef\s+filename_type\s*\('

Repository: infiniflow/ragflow

Length of output: 824


🏁 Script executed:

cd api/apps/restful_apis && sed -n '320,360p' document_api.py | cat -n

Repository: infiniflow/ragflow

Length of output: 2058


🏁 Script executed:

rg -n "def filename_type" api/utils/file_utils.py -A 15

Repository: infiniflow/ragflow

Length of output: 827


🏁 Script executed:

grep -n "FileType" api/apps/restful_apis/document_api.py | head -20

Repository: infiniflow/ragflow

Length of output: 299


🏁 Script executed:

sed -n '58,85p' api/utils/file_utils.py | cat -n

Repository: infiniflow/ragflow

Length of output: 1404


🏁 Script executed:

rg "new_kb_from_web_page" api/apps/restful_apis/document_api.py -B 5

Repository: infiniflow/ragflow

Length of output: 44


Remove this dead code block — filename is always .pdf, making these branches unreachable.

The filename is constructed as f"{name}.pdf" (line 323), so filename_type(filename) always returns FileType.PDF.value. This means:

  • doc["type"] is always "pdf" and can never equal FileType.VISUAL or FileType.AURAL
  • The regex patterns checking for .pptx and .eml can never match a .pdf filename

These overrides are dead code and should be removed entirely. The default kb.parser_id set on line 336 is what actually applies here.

If presentation/email handling is needed for crawled pages, recover the original source filename from the URL and run the type checks against that instead.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@api/apps/restful_apis/document_api.py` around lines 347 - 354, The
conditional block that sets doc["parser_id"] for FileType.VISUAL, FileType.AURAL
and regex checks for .ppt/.pptx/.pages/.eml is dead because filename is always
constructed as f"{name}.pdf" and doc["type"] will always be FileType.PDF; remove
this entire block (the if branches referencing doc["type"],
FileType.VISUAL/FileType.AURAL, and the re.search checks) so the default
kb.parser_id remains in effect (see references to filename construction and
doc["type"] and parser_id). If you actually need to handle presentations/emails
for crawled pages, change the logic to recover the original source filename from
the URL and run the type checks against that source filename instead of the
generated f"{name}.pdf".


DocumentService.insert(doc)
FileService.add_file_from_kb(doc, kb_folder["id"], kb.tenant_id)
return get_result(data=map_doc_keys_with_run_status(doc, run_status="0"))
except Exception as e:
return server_error_response(e)
Comment on lines +407 to +445
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Orphaned storage blob on partial failure + wrong error code for unsupported type.

Two smaller correctness/UX points inside the try block:

  • If STORAGE_IMPL.put (line 331) succeeds but DocumentService.insert / add_file_from_kb later fails, the blob stays in object storage with no DB record referencing it. Consider ordering the DB insert before the storage write, or adding a compensating STORAGE_IMPL.rm(dataset_id, location) in an exception path.
  • RuntimeError("This type of file has not been supported yet!") (line 326) is a client-input error, but the bare except at line 359 maps it to server_error_response (HTTP 500). Return a get_error_data_result(..., code=RetCode.ARGUMENT_ERROR) for that specific case so clients can distinguish it from server faults. (This case becomes unreachable if you drop the dead parser_id block suggested above — in that case this is moot.)
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@api/apps/restful_apis/document_api.py` around lines 322 - 360, The blob can
be orphaned if STORAGE_IMPL.put succeeds but later DocumentService.insert or
FileService.add_file_from_kb fails, and unsupported-type RuntimeError is being
turned into a 500 by the broad except; to fix: (1) change the flow in the
function so you either insert the DB record (DocumentService.insert) before
calling STORAGE_IMPL.put, or keep the current order but wrap the DB calls in a
try/except that on any failure calls settings.STORAGE_IMPL.rm(dataset_id,
location) to delete the uploaded blob; (2) replace the thrown RuntimeError for
unsupported file types with an explicit early return using
get_error_data_result(..., code=RetCode.ARGUMENT_ERROR) (or catch RuntimeError
specifically and map it to get_error_data_result) so clients receive an argument
error instead of server_error_response; update references to STORAGE_IMPL.put,
settings.STORAGE_IMPL.rm, DocumentService.insert, FileService.add_file_from_kb,
RuntimeError, server_error_response and
get_error_data_result/RetCode.ARGUMENT_ERROR accordingly.


# Validation

async def _upload_empty_document(dataset_id, kb, tenant_id):
req = await get_request_json()
name = (req.get("name") or "").strip()

if not name:
return get_error_data_result(message="File name can't be empty.", code=RetCode.ARGUMENT_ERROR)
if len(name.encode("utf-8")) > FILE_NAME_LEN_LIMIT:
return get_error_data_result(
message=f"File name must be {FILE_NAME_LEN_LIMIT} bytes or less.",
code=RetCode.ARGUMENT_ERROR,
)
if DocumentService.query(name=name, kb_id=dataset_id):
return get_error_data_result(message="Duplicated document name in the same dataset.")

try:
kb_root_folder = FileService.get_kb_folder(kb.tenant_id)
if not kb_root_folder:
return get_error_data_result(message="Cannot find the root folder.")
kb_folder = FileService.new_a_file_from_kb(kb.tenant_id, kb.name, kb_root_folder["id"])
if not kb_folder:
return get_error_data_result(message="Cannot find the kb folder for this file.")

doc = DocumentService.insert(
{
"id": get_uuid(),
"kb_id": kb.id,
"parser_id": kb.parser_id,
"pipeline_id": kb.pipeline_id,
"parser_config": kb.parser_config,
"created_by": tenant_id,
"type": FileType.VIRTUAL,
"name": name,
"suffix": Path(name).suffix.lstrip("."),
"location": "",
"size": 0,
}
)
Comment on lines +470 to +484
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
# Confirm how `type` is persisted / compared for documents across the codebase
ast-grep --pattern $'class FileType($_):
  $$$'
rg -nP --type=py -C2 '"type"\s*:\s*FileType\.[A-Z_]+\b'
rg -nP --type=py -C2 '\btype\s*==\s*FileType\.[A-Z_]+\.value\b'

Repository: infiniflow/ragflow

Length of output: 7291


Store FileType.VIRTUAL.value, not the enum member.

FileType is a StrEnum, so enum members are strings internally. However, throughout the codebase—in file inserts (user_account_service.py:88, file_service.py:205, file_api_service.py:326) and all type comparisons (document_service.py:256/532/553, file_service.py:396, file2document_api.py:103, file_api_service.py:236/255/313, file_api.py:294, document_app.py:227)—the pattern is consistently to use .value. This location is the only exception and should align with the standard pattern.

Suggested change
-                "type": FileType.VIRTUAL,
+                "type": FileType.VIRTUAL.value,
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
doc = DocumentService.insert(
{
"id": get_uuid(),
"kb_id": kb.id,
"parser_id": kb.parser_id,
"pipeline_id": kb.pipeline_id,
"parser_config": kb.parser_config,
"created_by": current_user.id,
"type": FileType.VIRTUAL,
"name": name,
"suffix": Path(name).suffix.lstrip("."),
"location": "",
"size": 0,
}
)
doc = DocumentService.insert(
{
"id": get_uuid(),
"kb_id": kb.id,
"parser_id": kb.parser_id,
"pipeline_id": kb.pipeline_id,
"parser_config": kb.parser_config,
"created_by": current_user.id,
"type": FileType.VIRTUAL.value,
"name": name,
"suffix": Path(name).suffix.lstrip("."),
"location": "",
"size": 0,
}
)
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@api/apps/restful_apis/document_api.py` around lines 385 - 399, The
DocumentService.insert call is storing the enum member FileType.VIRTUAL instead
of its string value; update the inserted dict so the "type" field uses
FileType.VIRTUAL.value to match the rest of the codebase (see usage patterns in
document_service and file_service) so comparisons and stored values remain
consistent.

FileService.add_file_from_kb(doc.to_dict(), kb_folder["id"], kb.tenant_id)
return get_result(data=map_doc_keys(doc))
except Exception as e:
return server_error_response(e)


async def _upload_local_documents(kb, tenant_id):
form = await request.form
files = await request.files
if "file" not in files:
logging.error("No file part!")
return get_error_data_result(message="No file part!", code=RetCode.ARGUMENT_ERROR)
Expand All @@ -369,18 +505,6 @@ async def upload_document(dataset_id, tenant_id):
logging.error(msg)
return get_error_data_result(message=msg, code=RetCode.ARGUMENT_ERROR)

# KB Lookup
e, kb = KnowledgebaseService.get_by_id(dataset_id)
if not e:
logging.error(f"Can't find the dataset with ID {dataset_id}!")
return get_error_data_result(message=f"Can't find the dataset with ID {dataset_id}!", code=RetCode.DATA_ERROR)

# Permission Check
if not check_kb_team_permission(kb, tenant_id):
logging.error("No authorization.")
return get_error_data_result(message="No authorization.", code=RetCode.AUTHENTICATION_ERROR)

# File Upload (async)
err, files = await thread_pool_exec(
FileService.upload_document, kb, file_objs, tenant_id,
parent_path=form.get("parent_path")
Expand All @@ -396,8 +520,6 @@ async def upload_document(dataset_id, tenant_id):
return get_error_data_result(message=msg, code=RetCode.DATA_ERROR)

files = [f[0] for f in files] # remove the blob

# Check if we should return raw files without document key mapping
return_raw_files = request.args.get("return_raw_files", "false").lower() == "true"

if return_raw_files:
Expand Down
Loading
Loading