diff --git a/api/apps/document_app.py b/api/apps/document_app.py index 15ec26dd42d..501b6906833 100644 --- a/api/apps/document_app.py +++ b/api/apps/document_app.py @@ -15,16 +15,14 @@ # import os.path import re -from pathlib import Path, PurePosixPath, PureWindowsPath +from pathlib import PurePosixPath, PureWindowsPath from quart import make_response, request from api.apps import current_user, login_required -from api.common.check_team_permission import check_kb_team_permission -from api.constants import FILE_NAME_LEN_LIMIT, IMG_BASE64_PREFIX +from api.constants import IMG_BASE64_PREFIX from api.db import FileType from api.db.db_models import Task -from api.db.services import duplicate_name from api.db.services.document_service import DocumentService, doc_upload_and_parse from api.db.services.file2document_service import File2DocumentService from api.db.services.file_service import FileService @@ -37,12 +35,11 @@ server_error_response, validate_request, ) -from api.utils.file_utils import filename_type, thumbnail -from api.utils.web_utils import CONTENT_TYPE_MAP, apply_safe_file_response_headers, html2pdf, is_valid_url +from api.utils.web_utils import CONTENT_TYPE_MAP, apply_safe_file_response_headers, is_valid_url from common import settings -from common.constants import SANDBOX_ARTIFACT_BUCKET, ParserType, RetCode, TaskStatus +from common.constants import SANDBOX_ARTIFACT_BUCKET, RetCode, TaskStatus from common.file_utils import get_project_base_directory -from common.misc_utils import get_uuid, thread_pool_exec +from common.misc_utils import thread_pool_exec from common.ssrf_guard import assert_url_is_safe from deepdoc.parser.html_parser import RAGFlowHtmlParser from rag.nlp import search @@ -60,128 +57,6 @@ def _is_safe_download_filename(name: str) -> bool: return True -@manager.route("/web_crawl", methods=["POST"]) # noqa: F821 -@login_required -@validate_request("kb_id", "name", "url") -async def web_crawl(): - form = await request.form - kb_id = 
form.get("kb_id") - if not kb_id: - return get_json_result(data=False, message='Lack of "KB ID"', code=RetCode.ARGUMENT_ERROR) - name = form.get("name") - url = form.get("url") - if not is_valid_url(url): - return get_json_result(data=False, message="The URL format is invalid", code=RetCode.ARGUMENT_ERROR) - e, kb = KnowledgebaseService.get_by_id(kb_id) - if not e: - raise LookupError("Can't find this dataset!") - if not check_kb_team_permission(kb, current_user.id): - return get_json_result(data=False, message="No authorization.", code=RetCode.AUTHENTICATION_ERROR) - - blob = html2pdf(url) - if not blob: - return server_error_response(ValueError("Download failure.")) - - root_folder = FileService.get_root_folder(current_user.id) - pf_id = root_folder["id"] - FileService.init_knowledgebase_docs(pf_id, current_user.id) - kb_root_folder = FileService.get_kb_folder(current_user.id) - kb_folder = FileService.new_a_file_from_kb(kb.tenant_id, kb.name, kb_root_folder["id"]) - - try: - filename = duplicate_name(DocumentService.query, name=name + ".pdf", kb_id=kb.id) - filetype = filename_type(filename) - if filetype == FileType.OTHER.value: - raise RuntimeError("This type of file has not been supported yet!") - - location = filename - while settings.STORAGE_IMPL.obj_exist(kb_id, location): - location += "_" - settings.STORAGE_IMPL.put(kb_id, location, blob) - doc = { - "id": get_uuid(), - "kb_id": kb.id, - "parser_id": kb.parser_id, - "parser_config": kb.parser_config, - "created_by": current_user.id, - "type": filetype, - "name": filename, - "location": location, - "size": len(blob), - "thumbnail": thumbnail(filename, blob), - "suffix": Path(filename).suffix.lstrip("."), - } - if doc["type"] == FileType.VISUAL: - doc["parser_id"] = ParserType.PICTURE.value - if doc["type"] == FileType.AURAL: - doc["parser_id"] = ParserType.AUDIO.value - if re.search(r"\.(ppt|pptx|pages)$", filename): - doc["parser_id"] = ParserType.PRESENTATION.value - if re.search(r"\.(eml)$", filename): 
- doc["parser_id"] = ParserType.EMAIL.value - DocumentService.insert(doc) - FileService.add_file_from_kb(doc, kb_folder["id"], kb.tenant_id) - except Exception as e: - return server_error_response(e) - return get_json_result(data=True) - - -@manager.route("/create", methods=["POST"]) # noqa: F821 -@login_required -@validate_request("name", "kb_id") -async def create(): - req = await get_request_json() - kb_id = req["kb_id"] - if not kb_id: - return get_json_result(data=False, message='Lack of "KB ID"', code=RetCode.ARGUMENT_ERROR) - if len(req["name"].encode("utf-8")) > FILE_NAME_LEN_LIMIT: - return get_json_result(data=False, message=f"File name must be {FILE_NAME_LEN_LIMIT} bytes or less.", code=RetCode.ARGUMENT_ERROR) - - if req["name"].strip() == "": - return get_json_result(data=False, message="File name can't be empty.", code=RetCode.ARGUMENT_ERROR) - req["name"] = req["name"].strip() - - try: - e, kb = KnowledgebaseService.get_by_id(kb_id) - if not e: - return get_data_error_result(message="Can't find this dataset!") - - if DocumentService.query(name=req["name"], kb_id=kb_id): - return get_data_error_result(message="Duplicated document name in the same dataset.") - - kb_root_folder = FileService.get_kb_folder(kb.tenant_id) - if not kb_root_folder: - return get_data_error_result(message="Cannot find the root folder.") - kb_folder = FileService.new_a_file_from_kb( - kb.tenant_id, - kb.name, - kb_root_folder["id"], - ) - if not kb_folder: - return get_data_error_result(message="Cannot find the kb folder for this file.") - - doc = DocumentService.insert( - { - "id": get_uuid(), - "kb_id": kb.id, - "parser_id": kb.parser_id, - "pipeline_id": kb.pipeline_id, - "parser_config": kb.parser_config, - "created_by": current_user.id, - "type": FileType.VIRTUAL, - "name": req["name"], - "suffix": Path(req["name"]).suffix.lstrip("."), - "location": "", - "size": 0, - } - ) - - FileService.add_file_from_kb(doc.to_dict(), kb_folder["id"], kb.tenant_id) - - return 
get_json_result(data=doc.to_json()) - except Exception as e: - return server_error_response(e) - @manager.route("/thumbnails", methods=["GET"]) # noqa: F821 # @login_required diff --git a/api/apps/restful_apis/document_api.py b/api/apps/restful_apis/document_api.py index 8098dbec8c5..3055ca87079 100644 --- a/api/apps/restful_apis/document_api.py +++ b/api/apps/restful_apis/document_api.py @@ -15,6 +15,8 @@ # import logging import json +import re +from pathlib import Path from quart import request from peewee import OperationalError @@ -23,8 +25,9 @@ from api.apps import login_required from api.apps.services.document_api_service import validate_document_update_fields, map_doc_keys, \ map_doc_keys_with_run_status, update_document_name_only, update_chunk_method_only, update_document_status_only -from api.constants import IMG_BASE64_PREFIX -from api.db import VALID_FILE_TYPES +from api.constants import FILE_NAME_LEN_LIMIT, IMG_BASE64_PREFIX +from api.db import FileType, VALID_FILE_TYPES +from api.db.services import duplicate_name from api.db.services.doc_metadata_service import DocMetadataService from api.db.db_models import Task from api.db.services.document_service import DocumentService @@ -38,9 +41,11 @@ UpdateDocumentReq, format_validation_error_message, validate_and_parse_json_request, DeleteDocumentReq, ) from common import settings -from common.constants import RetCode, TaskStatus +from common.constants import ParserType, RetCode, TaskStatus from common.metadata_utils import convert_conditions, meta_filter, turn2jsonschema -from common.misc_utils import thread_pool_exec +from common.misc_utils import get_uuid, thread_pool_exec +from api.utils.file_utils import filename_type, thumbnail +from api.utils.web_utils import html2pdf, is_valid_url from rag.nlp import search @manager.route("/datasets//documents/", methods=["PATCH"]) # noqa: F821 @@ -348,13 +353,144 @@ async def upload_document(dataset_id, tenant_id): type: string description: Processing status. 
""" - from api.constants import FILE_NAME_LEN_LIMIT - from api.db.services.file_service import FileService + upload_type = (request.args.get("type") or "local").lower() + e, kb = KnowledgebaseService.get_by_id(dataset_id) + if not e: + logging.error(f"Can't find the dataset with ID {dataset_id}!") + return get_error_data_result(message=f"Can't find the dataset with ID {dataset_id}!", code=RetCode.DATA_ERROR) + + if not check_kb_team_permission(kb, tenant_id): + logging.error("No authorization.") + return get_error_data_result(message="No authorization.", code=RetCode.AUTHENTICATION_ERROR) + + if upload_type == "web": + return await _upload_web_document(dataset_id, kb, tenant_id) + + if upload_type == "empty": + return await _upload_empty_document(dataset_id, kb, tenant_id) + + if upload_type != "local": + return get_error_data_result( + message='`type` must be one of "local", "web", or "empty".', + code=RetCode.ARGUMENT_ERROR, + ) + + return await _upload_local_documents(kb, tenant_id) + +async def _upload_web_document(dataset_id, kb, tenant_id): form = await request.form - files = await request.files + name = (form.get("name") or "").strip() + url = form.get("url") + + if not name: + return get_error_data_result(message='Lack of "name"', code=RetCode.ARGUMENT_ERROR) + if not url: + return get_error_data_result(message='Lack of "url"', code=RetCode.ARGUMENT_ERROR) + if len(name.encode("utf-8")) > FILE_NAME_LEN_LIMIT: + return get_error_data_result( + message=f"File name must be {FILE_NAME_LEN_LIMIT} bytes or less.", + code=RetCode.ARGUMENT_ERROR, + ) + if not is_valid_url(url): + return get_error_data_result(message="The URL format is invalid", code=RetCode.ARGUMENT_ERROR) + + blob = html2pdf(url) + if not blob: + return server_error_response(ValueError("Download failure.")) + + root_folder = FileService.get_root_folder(tenant_id) + FileService.init_knowledgebase_docs(root_folder["id"], tenant_id) + kb_root_folder = FileService.get_kb_folder(tenant_id) + kb_folder 
= FileService.new_a_file_from_kb(kb.tenant_id, kb.name, kb_root_folder["id"]) + + try: + filename = duplicate_name(DocumentService.query, name=f"{name}.pdf", kb_id=kb.id) + filetype = filename_type(filename) + if filetype == FileType.OTHER.value: + raise RuntimeError("This type of file has not been supported yet!") + + location = filename + while settings.STORAGE_IMPL.obj_exist(dataset_id, location): + location += "_" + settings.STORAGE_IMPL.put(dataset_id, location, blob) + + doc = { + "id": get_uuid(), + "kb_id": kb.id, + "parser_id": kb.parser_id, + "pipeline_id": kb.pipeline_id, + "parser_config": kb.parser_config, + "created_by": tenant_id, + "type": filetype, + "name": filename, + "location": location, + "size": len(blob), + "thumbnail": thumbnail(filename, blob), + "suffix": Path(filename).suffix.lstrip("."), + } + if doc["type"] == FileType.VISUAL: + doc["parser_id"] = ParserType.PICTURE.value + if doc["type"] == FileType.AURAL: + doc["parser_id"] = ParserType.AUDIO.value + if re.search(r"\.(ppt|pptx|pages)$", filename): + doc["parser_id"] = ParserType.PRESENTATION.value + if re.search(r"\.(eml)$", filename): + doc["parser_id"] = ParserType.EMAIL.value + + DocumentService.insert(doc) + FileService.add_file_from_kb(doc, kb_folder["id"], kb.tenant_id) + return get_result(data=map_doc_keys_with_run_status(doc, run_status="0")) + except Exception as e: + return server_error_response(e) - # Validation + +async def _upload_empty_document(dataset_id, kb, tenant_id): + req = await get_request_json() + name = (req.get("name") or "").strip() + + if not name: + return get_error_data_result(message="File name can't be empty.", code=RetCode.ARGUMENT_ERROR) + if len(name.encode("utf-8")) > FILE_NAME_LEN_LIMIT: + return get_error_data_result( + message=f"File name must be {FILE_NAME_LEN_LIMIT} bytes or less.", + code=RetCode.ARGUMENT_ERROR, + ) + if DocumentService.query(name=name, kb_id=dataset_id): + return get_error_data_result(message="Duplicated document name in the 
same dataset.") + + try: + kb_root_folder = FileService.get_kb_folder(kb.tenant_id) + if not kb_root_folder: + return get_error_data_result(message="Cannot find the root folder.") + kb_folder = FileService.new_a_file_from_kb(kb.tenant_id, kb.name, kb_root_folder["id"]) + if not kb_folder: + return get_error_data_result(message="Cannot find the kb folder for this file.") + + doc = DocumentService.insert( + { + "id": get_uuid(), + "kb_id": kb.id, + "parser_id": kb.parser_id, + "pipeline_id": kb.pipeline_id, + "parser_config": kb.parser_config, + "created_by": tenant_id, + "type": FileType.VIRTUAL, + "name": name, + "suffix": Path(name).suffix.lstrip("."), + "location": "", + "size": 0, + } + ) + FileService.add_file_from_kb(doc.to_dict(), kb_folder["id"], kb.tenant_id) + return get_result(data=map_doc_keys(doc)) + except Exception as e: + return server_error_response(e) + + +async def _upload_local_documents(kb, tenant_id): + form = await request.form + files = await request.files if "file" not in files: logging.error("No file part!") return get_error_data_result(message="No file part!", code=RetCode.ARGUMENT_ERROR) @@ -369,18 +505,6 @@ async def upload_document(dataset_id, tenant_id): logging.error(msg) return get_error_data_result(message=msg, code=RetCode.ARGUMENT_ERROR) - # KB Lookup - e, kb = KnowledgebaseService.get_by_id(dataset_id) - if not e: - logging.error(f"Can't find the dataset with ID {dataset_id}!") - return get_error_data_result(message=f"Can't find the dataset with ID {dataset_id}!", code=RetCode.DATA_ERROR) - - # Permission Check - if not check_kb_team_permission(kb, tenant_id): - logging.error("No authorization.") - return get_error_data_result(message="No authorization.", code=RetCode.AUTHENTICATION_ERROR) - - # File Upload (async) err, files = await thread_pool_exec( FileService.upload_document, kb, file_objs, tenant_id, parent_path=form.get("parent_path") @@ -396,8 +520,6 @@ async def upload_document(dataset_id, tenant_id): return 
get_error_data_result(message=msg, code=RetCode.DATA_ERROR) files = [f[0] for f in files] # remove the blob - - # Check if we should return raw files without document key mapping return_raw_files = request.args.get("return_raw_files", "false").lower() == "true" if return_raw_files: diff --git a/docs/references/http_api_reference.md b/docs/references/http_api_reference.md index a76fd2274e7..04d025ad458 100644 --- a/docs/references/http_api_reference.md +++ b/docs/references/http_api_reference.md @@ -1373,15 +1373,26 @@ Failure: Uploads documents to a specified dataset. +This endpoint supports three creation modes via the optional `type` query parameter: + +- `type=local` or omitted: Upload one or more local files using `multipart/form-data`. +- `type=web`: Crawl a web page and save it as a document. +- `type=empty`: Create an empty virtual document by name. + #### Request - Method: POST - URL: `/api/v1/datasets/{dataset_id}/documents` +- Query: + - `type`: Optional. One of `local`, `web`, or `empty`. Defaults to `local`. 
- Headers: - - `'Content-Type: multipart/form-data'` + - `'Content-Type: multipart/form-data'` for `type=local` and `type=web` + - `'Content-Type: application/json'` for `type=empty` - `'Authorization: Bearer <YOUR_API_KEY>'` -- Form: - - `'file=@{FILE_PATH}'` +- Body: + - For `type=local`: form field `'file=@{FILE_PATH}'` + - For `type=web`: form fields `'name'` and `'url'` + - For `type=empty`: JSON body with `'name'` ##### Request example @@ -1394,12 +1405,38 @@ curl --request POST \ --form 'file=@./test2.pdf' ``` +```bash +curl --request POST \ + --url 'http://{address}/api/v1/datasets/{dataset_id}/documents?type=web' \ + --header 'Content-Type: multipart/form-data' \ + --header 'Authorization: Bearer <YOUR_API_KEY>' \ + --form 'name=example-page' \ + --form 'url=https://example.com' +``` + +```bash +curl --request POST \ + --url 'http://{address}/api/v1/datasets/{dataset_id}/documents?type=empty' \ + --header 'Content-Type: application/json' \ + --header 'Authorization: Bearer <YOUR_API_KEY>' \ + --data '{"name":"blank.txt"}' +``` + ##### Request parameters - `dataset_id`: (*Path parameter*) The ID of the dataset to which the documents will be uploaded. +- `type`: (*Query parameter*) + Controls how the document is created: + - `local`: Upload files. + - `web`: Crawl a URL into a document. + - `empty`: Create an empty document without file upload. - `'file'`: (*Body parameter*) - A document to upload. + A document to upload. Required when `type=local`. +- `'name'`: (*Body parameter*) + The document name. Required when `type=web` or `type=empty`. +- `'url'`: (*Body parameter*) + The source URL to crawl. Required when `type=web`. 
#### Response diff --git a/test/testcases/test_web_api/test_common.py b/test/testcases/test_web_api/test_common.py index c0c84038be9..46ec8974a55 100644 --- a/test/testcases/test_web_api/test_common.py +++ b/test/testcases/test_web_api/test_common.py @@ -328,7 +328,16 @@ def upload_documents(auth, payload=None, files_path=None, *, filename_override=N def create_document(auth, payload=None, *, headers=HEADERS, data=None): - res = requests.post(url=f"{HOST_ADDRESS}{DOCUMENT_APP_URL}/create", headers=headers, auth=auth, json=payload, data=data) + kb_id = payload.get("kb_id") if payload else None + request_payload = dict(payload or {}) + request_payload.pop("kb_id", None) + res = requests.post( + url=f"{HOST_ADDRESS}{DATASETS_URL}/{kb_id}/documents?type=empty", + headers=headers, + auth=auth, + json=request_payload, + data=data, + ) return res.json() diff --git a/test/testcases/test_web_api/test_document_app/conftest.py b/test/testcases/test_web_api/test_document_app/conftest.py index b8cf461952c..5af8d262776 100644 --- a/test/testcases/test_web_api/test_document_app/conftest.py +++ b/test/testcases/test_web_api/test_document_app/conftest.py @@ -31,6 +31,14 @@ def decorator(func): return decorator +class _StubKBRecord(dict): + def __getattr__(self, item): + try: + return self[item] + except KeyError as exc: + raise AttributeError(item) from exc + + @pytest.fixture(scope="function") def add_document_func(request, WebApiAuth, add_dataset, ragflow_tmp_dir): def cleanup(): @@ -128,3 +136,97 @@ class _StubPaddleOCRParser: module.manager = _DummyManager() spec.loader.exec_module(module) return module + + +@pytest.fixture() +def document_rest_api_module(monkeypatch): + repo_root = Path(__file__).resolve().parents[4] + common_pkg = ModuleType("common") + common_pkg.__path__ = [str(repo_root / "common")] + monkeypatch.setitem(sys.modules, "common", common_pkg) + + deepdoc_pkg = ModuleType("deepdoc") + deepdoc_parser_pkg = ModuleType("deepdoc.parser") + 
deepdoc_parser_pkg.__path__ = [] + + class _StubPdfParser: + pass + + class _StubExcelParser: + pass + + deepdoc_parser_pkg.PdfParser = _StubPdfParser + deepdoc_pkg.parser = deepdoc_parser_pkg + monkeypatch.setitem(sys.modules, "deepdoc", deepdoc_pkg) + monkeypatch.setitem(sys.modules, "deepdoc.parser", deepdoc_parser_pkg) + deepdoc_excel_module = ModuleType("deepdoc.parser.excel_parser") + deepdoc_excel_module.RAGFlowExcelParser = _StubExcelParser + monkeypatch.setitem(sys.modules, "deepdoc.parser.excel_parser", deepdoc_excel_module) + deepdoc_html_module = ModuleType("deepdoc.parser.html_parser") + + class _StubHtmlParser: + pass + + deepdoc_html_module.RAGFlowHtmlParser = _StubHtmlParser + monkeypatch.setitem(sys.modules, "deepdoc.parser.html_parser", deepdoc_html_module) + deepdoc_mineru_module = ModuleType("deepdoc.parser.mineru_parser") + + class _StubMinerUParser: + pass + + deepdoc_mineru_module.MinerUParser = _StubMinerUParser + monkeypatch.setitem(sys.modules, "deepdoc.parser.mineru_parser", deepdoc_mineru_module) + deepdoc_paddleocr_module = ModuleType("deepdoc.parser.paddleocr_parser") + + class _StubPaddleOCRParser: + pass + + deepdoc_paddleocr_module.PaddleOCRParser = _StubPaddleOCRParser + monkeypatch.setitem(sys.modules, "deepdoc.parser.paddleocr_parser", deepdoc_paddleocr_module) + monkeypatch.setitem(sys.modules, "xgboost", ModuleType("xgboost")) + + stub_apps = ModuleType("api.apps") + stub_apps.__path__ = [str(repo_root / "api" / "apps")] + stub_apps.current_user = SimpleNamespace(id="user-1") + stub_apps.login_required = lambda func: func + monkeypatch.setitem(sys.modules, "api.apps", stub_apps) + + stub_apps_services = ModuleType("api.apps.services") + stub_apps_services.__path__ = [str(repo_root / "api" / "apps" / "services")] + monkeypatch.setitem(sys.modules, "api.apps.services", stub_apps_services) + + document_api_service_mod = ModuleType("api.apps.services.document_api_service") + document_api_service_mod.validate_document_update_fields 
= lambda *_args, **_kwargs: (None, None) + document_api_service_mod.map_doc_keys = lambda doc: doc.to_dict() if hasattr(doc, "to_dict") else doc + def _map_doc_keys_with_run_status(doc, run_status="0"): + payload = doc if isinstance(doc, dict) else doc.to_dict() + return {**payload, "run": run_status} + + document_api_service_mod.map_doc_keys_with_run_status = _map_doc_keys_with_run_status + document_api_service_mod.update_document_name_only = lambda *_args, **_kwargs: None + document_api_service_mod.update_chunk_method_only = lambda *_args, **_kwargs: None + document_api_service_mod.update_document_status_only = lambda *_args, **_kwargs: None + monkeypatch.setitem(sys.modules, "api.apps.services.document_api_service", document_api_service_mod) + + module_path = repo_root / "api" / "apps" / "restful_apis" / "document_api.py" + spec = importlib.util.spec_from_file_location("test_document_api_unit", module_path) + module = importlib.util.module_from_spec(spec) + module.manager = _DummyManager() + spec.loader.exec_module(module) + monkeypatch.setattr( + module.KnowledgebaseService, + "get_by_id", + lambda dataset_id: ( + True, + _StubKBRecord( + id=dataset_id, + tenant_id="tenant1", + name="kb", + parser_id="parser", + pipeline_id="pipe", + parser_config={}, + ), + ), + ) + monkeypatch.setattr(module, "check_kb_team_permission", lambda *_args, **_kwargs: True) + return module diff --git a/test/testcases/test_web_api/test_document_app/test_create_document.py b/test/testcases/test_web_api/test_document_app/test_create_document.py index 092c5e292f8..c40bbd91675 100644 --- a/test/testcases/test_web_api/test_document_app/test_create_document.py +++ b/test/testcases/test_web_api/test_document_app/test_create_document.py @@ -15,8 +15,8 @@ # import asyncio import string -from types import SimpleNamespace from concurrent.futures import ThreadPoolExecutor, as_completed +from types import SimpleNamespace import pytest from test_common import create_document, list_datasets @@ 
-26,6 +26,14 @@ from api.constants import FILE_NAME_LEN_LIMIT +class _StubKBRecord(dict): + def __getattr__(self, item): + try: + return self[item] + except KeyError as exc: + raise AttributeError(item) from exc + + @pytest.mark.p1 @pytest.mark.usefixtures("clear_datasets") class TestAuthorization: @@ -63,7 +71,7 @@ def test_filename_max_length(self, WebApiAuth, add_dataset_func, tmp_path): def test_invalid_kb_id(self, WebApiAuth): res = create_document(WebApiAuth, {"name": "ragflow_test.txt", "kb_id": "invalid_kb_id"}) assert res["code"] == 102, res - assert res["message"] == "Can't find this dataset!", res + assert res["message"] == "Can't find the dataset with ID invalid_kb_id!", res @pytest.mark.p3 def test_filename_special_characters(self, WebApiAuth, add_dataset_func): @@ -101,100 +109,95 @@ def _run(coro): @pytest.mark.p2 class TestDocumentCreateUnit: - def test_missing_kb_id(self, document_app_module, monkeypatch): - module = document_app_module - - async def fake_request_json(): - return {"kb_id": "", "name": "doc.txt"} - - monkeypatch.setattr(module, "get_request_json", fake_request_json) - res = _run(module.create.__wrapped__()) - assert res["code"] == 101 - assert res["message"] == 'Lack of "KB ID"' - - def test_filename_too_long(self, document_app_module, monkeypatch): - module = document_app_module + def test_filename_too_long(self, document_rest_api_module, monkeypatch): + module = document_rest_api_module long_name = "a" * (FILE_NAME_LEN_LIMIT + 1) async def fake_request_json(): - return {"kb_id": "kb1", "name": long_name} + return {"name": long_name} monkeypatch.setattr(module, "get_request_json", fake_request_json) - res = _run(module.create.__wrapped__()) + monkeypatch.setattr(module, "request", SimpleNamespace(args={"type": "empty"})) + res = _run(module.upload_document(dataset_id="kb1")) assert res["code"] == 101 assert res["message"] == f"File name must be {FILE_NAME_LEN_LIMIT} bytes or less." 
- def test_filename_whitespace(self, document_app_module, monkeypatch): - module = document_app_module + def test_filename_whitespace(self, document_rest_api_module, monkeypatch): + module = document_rest_api_module async def fake_request_json(): - return {"kb_id": "kb1", "name": " "} + return {"name": " "} monkeypatch.setattr(module, "get_request_json", fake_request_json) - res = _run(module.create.__wrapped__()) + monkeypatch.setattr(module, "request", SimpleNamespace(args={"type": "empty"})) + res = _run(module.upload_document(dataset_id="kb1")) assert res["code"] == 101 assert res["message"] == "File name can't be empty." - def test_kb_not_found(self, document_app_module, monkeypatch): - module = document_app_module + def test_kb_not_found(self, document_rest_api_module, monkeypatch): + module = document_rest_api_module monkeypatch.setattr(module.KnowledgebaseService, "get_by_id", lambda _kb_id: (False, None)) async def fake_request_json(): - return {"kb_id": "missing", "name": "doc.txt"} + return {"name": "doc.txt"} monkeypatch.setattr(module, "get_request_json", fake_request_json) - res = _run(module.create.__wrapped__()) + monkeypatch.setattr(module, "request", SimpleNamespace(args={"type": "empty"})) + res = _run(module.upload_document(dataset_id="missing")) assert res["code"] == 102 - assert res["message"] == "Can't find this dataset!" + assert res["message"] == "Can't find the dataset with ID missing!" 
- def test_duplicate_name(self, document_app_module, monkeypatch): - module = document_app_module - kb = SimpleNamespace(id="kb1", tenant_id="tenant1", name="kb", parser_id="parser", pipeline_id="pipe", parser_config={}) + def test_duplicate_name(self, document_rest_api_module, monkeypatch): + module = document_rest_api_module + kb = _StubKBRecord(id="kb1", tenant_id="tenant1", name="kb", parser_id="parser", pipeline_id="pipe", parser_config={}) monkeypatch.setattr(module.KnowledgebaseService, "get_by_id", lambda _kb_id: (True, kb)) monkeypatch.setattr(module.DocumentService, "query", lambda **_kwargs: [object()]) async def fake_request_json(): - return {"kb_id": "kb1", "name": "doc.txt"} + return {"name": "doc.txt"} monkeypatch.setattr(module, "get_request_json", fake_request_json) - res = _run(module.create.__wrapped__()) + monkeypatch.setattr(module, "request", SimpleNamespace(args={"type": "empty"})) + res = _run(module.upload_document(dataset_id="kb1")) assert res["code"] == 102 assert "Duplicated document name" in res["message"] - def test_root_folder_missing(self, document_app_module, monkeypatch): - module = document_app_module - kb = SimpleNamespace(id="kb1", tenant_id="tenant1", name="kb", parser_id="parser", pipeline_id="pipe", parser_config={}) + def test_root_folder_missing(self, document_rest_api_module, monkeypatch): + module = document_rest_api_module + kb = _StubKBRecord(id="kb1", tenant_id="tenant1", name="kb", parser_id="parser", pipeline_id="pipe", parser_config={}) monkeypatch.setattr(module.KnowledgebaseService, "get_by_id", lambda _kb_id: (True, kb)) monkeypatch.setattr(module.DocumentService, "query", lambda **_kwargs: []) monkeypatch.setattr(module.FileService, "get_kb_folder", lambda *_args, **_kwargs: None) async def fake_request_json(): - return {"kb_id": "kb1", "name": "doc.txt"} + return {"name": "doc.txt"} monkeypatch.setattr(module, "get_request_json", fake_request_json) - res = _run(module.create.__wrapped__()) + 
monkeypatch.setattr(module, "request", SimpleNamespace(args={"type": "empty"})) + res = _run(module.upload_document(dataset_id="kb1")) assert res["code"] == 102 assert res["message"] == "Cannot find the root folder." - def test_kb_folder_missing(self, document_app_module, monkeypatch): - module = document_app_module - kb = SimpleNamespace(id="kb1", tenant_id="tenant1", name="kb", parser_id="parser", pipeline_id="pipe", parser_config={}) + def test_kb_folder_missing(self, document_rest_api_module, monkeypatch): + module = document_rest_api_module + kb = _StubKBRecord(id="kb1", tenant_id="tenant1", name="kb", parser_id="parser", pipeline_id="pipe", parser_config={}) monkeypatch.setattr(module.KnowledgebaseService, "get_by_id", lambda _kb_id: (True, kb)) monkeypatch.setattr(module.DocumentService, "query", lambda **_kwargs: []) monkeypatch.setattr(module.FileService, "get_kb_folder", lambda *_args, **_kwargs: {"id": "root"}) monkeypatch.setattr(module.FileService, "new_a_file_from_kb", lambda *_args, **_kwargs: None) async def fake_request_json(): - return {"kb_id": "kb1", "name": "doc.txt"} + return {"name": "doc.txt"} monkeypatch.setattr(module, "get_request_json", fake_request_json) - res = _run(module.create.__wrapped__()) + monkeypatch.setattr(module, "request", SimpleNamespace(args={"type": "empty"})) + res = _run(module.upload_document(dataset_id="kb1")) assert res["code"] == 102 assert res["message"] == "Cannot find the kb folder for this file." 
- def test_success(self, document_app_module, monkeypatch): - module = document_app_module - kb = SimpleNamespace(id="kb1", tenant_id="tenant1", name="kb", parser_id="parser", pipeline_id="pipe", parser_config={}) + def test_success(self, document_rest_api_module, monkeypatch): + module = document_rest_api_module + kb = _StubKBRecord(id="kb1", tenant_id="tenant1", name="kb", parser_id="parser", pipeline_id="pipe", parser_config={}) monkeypatch.setattr(module.KnowledgebaseService, "get_by_id", lambda _kb_id: (True, kb)) monkeypatch.setattr(module.DocumentService, "query", lambda **_kwargs: []) monkeypatch.setattr(module.FileService, "get_kb_folder", lambda *_args, **_kwargs: {"id": "root"}) @@ -214,9 +217,10 @@ def to_dict(self): monkeypatch.setattr(module.FileService, "add_file_from_kb", lambda *_args, **_kwargs: None) async def fake_request_json(): - return {"kb_id": "kb1", "name": "doc.txt"} + return {"name": "doc.txt"} monkeypatch.setattr(module, "get_request_json", fake_request_json) - res = _run(module.create.__wrapped__()) + monkeypatch.setattr(module, "request", SimpleNamespace(args={"type": "empty"})) + res = _run(module.upload_document(dataset_id="kb1")) assert res["code"] == 0 assert res["data"]["id"] == "doc1" diff --git a/test/testcases/test_web_api/test_document_app/test_upload_documents.py b/test/testcases/test_web_api/test_document_app/test_upload_documents.py index 93305ba9a4f..bb8d805772a 100644 --- a/test/testcases/test_web_api/test_document_app/test_upload_documents.py +++ b/test/testcases/test_web_api/test_document_app/test_upload_documents.py @@ -448,54 +448,64 @@ async def req_no_url(): @pytest.mark.p2 class TestWebCrawlUnit: - def test_missing_kb_id(self, document_app_module, monkeypatch): - module = document_app_module - monkeypatch.setattr(module, "request", _DummyRequest(form={"kb_id": "", "name": "doc", "url": "http://example.com"})) - res = _run(module.web_crawl.__wrapped__()) - assert res["code"] == 101 - assert res["message"] == 'Lack 
of "KB ID"' - - def test_invalid_url(self, document_app_module, monkeypatch): - module = document_app_module - monkeypatch.setattr(module, "request", _DummyRequest(form={"kb_id": "kb1", "name": "doc", "url": "not-a-url"})) - res = _run(module.web_crawl.__wrapped__()) + def test_invalid_url(self, document_rest_api_module, monkeypatch): + module = document_rest_api_module + monkeypatch.setattr( + module, + "request", + _DummyRequest(form={"name": "doc", "url": "not-a-url"}, args={"type": "web"}), + ) + res = _run(module.upload_document(dataset_id="kb1")) assert res["code"] == 101 assert res["message"] == "The URL format is invalid" - def test_invalid_kb_id_raises(self, document_app_module, monkeypatch): - module = document_app_module + def test_invalid_kb_id(self, document_rest_api_module, monkeypatch): + module = document_rest_api_module monkeypatch.setattr(module, "is_valid_url", lambda _url: True) monkeypatch.setattr(module.KnowledgebaseService, "get_by_id", lambda _kb_id: (False, None)) - monkeypatch.setattr(module, "request", _DummyRequest(form={"kb_id": "missing", "name": "doc", "url": "http://example.com"})) - with pytest.raises(LookupError): - _run(module.web_crawl.__wrapped__()) + monkeypatch.setattr( + module, + "request", + _DummyRequest(form={"name": "doc", "url": "http://example.com"}, args={"type": "web"}), + ) + res = _run(module.upload_document(dataset_id="missing")) + assert res["code"] == 102 + assert "Can't find the dataset" in res["message"] - def test_no_permission(self, document_app_module, monkeypatch): - module = document_app_module - kb = SimpleNamespace(id="kb1", tenant_id="tenant1", name="kb", parser_id="parser", parser_config={}) + def test_no_permission(self, document_rest_api_module, monkeypatch): + module = document_rest_api_module + kb = SimpleNamespace(id="kb1", tenant_id="tenant1", name="kb", parser_id="parser", pipeline_id="pipe", parser_config={}) monkeypatch.setattr(module, "is_valid_url", lambda _url: True) 
monkeypatch.setattr(module.KnowledgebaseService, "get_by_id", lambda _kb_id: (True, kb)) monkeypatch.setattr(module, "check_kb_team_permission", lambda *_args, **_kwargs: False) - monkeypatch.setattr(module, "request", _DummyRequest(form={"kb_id": "kb1", "name": "doc", "url": "http://example.com"})) - res = _run(module.web_crawl.__wrapped__()) + monkeypatch.setattr( + module, + "request", + _DummyRequest(form={"name": "doc", "url": "http://example.com"}, args={"type": "web"}), + ) + res = _run(module.upload_document(dataset_id="kb1")) assert res["code"] == 109 assert res["message"] == "No authorization." - def test_download_failure(self, document_app_module, monkeypatch): - module = document_app_module - kb = SimpleNamespace(id="kb1", tenant_id="tenant1", name="kb", parser_id="parser", parser_config={}) + def test_download_failure(self, document_rest_api_module, monkeypatch): + module = document_rest_api_module + kb = SimpleNamespace(id="kb1", tenant_id="tenant1", name="kb", parser_id="parser", pipeline_id="pipe", parser_config={}) monkeypatch.setattr(module, "is_valid_url", lambda _url: True) monkeypatch.setattr(module.KnowledgebaseService, "get_by_id", lambda _kb_id: (True, kb)) monkeypatch.setattr(module, "check_kb_team_permission", lambda *_args, **_kwargs: True) monkeypatch.setattr(module, "html2pdf", lambda _url: None) - monkeypatch.setattr(module, "request", _DummyRequest(form={"kb_id": "kb1", "name": "doc", "url": "http://example.com"})) - res = _run(module.web_crawl.__wrapped__()) + monkeypatch.setattr( + module, + "request", + _DummyRequest(form={"name": "doc", "url": "http://example.com"}, args={"type": "web"}), + ) + res = _run(module.upload_document(dataset_id="kb1")) assert res["code"] == 100 assert "Download failure" in res["message"] - def test_unsupported_type(self, document_app_module, monkeypatch): - module = document_app_module - kb = SimpleNamespace(id="kb1", tenant_id="tenant1", name="kb", parser_id="parser", parser_config={}) + def 
test_unsupported_type(self, document_rest_api_module, monkeypatch): + module = document_rest_api_module + kb = SimpleNamespace(id="kb1", tenant_id="tenant1", name="kb", parser_id="parser", pipeline_id="pipe", parser_config={}) monkeypatch.setattr(module, "is_valid_url", lambda _url: True) monkeypatch.setattr(module.KnowledgebaseService, "get_by_id", lambda _kb_id: (True, kb)) monkeypatch.setattr(module, "check_kb_team_permission", lambda *_args, **_kwargs: True) @@ -505,8 +515,12 @@ def test_unsupported_type(self, document_app_module, monkeypatch): monkeypatch.setattr(module.FileService, "get_kb_folder", lambda *_args, **_kwargs: {"id": "kb_root"}) monkeypatch.setattr(module.FileService, "new_a_file_from_kb", lambda *_args, **_kwargs: {"id": "kb_folder"}) monkeypatch.setattr(module, "duplicate_name", lambda *_args, **_kwargs: "bad.exe") - monkeypatch.setattr(module, "request", _DummyRequest(form={"kb_id": "kb1", "name": "doc", "url": "http://example.com"})) - res = _run(module.web_crawl.__wrapped__()) + monkeypatch.setattr( + module, + "request", + _DummyRequest(form={"name": "doc", "url": "http://example.com"}, args={"type": "web"}), + ) + res = _run(module.upload_document(dataset_id="kb1")) assert res["code"] == 100 assert "supported yet" in res["message"] @@ -519,9 +533,9 @@ def test_unsupported_type(self, document_app_module, monkeypatch): ("mail.eml", "doc", "email"), ], ) - def test_success_parser_overrides(self, document_app_module, monkeypatch, filename, filetype, expected_parser): - module = document_app_module - kb = SimpleNamespace(id="kb1", tenant_id="tenant1", name="kb", parser_id="parser", parser_config={}) + def test_success_parser_overrides(self, document_rest_api_module, monkeypatch, filename, filetype, expected_parser): + module = document_rest_api_module + kb = SimpleNamespace(id="kb1", tenant_id="tenant1", name="kb", parser_id="parser", pipeline_id="pipe", parser_config={}) captured = {} class _Storage: @@ -549,16 +563,20 @@ def insert_doc(doc): 
monkeypatch.setattr(module.settings, "STORAGE_IMPL", _Storage()) monkeypatch.setattr(module.DocumentService, "insert", insert_doc) monkeypatch.setattr(module.FileService, "add_file_from_kb", lambda *_args, **_kwargs: None) - monkeypatch.setattr(module, "request", _DummyRequest(form={"kb_id": "kb1", "name": "doc", "url": "http://example.com"})) + monkeypatch.setattr( + module, + "request", + _DummyRequest(form={"name": "doc", "url": "http://example.com"}, args={"type": "web"}), + ) - res = _run(module.web_crawl.__wrapped__()) + res = _run(module.upload_document(dataset_id="kb1")) assert res["code"] == 0 assert captured["doc"]["parser_id"] == expected_parser assert captured["put"] is True - def test_exception_path(self, document_app_module, monkeypatch): - module = document_app_module - kb = SimpleNamespace(id="kb1", tenant_id="tenant1", name="kb", parser_id="parser", parser_config={}) + def test_exception_path(self, document_rest_api_module, monkeypatch): + module = document_rest_api_module + kb = SimpleNamespace(id="kb1", tenant_id="tenant1", name="kb", parser_id="parser", pipeline_id="pipe", parser_config={}) class _Storage: def obj_exist(self, *_args, **_kwargs): @@ -585,7 +603,11 @@ def insert_doc(_doc): monkeypatch.setattr(module.settings, "STORAGE_IMPL", _Storage()) monkeypatch.setattr(module.DocumentService, "insert", insert_doc) monkeypatch.setattr(module.FileService, "add_file_from_kb", lambda *_args, **_kwargs: None) - monkeypatch.setattr(module, "request", _DummyRequest(form={"kb_id": "kb1", "name": "doc", "url": "http://example.com"})) + monkeypatch.setattr( + module, + "request", + _DummyRequest(form={"name": "doc", "url": "http://example.com"}, args={"type": "web"}), + ) - res = _run(module.web_crawl.__wrapped__()) + res = _run(module.upload_document(dataset_id="kb1")) assert res["code"] == 100 diff --git a/web/src/hooks/use-document-request.ts b/web/src/hooks/use-document-request.ts index 2bc45d9dbe2..1f2e094eecb 100644 --- 
a/web/src/hooks/use-document-request.ts +++ b/web/src/hooks/use-document-request.ts @@ -16,11 +16,13 @@ import { import i18n from '@/locales/config'; import { EMPTY_METADATA_FIELD } from '@/pages/dataset/dataset/use-select-filters'; import kbService, { + createDocument, deleteDocument, documentFilter, listDocument, renameDocument, uploadDocument, + webCrawlDocument, } from '@/services/knowledge-service'; import { restAPIv1, webAPI } from '@/utils/api'; import { getSearchValue } from '@/utils/common-util'; @@ -458,10 +460,10 @@ export const useCreateDocument = () => { } = useMutation({ mutationKey: [DocumentApiAction.CreateDocument], mutationFn: async (name: string) => { - const { data } = await kbService.documentCreate({ - name, - kb_id: id, - }); + if (!id) { + return 500; + } + const data = await createDocument(id, name); if (data.code === 0) { if (page === 1) { queryClient.invalidateQueries({ @@ -525,13 +527,15 @@ export const useNextWebCrawl = () => { } = useMutation({ mutationKey: [DocumentApiAction.WebCrawl], mutationFn: async ({ name, url }: { name: string; url: string }) => { + if (!knowledgeId) { + return 500; + } const formData = new FormData(); formData.append('name', name); formData.append('url', url); - formData.append('kb_id', knowledgeId); - const ret = await kbService.webCrawl(formData); - const code = get(ret, 'data.code'); + const ret = await webCrawlDocument(knowledgeId, formData); + const code = get(ret, 'code'); if (code === 0) { message.success(i18n.t('message.uploaded')); } diff --git a/web/src/services/knowledge-service.ts b/web/src/services/knowledge-service.ts index b9473118302..a06c6ef669f 100644 --- a/web/src/services/knowledge-service.ts +++ b/web/src/services/knowledge-service.ts @@ -16,11 +16,11 @@ const { rmKb, kbList, documentChangeStatus, - documentCreate, documentChangeParser, documentThumbnails, retrievalTest, documentRun, + documentUpload, webCrawl, knowledgeGraph, listTagByKnowledgeIds, @@ -47,10 +47,6 @@ const methods = { url: 
documentChangeStatus, method: 'post', }, - documentCreate: { - url: documentCreate, - method: 'post', - }, documentRun: { url: documentRun, method: 'post', @@ -63,6 +59,10 @@ const methods = { url: documentThumbnails, method: 'get', }, + documentUpload: { + url: documentUpload, + method: 'post', + }, webCrawl: { url: webCrawl, method: 'post', @@ -303,6 +303,25 @@ export const uploadDocument = async (datasetId: string, formData: FormData) => { return response.data; }; +export const createDocument = async (datasetId: string, name: string) => { + const response = await request.post(api.documentCreate(datasetId), { + data: { name }, + }); + return response.data; +}; + +export const webCrawlDocument = async ( + datasetId: string, + formData: FormData, +) => { + const response = await axios.post(api.webCrawl(datasetId), formData, { + headers: { + [Authorization]: getAuthorization(), + }, + }); + return response.data; +}; + export const renameDocument = ( datasetId: string, documentId: string, diff --git a/web/src/utils/api.ts b/web/src/utils/api.ts index df797937b9e..b8b3605c947 100644 --- a/web/src/utils/api.ts +++ b/web/src/utils/api.ts @@ -118,7 +118,8 @@ export default { `${restAPIv1}/datasets/${datasetId}/documents`, documentRename: (datasetId: string, documentId: string) => `${restAPIv1}/datasets/${datasetId}/documents/${documentId}`, - documentCreate: `${webAPI}/document/create`, + documentCreate: (datasetId: string) => + `${restAPIv1}/datasets/${datasetId}/documents?type=empty`, documentRun: `${webAPI}/document/run`, documentChangeParser: `${webAPI}/document/change_parser`, documentThumbnails: `${webAPI}/document/thumbnails`, @@ -127,7 +128,8 @@ export default { `${webAPI}/document/download/${docId}`, documentUpload: (datasetId: string) => `${restAPIv1}/datasets/${datasetId}/documents`, - webCrawl: `${webAPI}/document/web_crawl`, + webCrawl: (datasetId: string) => + `${restAPIv1}/datasets/${datasetId}/documents?type=web`, uploadAndParse: 
`${webAPI}/document/upload_info`, setMeta: `${webAPI}/document/set_meta`, getDatasetFilter: (datasetId: string) =>