diff --git a/api/apps/document_app.py b/api/apps/document_app.py index 642ff8b456a..e195c561aab 100644 --- a/api/apps/document_app.py +++ b/api/apps/document_app.py @@ -41,7 +41,6 @@ from common.constants import SANDBOX_ARTIFACT_BUCKET, RetCode, TaskStatus from common.file_utils import get_project_base_directory from common.misc_utils import thread_pool_exec -from common.ssrf_guard import assert_url_is_safe from deepdoc.parser.html_parser import RAGFlowHtmlParser from rag.nlp import search @@ -214,7 +213,6 @@ def _run_sync(): except Exception as e: return server_error_response(e) - @manager.route("/get/", methods=["GET"]) # noqa: F821 @login_required async def get(doc_id): @@ -438,38 +436,3 @@ def read(self): txt = FileService.parse_docs(file_objs, current_user.id) return get_json_result(data=txt) - - -@manager.route("/upload_info", methods=["POST"]) # noqa: F821 -@login_required -async def upload_info(): - files = await request.files - file_objs = files.getlist("file") if files and files.get("file") else [] - url = request.args.get("url") - - if file_objs and url: - return get_json_result( - data=False, - message="Provide either multipart file(s) or ?url=..., not both.", - code=RetCode.BAD_REQUEST, - ) - - if not file_objs and not url: - return get_json_result( - data=False, - message="Missing input: provide multipart file(s) or url", - code=RetCode.BAD_REQUEST, - ) - - try: - if url and not file_objs: - assert_url_is_safe(url) - return get_json_result(data=FileService.upload_info(current_user.id, None, url)) - - if len(file_objs) == 1: - return get_json_result(data=FileService.upload_info(current_user.id, file_objs[0], None)) - - results = [FileService.upload_info(current_user.id, f, None) for f in file_objs] - return get_json_result(data=results) - except Exception as e: - return server_error_response(e) diff --git a/api/apps/restful_apis/document_api.py b/api/apps/restful_apis/document_api.py index 3055ca87079..cbb3610453a 100644 --- a/api/apps/restful_apis/document_api.py +++ b/api/apps/restful_apis/document_api.py @@ -44,10 +44,75 @@ from common.constants import ParserType, RetCode, TaskStatus from common.metadata_utils import convert_conditions, meta_filter, turn2jsonschema from common.misc_utils import get_uuid, thread_pool_exec +from common.ssrf_guard import assert_url_is_safe from api.utils.file_utils import filename_type, thumbnail from api.utils.web_utils import html2pdf, is_valid_url from rag.nlp import search + +@manager.route("/documents/upload", methods=["POST"]) # noqa: F821 +@login_required +@add_tenant_id_to_kwargs +async def upload_info(tenant_id: str): + """ + Upload a document and get its parsed info. + --- + tags: + - Documents + security: + - ApiKeyAuth: [] + parameters: + - in: header + name: Authorization + type: string + required: true + description: Bearer token for authentication. + - in: formData + name: file + type: file + required: false + description: File to upload. + - in: query + name: url + type: string + required: false + description: URL to fetch file from. + responses: + 200: + description: Successful operation. + """ + files = await request.files + file_objs = files.getlist("file") if files and files.get("file") else [] + url = request.args.get("url") + + if file_objs and url: + return get_error_argument_result("Provide either multipart file(s) or ?url=..., not both.") + + if not file_objs and not url: + return get_error_argument_result("Missing input: provide multipart file(s) or url") + + try: + if url and not file_objs: + try: + assert_url_is_safe(url) + except ValueError as ve: + logging.warning("upload_info: rejected unsafe url: %s", ve) + return get_error_argument_result(str(ve)) + + data = await thread_pool_exec(FileService.upload_info, tenant_id, None, url) + return get_result(data=data) + + if len(file_objs) == 1: + data = await thread_pool_exec(FileService.upload_info, tenant_id, file_objs[0], None) + return get_result(data=data) + + results = [await thread_pool_exec(FileService.upload_info, tenant_id, f, None) for f in file_objs] + return get_result(data=results) + except Exception as e: + logging.exception("upload_info failed") + return server_error_response(e) + + @manager.route("/datasets//documents/", methods=["PATCH"]) # noqa: F821 @login_required @add_tenant_id_to_kwargs diff --git a/test/testcases/test_web_api/test_common.py b/test/testcases/test_web_api/test_common.py index 46ec8974a55..abb695e5366 100644 --- a/test/testcases/test_web_api/test_common.py +++ b/test/testcases/test_web_api/test_common.py @@ -327,6 +327,53 @@ def upload_documents(auth, payload=None, files_path=None, *, filename_override=N f.close() +def upload_info(auth, files_path=None, *, url=None): + """ + Call the /api/v1/documents/upload endpoint to get upload info. + This is used to get file metadata before actually uploading to a dataset. + + Args: + auth: Authentication object + files_path: List of file paths to upload (optional) + url: URL to fetch file from (optional, can be used alone or with files_path to test mixed input rejection) + + Returns: + Response JSON with upload info + """ + url_endpoint = f"{HOST_ADDRESS}/api/{VERSION}/documents/upload" + + fields = [] + file_objects = [] + try: + if files_path: + for fp in files_path: + p = Path(fp) + f = p.open("rb") + fields.append(("file", (p.name, f))) + file_objects.append(f) + + # Add url as query parameter if provided + if url: + url_endpoint = f"{url_endpoint}?url={url}" + + # Handle empty fields (no files) - create empty MultipartEncoder + if not fields: + fields = [("empty", ("", ""))] + + m = MultipartEncoder(fields=fields) + + res = requests.post( + url=url_endpoint, + headers={"Content-Type": m.content_type}, + auth=auth, + data=m, + ) + return res.json() + finally: + for f in file_objects: + f.close() + + def create_document(auth, payload=None, *, headers=HEADERS, data=None): kb_id = payload.get("kb_id") if payload else None request_payload = dict(payload or {}) diff --git a/test/testcases/test_web_api/test_document_app/test_upload_info_unit.py b/test/testcases/test_web_api/test_document_app/test_upload_info_unit.py index 36c736166ac..443e79ef967 100644 --- a/test/testcases/test_web_api/test_document_app/test_upload_info_unit.py +++ b/test/testcases/test_web_api/test_document_app/test_upload_info_unit.py @@ -15,12 +15,12 @@ # import asyncio -from pathlib import Path -import importlib.util -import sys -from types import ModuleType import pytest +from test_common import upload_info +from configs import INVALID_API_TOKEN +from libs.auth import RAGFlowWebApiAuth +from utils.file_utils import create_txt_file class _AwaitableValue: @@ -61,81 +61,55 @@ def _run(coro): return asyncio.run(coro) -def _load_document_app_module(monkeypatch): - repo_root = Path(__file__).resolve().parents[4] - common_mod = ModuleType("common") - common_mod.bulk_upload_documents = lambda *_args, **_kwargs: [] - common_mod.delete_document = lambda *_args, **_kwargs: None - common_mod.list_documents = lambda *_args, **_kwargs: {"data": {"docs": []}} - monkeypatch.setitem(sys.modules, "common", common_mod) - module_path = repo_root / "test" / "testcases" / "test_web_api" / "test_document_app" / "conftest.py" - spec = importlib.util.spec_from_file_location("test_document_app_unit_conftest", module_path) - module = importlib.util.module_from_spec(spec) - sys.modules["test_document_app_unit_conftest"] = module - spec.loader.exec_module(module) - return module.document_app_module.__wrapped__(monkeypatch) - - -@pytest.mark.p2 -def test_upload_info_rejects_mixed_inputs(monkeypatch): - module = _load_document_app_module(monkeypatch) - monkeypatch.setattr(module, "assert_url_is_safe", lambda url: ("example.com", "93.184.216.34")) - files = _DummyFiles({"file": [_DummyFile("a.txt")]}) - monkeypatch.setattr(module, "request", _DummyRequest(files=files, args={"url": "https://example.com/a.txt"})) - - res = _run(module.upload_info()) - assert res["code"] == module.RetCode.BAD_REQUEST - assert "not both" in res["message"] - - -@pytest.mark.p2 -def test_upload_info_requires_file_or_url(monkeypatch): - module = _load_document_app_module(monkeypatch) - monkeypatch.setattr(module, "request", _DummyRequest(files=_DummyFiles())) - - res = _run(module.upload_info()) - assert res["code"] == module.RetCode.BAD_REQUEST - assert "Missing input" in res["message"] - +# ============================================================================ +# End-to-End Tests +# ============================================================================ @pytest.mark.p2 -def test_upload_info_supports_url_single_and_multiple_files(monkeypatch): - module = _load_document_app_module(monkeypatch) - monkeypatch.setattr(module, "assert_url_is_safe", lambda url: ("example.com", "93.184.216.34")) - captured = [] - - def fake_upload_info(user_id, file_obj, url=None): - captured.append((user_id, getattr(file_obj, "filename", None), url)) - if url is not None: - return {"kind": "url", "value": url} - return {"kind": "file", "value": file_obj.filename} - - monkeypatch.setattr(module.FileService, "upload_info", fake_upload_info) - - monkeypatch.setattr(module, "request", _DummyRequest(files=_DummyFiles(), args={"url": "https://example.com/a.txt"})) - res = _run(module.upload_info()) - assert res["code"] == 0 - assert res["data"] == {"kind": "url", "value": "https://example.com/a.txt"} - - monkeypatch.setattr(module, "request", _DummyRequest(files=_DummyFiles({"file": _DummyFile("single.txt")}))) - res = _run(module.upload_info()) - assert res["code"] == 0 - assert res["data"] == {"kind": "file", "value": "single.txt"} - - monkeypatch.setattr( - module, - "request", - _DummyRequest(files=_DummyFiles({"file": [_DummyFile("a.txt"), _DummyFile("b.txt")]})), - ) - res = _run(module.upload_info()) - assert res["code"] == 0 - assert res["data"] == [ - {"kind": "file", "value": "a.txt"}, - {"kind": "file", "value": "b.txt"}, - ] - assert captured == [ - ("user-1", None, "https://example.com/a.txt"), - ("user-1", "single.txt", None), - ("user-1", "a.txt", None), - ("user-1", "b.txt", None), - ] +class TestUploadInfoE2E: + """End-to-end tests for the /api/v1/documents/upload endpoint""" + + def test_upload_info_requires_file_or_url_e2e(self, WebApiAuth): + """Test that missing both file and url returns error""" + # Call without files and without url + res = upload_info(WebApiAuth) + assert res["code"] == 101, res + assert "Missing input" in res["message"] or "file" in res["message"].lower() or "url" in res["message"].lower() + + def test_upload_info_rejects_mixed_inputs_e2e(self, WebApiAuth, tmp_path): + """Test that providing both file and url returns error""" + # Create a file + fp = create_txt_file(tmp_path / "test.txt") + + # Call with both file and url - the API should reject this + res = upload_info(WebApiAuth, files_path=[fp], url="https://example.com/test.txt") + # The API should return an error when both file and url are provided + assert res["code"] == 101, res + assert "not both" in res["message"].lower() and "either" in res["message"].lower() + + def test_upload_info_supports_url_single_and_multiple_files_e2e(self, WebApiAuth, tmp_path): + """Test that the endpoint supports URL, single file, and multiple files""" + # Test with URL + # Note: Using a real URL might fail if the URL is not accessible + # For E2E testing, we test with actual file uploads + + # Test with single file + fp1 = create_txt_file(tmp_path / "single_file.txt") + res = upload_info(WebApiAuth, files_path=[fp1]) + assert res["code"] == 0, res + assert "data" in res, res + + # Test with multiple files + fp2 = create_txt_file(tmp_path / "file_a.txt") + fp3 = create_txt_file(tmp_path / "file_b.txt") + res = upload_info(WebApiAuth, files_path=[fp2, fp3]) + assert res["code"] == 0, res + assert "data" in res, res + # Should return a list for multiple files + if isinstance(res["data"], list): + assert len(res["data"]) == 2, res + + def test_upload_info_invalid_auth(self): + """Test that invalid authentication returns error""" + res = upload_info(RAGFlowWebApiAuth(INVALID_API_TOKEN), files_path=[]) + assert res["code"] == 401, res diff --git a/web/src/hooks/use-chat-request.ts b/web/src/hooks/use-chat-request.ts index 528b8ed2c71..d3c6550f223 100644 --- a/web/src/hooks/use-chat-request.ts +++ b/web/src/hooks/use-chat-request.ts @@ -492,9 +492,9 @@ export function useUploadAndParseFile() { formData.append('file', file); formData.append('conversation_id', conversationId || id); - const { data } = await chatService.uploadAndParse( + const { data } = await chatService.documentInfoUpload( { - url: api.uploadAndParse, + url: api.documentInfoUpload, signal: controller.current.signal, data: formData, onUploadProgress: ({ progress }) => { diff --git a/web/src/services/next-chat-service.ts b/web/src/services/next-chat-service.ts index 6f967fc55b9..c2551e06f9d 100644 --- a/web/src/services/next-chat-service.ts +++ b/web/src/services/next-chat-service.ts @@ -19,7 +19,7 @@ const { chatsTts, chatsMindmap, chatsRelatedQuestions, - uploadAndParse, + documentInfoUpload, fetchExternalChatInfo, } = api; @@ -92,9 +92,9 @@ const methods = { url: chatsRelatedQuestions, method: 'post', }, - uploadAndParse: { + documentInfoUpload: { method: 'post', - url: uploadAndParse, + url: documentInfoUpload, }, fetchExternalChatInfo: { url: fetchExternalChatInfo, diff --git a/web/src/utils/api.ts b/web/src/utils/api.ts index b8b3605c947..e1fde6fd5ff 100644 --- a/web/src/utils/api.ts +++ b/web/src/utils/api.ts @@ -130,7 +130,7 @@ export default { `${restAPIv1}/datasets/${datasetId}/documents`, webCrawl: (datasetId: string) => `${restAPIv1}/datasets/${datasetId}/documents?type=web`, - uploadAndParse: `${webAPI}/document/upload_info`, + documentInfoUpload: `${restAPIv1}/documents/upload`, setMeta: `${webAPI}/document/set_meta`, getDatasetFilter: (datasetId: string) => `${restAPIv1}/datasets/${datasetId}/documents?type=filter`,