diff --git a/api/apps/restful_apis/document_api.py b/api/apps/restful_apis/document_api.py index 7300a55a9f7..3b9bb3e0a5a 100644 --- a/api/apps/restful_apis/document_api.py +++ b/api/apps/restful_apis/document_api.py @@ -733,7 +733,7 @@ def list_docs(dataset_id, tenant_id): renamed_doc_list = [map_doc_keys(doc) for doc in payload] for doc_item in renamed_doc_list: if doc_item["thumbnail"] and not doc_item["thumbnail"].startswith(IMG_BASE64_PREFIX): - doc_item["thumbnail"] = f"/api/v1/documents/images/{dataset_id}-{doc_item['thumbnail']}" + doc_item["thumbnail"] = f"/api/v1/documents/{doc_item['id']}/thumbnail" if doc_item.get("source_type"): doc_item["source_type"] = doc_item["source_type"].split("/")[0] if doc_item["parser_config"].get("metadata"): @@ -1165,23 +1165,33 @@ async def update_metadata_config(tenant_id, dataset_id, document_id): @manager.route("/thumbnails", methods=["GET"]) # noqa: F821 +@login_required def list_thumbnails(): """ - Get thumbnails for documents. + Get thumbnails for documents the caller can access. --- tags: - Documents + security: + - ApiKeyAuth: [] parameters: - in: query name: doc_ids type: array required: true description: List of document IDs to get thumbnails for. + - in: header + name: Authorization + type: string + required: true + description: Bearer token for authentication. responses: 200: - description: Successfully retrieved thumbnails + description: Successfully retrieved thumbnails. Inaccessible IDs are + silently filtered out so the response cannot be used to + enumerate cross-tenant document IDs. 400: - description: Missing document IDs + description: Missing document IDs. """ from api.constants import IMG_BASE64_PREFIX from api.db.services.document_service import DocumentService @@ -1191,11 +1201,15 @@ def list_thumbnails(): return get_json_result(data=False, message='Lack of "Document ID"', code=RetCode.ARGUMENT_ERROR) try: - docs = DocumentService.get_thumbnails(doc_ids) + accessible_ids = [doc_id for doc_id in doc_ids if DocumentService.accessible(doc_id, current_user.id)] + if not accessible_ids: + return get_json_result(data={}) + + docs = DocumentService.get_thumbnails(accessible_ids) for doc_item in docs: if doc_item["thumbnail"] and not doc_item["thumbnail"].startswith(IMG_BASE64_PREFIX): - doc_item["thumbnail"] = f"/api/v1/documents/images/{doc_item['kb_id']}-{doc_item['thumbnail']}" + doc_item["thumbnail"] = f"/api/v1/documents/{doc_item['id']}/thumbnail" return get_json_result(data={d["id"]: d["thumbnail"] for d in docs}) except Exception as e: @@ -1615,20 +1629,30 @@ def _run_sync(): return get_error_data_result(message="Internal server error") +# Chunk image keys are xxhash64 hex digests (16 chars) generated in +# rag/svr/task_executor.py. Restricting the storage key to this shape +# prevents the endpoint from being coerced into serving arbitrary objects +# (e.g. raw documents) that happen to share the bucket. +_CHUNK_IMAGE_KEY_RE = re.compile(r"^[0-9a-f]{16}$") + + @manager.route("/documents/images/", methods=["GET"]) # noqa: F821 +@login_required async def get_document_image(image_id): """ - Get a document image by ID. + Get a chunk reference image by ID. --- tags: - Documents + security: + - ApiKeyAuth: [] parameters: - name: image_id in: path required: true schema: type: string - description: The image ID (format: bucket-name-image-name) + description: The chunk image ID (format: kb_id-chunk_xxhash) responses: 200: description: Image file @@ -1639,11 +1663,17 @@ async def get_document_image(image_id): format: binary """ try: - arr = image_id.split("-") - if len(arr) != 2: + kb_id, sep, key = image_id.rpartition("-") + if not sep or not kb_id: + logging.warning("get_document_image: malformed image_id=%r user_id=%s", image_id, current_user.id) + return get_data_error_result(message="Image not found.") + if not _CHUNK_IMAGE_KEY_RE.match(key): + logging.warning("get_document_image: invalid key shape image_id=%s user_id=%s", image_id, current_user.id) return get_data_error_result(message="Image not found.") - bkt, nm = image_id.split("-") - data = await thread_pool_exec(settings.STORAGE_IMPL.get, bkt, nm) + if not KnowledgebaseService.accessible(kb_id, current_user.id): + logging.warning("get_document_image: access denied image_id=%s user_id=%s", image_id, current_user.id) + return get_data_error_result(message="No authorization.") + data = await thread_pool_exec(settings.STORAGE_IMPL.get, kb_id, key) response = await make_response(data) response.headers.set("Content-Type", "image/JPEG") return response @@ -1651,6 +1681,54 @@ async def get_document_image(image_id): return server_error_response(e) +@manager.route("/documents//thumbnail", methods=["GET"]) # noqa: F821 +@login_required +async def get_document_thumbnail(doc_id): + """ + Get a document's thumbnail image. + --- + tags: + - Documents + security: + - ApiKeyAuth: [] + parameters: + - name: doc_id + in: path + required: true + schema: + type: string + description: The document ID. + responses: + 200: + description: Thumbnail image + content: + image/png: + schema: + type: string + format: binary + """ + try: + if not DocumentService.accessible(doc_id, current_user.id): + logging.warning("get_document_thumbnail: access denied doc_id=%s user_id=%s", doc_id, current_user.id) + return get_data_error_result(message="No authorization.") + e, doc = DocumentService.get_by_id(doc_id) + if not e: + logging.warning("get_document_thumbnail: document not found doc_id=%s", doc_id) + return get_data_error_result(message="Document not found.") + # Storage key shape mirrors the producer in api/db/services/file_service.py. + thumbnail_key = f"thumbnail_{doc.id}.png" + data = await thread_pool_exec(settings.STORAGE_IMPL.get, doc.kb_id, thumbnail_key) + if not data: + logging.info("get_document_thumbnail: missing thumbnail doc_id=%s key=%s", doc.id, thumbnail_key) + return get_data_error_result(message="Thumbnail not found.") + response = await make_response(data) + response.headers.set("Content-Type", "image/png") + return response + except Exception as e: + logging.exception("get_document_thumbnail failed doc_id=%s", doc_id) + return server_error_response(e) + + ARTIFACT_CONTENT_TYPES = { ".png": "image/png", ".jpg": "image/jpeg", diff --git a/test/testcases/test_web_api/test_common.py b/test/testcases/test_web_api/test_common.py index 170d530af1a..459982700f3 100644 --- a/test/testcases/test_web_api/test_common.py +++ b/test/testcases/test_web_api/test_common.py @@ -419,6 +419,11 @@ def document_get(auth, document_id, *, headers=HEADERS, data=None): return res +def document_thumbnail(auth, document_id, *, headers=HEADERS, data=None): + res = requests.get(url=f"{HOST_ADDRESS}/api/{VERSION}/documents/{document_id}/thumbnail", headers=headers, auth=auth, data=data) + return res + + def document_download(auth, attachment_id, *, ext="markdown", headers=HEADERS, data=None): res = requests.get( url=f"{HOST_ADDRESS}/api/{VERSION}/documents/{attachment_id}/download", diff --git a/test/testcases/test_web_api/test_document_app/test_document_metadata.py b/test/testcases/test_web_api/test_document_app/test_document_metadata.py index 71bf32d5658..126353713d5 100644 --- a/test/testcases/test_web_api/test_document_app/test_document_metadata.py +++ b/test/testcases/test_web_api/test_document_app/test_document_metadata.py @@ -23,6 +23,7 @@ document_infos, document_metadata_summary, document_metadata_update, + document_thumbnail, document_update_metadata_setting, bulk_upload_documents, delete_document, @@ -54,6 +55,15 @@ def test_infos_auth_invalid(self, invalid_auth, expected_code, expected_fragment assert res["code"] == expected_code, res assert expected_fragment in res["message"], res + @pytest.mark.p2 + @pytest.mark.parametrize("invalid_auth, expected_code, expected_fragment", INVALID_AUTH_CASES) + def test_thumbnail_auth_invalid(self, invalid_auth, expected_code, expected_fragment, add_document_func): + """E2E: anonymous and bad-token requests to the thumbnail endpoint must be rejected.""" + _, doc_id = add_document_func + res = document_thumbnail(invalid_auth, doc_id) + assert res.status_code == expected_code, res.text + assert expected_fragment in res.text, res.text + ## The inputs has been changed to add 'doc_ids' ## TODO: #@pytest.mark.p2 @@ -451,8 +461,8 @@ async def raise_error(*_args, **_kwargs): assert "download boom" in res["message"] - @pytest.mark.skip(reason="Moved to /api/v1/documents/images/") - def test_get_image_success_and_exception_unit(self, document_app_module, monkeypatch): + def test_get_document_image_authz_and_validation_unit(self, document_app_module, monkeypatch): + """Covers the auth/validation gates added for issue #14763.""" module = document_app_module class _Headers(dict): @@ -464,6 +474,36 @@ def __init__(self, data): self.data = data self.headers = _Headers() + # Malformed image_id (no hyphen split) -> "Image not found." before any + # storage call. + accessible_calls = [] + monkeypatch.setattr( + module.KnowledgebaseService, + "accessible", + lambda kb_id, user_id: accessible_calls.append((kb_id, user_id)) or True, + ) + res = _run(module.get_document_image("nohyphen")) + assert res["code"] == RetCode.DATA_ERROR + assert "Image not found." in res["message"] + + # Confused-deputy attempt: a thumbnail filename or raw doc filename in + # the key half MUST be rejected before any access or storage call so + # the endpoint can't be coerced into serving arbitrary objects. + for bad_key in ("thumbnail_abc.png", "report.pdf", "AAAAAAAAAAAAAAAA", "deadbeefcafebabe1"): + res = _run(module.get_document_image(f"kb1-{bad_key}")) + assert res["code"] == RetCode.DATA_ERROR + assert "Image not found." in res["message"] + assert accessible_calls == [], "key validation must run before access check" + + # Cross-tenant denial on a well-formed chunk image_id. + monkeypatch.setattr(module.KnowledgebaseService, "accessible", lambda kb_id, user_id: False) + res = _run(module.get_document_image("kb1-deadbeefcafebabe")) + assert res["code"] == RetCode.DATA_ERROR + assert "No authorization." in res["message"] + + # Authorized happy path. + monkeypatch.setattr(module.KnowledgebaseService, "accessible", lambda kb_id, user_id: True) + async def fake_thread_pool_exec(*_args, **_kwargs): return b"image-bytes" @@ -473,7 +513,7 @@ async def fake_make_response(data): monkeypatch.setattr(module, "thread_pool_exec", fake_thread_pool_exec) monkeypatch.setattr(module, "make_response", fake_make_response) monkeypatch.setattr(module.settings, "STORAGE_IMPL", SimpleNamespace(get=lambda *_args, **_kwargs: b"image-bytes")) - res = _run(module.get_image("bucket-name")) + res = _run(module.get_document_image("kb1-deadbeefcafebabe")) assert isinstance(res, _ImageResponse) assert res.data == b"image-bytes" assert res.headers["Content-Type"] == "image/JPEG" @@ -483,10 +523,152 @@ async def raise_error(*_args, **_kwargs): monkeypatch.setattr(module, "thread_pool_exec", raise_error) monkeypatch.setattr(module, "server_error_response", lambda e: {"code": 500, "message": str(e)}) - res = _run(module.get_image("bucket-name")) + res = _run(module.get_document_image("kb1-deadbeefcafebabe")) assert res["code"] == 500 assert "image boom" in res["message"] + def test_get_document_thumbnail_authz_and_success_unit(self, document_app_module, monkeypatch): + module = document_app_module + + class _Headers(dict): + def set(self, key, value): + self[key] = value + + class _ImageResponse: + def __init__(self, data): + self.data = data + self.headers = _Headers() + + # Cross-tenant access denied -> "No authorization." before any storage + # lookup. Stub get_by_id to a valid document so this test only passes + # via the accessible() early return. + accessible_calls = [] + + def fake_accessible_denied(doc_id, user_id): + accessible_calls.append((doc_id, user_id)) + return False + + monkeypatch.setattr(module.DocumentService, "accessible", fake_accessible_denied) + monkeypatch.setattr( + module.DocumentService, + "get_by_id", + lambda _doc_id: (True, SimpleNamespace(id="doc1", kb_id="kb1")), + ) + res = _run(module.get_document_thumbnail("doc1")) + assert res["code"] == RetCode.DATA_ERROR + assert "No authorization." in res["message"] + assert accessible_calls == [("doc1", "user-1")] + + # Authorized but document missing. + monkeypatch.setattr(module.DocumentService, "accessible", lambda _doc_id, _user_id: True) + monkeypatch.setattr(module.DocumentService, "get_by_id", lambda _doc_id: (False, None)) + res = _run(module.get_document_thumbnail("doc1")) + assert res["code"] == RetCode.DATA_ERROR + assert "Document not found." in res["message"] + + # Authorized and document exists, but storage returns no thumbnail. + monkeypatch.setattr( + module.DocumentService, + "get_by_id", + lambda _doc_id: (True, SimpleNamespace(id="doc1", kb_id="kb1")), + ) + + async def fake_thread_pool_exec_empty(*_args, **_kwargs): + return b"" + + async def fake_make_response(data): + return _ImageResponse(data) + + monkeypatch.setattr(module, "thread_pool_exec", fake_thread_pool_exec_empty) + monkeypatch.setattr(module, "make_response", fake_make_response) + monkeypatch.setattr(module.settings, "STORAGE_IMPL", SimpleNamespace(get=lambda *_args, **_kwargs: b"")) + res = _run(module.get_document_thumbnail("doc1")) + assert res["code"] == RetCode.DATA_ERROR + assert "Thumbnail not found." in res["message"] + + # Happy path: storage returns thumbnail bytes; storage key is derived + # from doc.id (not user input), and Content-Type is image/png. + storage_calls = [] + + def capture_storage_get(bucket, key, *_args, **_kwargs): + storage_calls.append((bucket, key)) + return b"thumb-bytes" + + async def fake_thread_pool_exec_ok(func, *args, **kwargs): + return func(*args, **kwargs) + + monkeypatch.setattr(module, "thread_pool_exec", fake_thread_pool_exec_ok) + monkeypatch.setattr(module.settings, "STORAGE_IMPL", SimpleNamespace(get=capture_storage_get)) + res = _run(module.get_document_thumbnail("doc1")) + assert isinstance(res, _ImageResponse) + assert res.data == b"thumb-bytes" + assert res.headers["Content-Type"] == "image/png" + assert storage_calls == [("kb1", "thumbnail_doc1.png")] + + # Exception path. + async def raise_error(*_args, **_kwargs): + raise RuntimeError("thumb boom") + + monkeypatch.setattr(module, "thread_pool_exec", raise_error) + monkeypatch.setattr(module, "server_error_response", lambda e: {"code": 500, "message": str(e)}) + res = _run(module.get_document_thumbnail("doc1")) + assert res["code"] == 500 + assert "thumb boom" in res["message"] + + def test_list_thumbnails_filters_inaccessible_docs_unit(self, document_app_module, monkeypatch): + """`/thumbnails` must only return entries for docs the caller can access.""" + module = document_app_module + + monkeypatch.setattr(module, "request", _DummyRequest(args={"doc_ids": ["doc-ok", "doc-deny"]})) + + accessible_calls = [] + + def fake_accessible(doc_id, user_id): + accessible_calls.append((doc_id, user_id)) + return doc_id == "doc-ok" + + monkeypatch.setattr(module.DocumentService, "accessible", fake_accessible) + + thumbnail_calls = [] + + def fake_get_thumbnails(doc_ids): + thumbnail_calls.append(list(doc_ids)) + return [{"id": "doc-ok", "kb_id": "kb1", "thumbnail": "thumbnail_doc-ok.png"}] + + monkeypatch.setattr(module.DocumentService, "get_thumbnails", fake_get_thumbnails) + + res = module.list_thumbnails() + assert res["code"] == 0 + assert res["data"] == {"doc-ok": "/api/v1/documents/doc-ok/thumbnail"} + assert thumbnail_calls == [["doc-ok"]], "denied doc must not be passed to get_thumbnails" + assert ("doc-deny", "user-1") in accessible_calls + + # All denied -> empty map, no storage lookup at all. + thumbnail_calls.clear() + monkeypatch.setattr(module.DocumentService, "accessible", lambda *_a, **_kw: False) + res = module.list_thumbnails() + assert res["code"] == 0 + assert res["data"] == {} + assert thumbnail_calls == [] + + def test_list_thumbnails_missing_doc_ids_unit(self, document_app_module, monkeypatch): + """`/thumbnails` must reject empty input before any access check.""" + module = document_app_module + monkeypatch.setattr(module, "request", _DummyRequest(args={})) + + accessible_calls = [] + monkeypatch.setattr( + module.DocumentService, + "accessible", + lambda doc_id, user_id: accessible_calls.append((doc_id, user_id)) or True, + ) + + res = module.list_thumbnails() + assert res["code"] == RetCode.ARGUMENT_ERROR + assert 'Lack of "Document ID"' in res["message"] + assert accessible_calls == [], "no access checks should run when input is invalid" + + class TestDocumentBatchChangeStatus: @pytest.mark.p2 def test_change_status_partial_failure_matrix(self, WebApiAuth, add_dataset, ragflow_tmp_dir):