Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
116 changes: 110 additions & 6 deletions api/apps/restful_apis/document_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@

from common import settings
from common.constants import ParserType, RetCode, TaskStatus, SANDBOX_ARTIFACT_BUCKET
from common.doc_store.doc_store_base import OrderByExpr
from common.metadata_utils import convert_conditions, meta_filter, turn2jsonschema
from common.misc_utils import get_uuid, thread_pool_exec
from api.utils.file_utils import filename_type, thumbnail
Expand Down Expand Up @@ -733,7 +734,7 @@ def list_docs(dataset_id, tenant_id):
renamed_doc_list = [map_doc_keys(doc) for doc in payload]
for doc_item in renamed_doc_list:
if doc_item["thumbnail"] and not doc_item["thumbnail"].startswith(IMG_BASE64_PREFIX):
doc_item["thumbnail"] = f"/api/v1/documents/images/{dataset_id}-{doc_item['thumbnail']}"
doc_item["thumbnail"] = _document_thumbnail_url(doc_item["id"])
if doc_item.get("source_type"):
doc_item["source_type"] = doc_item["source_type"].split("/")[0]
if doc_item["parser_config"].get("metadata"):
Expand Down Expand Up @@ -835,6 +836,54 @@ def _get_docs_with_request(req, dataset_id:str):
return RetCode.SUCCESS, "", docs, total


def _document_thumbnail_url(doc_id: str) -> str:
return f"/api/v1/documents/{doc_id}/thumbnail"


def _apply_image_response_headers(response, filename: str, default_content_type: str):
ext = Path(filename).suffix.lower().lstrip(".") or None
content_type = CONTENT_TYPE_MAP.get(ext, default_content_type) if ext else default_content_type
apply_safe_file_response_headers(response, content_type, ext)


def _get_accessible_chunk_image_doc_id(image_id: str) -> str | None:
arr = image_id.split("-", 1)
if len(arr) != 2:
return None

kb_id, _ = arr
if not KnowledgebaseService.accessible(kb_id, current_user.id):
return None

e, kb = KnowledgebaseService.get_by_id(kb_id)
if not e:
return None

index_name = search.index_name(kb.tenant_id)
if not settings.docStoreConn.index_exist(index_name, kb_id):
return None

result = settings.docStoreConn.search(
["doc_id"],
[],
{"img_id": image_id},
[],
OrderByExpr(),
0,
1,
index_name,
[kb_id],
)
fields = settings.docStoreConn.get_fields(result, ["doc_id"])
if not fields:
return None

doc_id = next(iter(fields.values())).get("doc_id")
if doc_id and DocumentService.accessible(doc_id, current_user.id):
return doc_id
return None
Comment on lines +849 to +884


def _get_doc_filters_with_request(req, dataset_id: str):
"""Get aggregated document filters with request parameters from a dataset."""
q = req.args
Expand Down Expand Up @@ -1165,6 +1214,7 @@ async def update_metadata_config(tenant_id, dataset_id, document_id):


@manager.route("/thumbnails", methods=["GET"]) # noqa: F821
@login_required
def list_thumbnails():
"""
Get thumbnails for documents.
Expand All @@ -1191,11 +1241,12 @@ def list_thumbnails():
return get_json_result(data=False, message='Lack of "Document ID"', code=RetCode.ARGUMENT_ERROR)

try:
docs = DocumentService.get_thumbnails(doc_ids)
authorized_doc_ids = [doc_id for doc_id in doc_ids if DocumentService.accessible(doc_id, current_user.id)]
docs = DocumentService.get_thumbnails(authorized_doc_ids)

for doc_item in docs:
if doc_item["thumbnail"] and not doc_item["thumbnail"].startswith(IMG_BASE64_PREFIX):
doc_item["thumbnail"] = f"/api/v1/documents/images/{doc_item['kb_id']}-{doc_item['thumbnail']}"
doc_item["thumbnail"] = _document_thumbnail_url(doc_item["id"])

return get_json_result(data={d["id"]: d["thumbnail"] for d in docs})
except Exception as e:
Expand Down Expand Up @@ -1615,7 +1666,54 @@ def _run_sync():
return get_error_data_result(message="Internal server error")


@manager.route("/documents/<doc_id>/thumbnail", methods=["GET"]) # noqa: F821
@login_required
async def get_document_thumbnail(doc_id):
"""
Get a document thumbnail by document ID.
---
tags:
- Documents
security:
- ApiKeyAuth: []
parameters:
- name: doc_id
in: path
required: true
schema:
type: string
description: The document ID.
responses:
200:
description: Thumbnail image file
content:
image/png:
schema:
type: string
format: binary
"""
try:
e, doc = DocumentService.get_by_id(doc_id)
if not e:
return get_data_error_result(message="Document not found!")

if not KnowledgebaseService.accessible(doc.kb_id, current_user.id):
logging.warning("get_document_thumbnail: access denied for doc_id=%s user_id=%s", doc_id, current_user.id)
return get_data_error_result(message="Document not found!")

if not doc.thumbnail or doc.thumbnail.startswith(IMG_BASE64_PREFIX):
return get_data_error_result(message="Image not found.")

data = await thread_pool_exec(settings.STORAGE_IMPL.get, doc.kb_id, doc.thumbnail)
response = await make_response(data)
_apply_image_response_headers(response, doc.thumbnail, "image/png")
return response
Comment thread
coderabbitai[bot] marked this conversation as resolved.
except Exception as e:
return server_error_response(e)


@manager.route("/documents/images/<image_id>", methods=["GET"]) # noqa: F821
@login_required
async def get_document_image(image_id):
"""
Get a document image by ID.
Expand All @@ -1639,13 +1737,19 @@ async def get_document_image(image_id):
format: binary
"""
try:
arr = image_id.split("-")
arr = image_id.split("-", 1)
if len(arr) != 2:
return get_data_error_result(message="Image not found.")
bkt, nm = image_id.split("-")

doc_id = _get_accessible_chunk_image_doc_id(image_id)
if not doc_id:
logging.warning("get_document_image: access denied for image_id=%s user_id=%s", image_id, current_user.id)
return get_data_error_result(message="Image not found.")

bkt, nm = arr
data = await thread_pool_exec(settings.STORAGE_IMPL.get, bkt, nm)
response = await make_response(data)
response.headers.set("Content-Type", "image/JPEG")
_apply_image_response_headers(response, nm, "image/jpeg")
return response
except Exception as e:
return server_error_response(e)
Expand Down
5 changes: 5 additions & 0 deletions test/testcases/test_web_api/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -419,6 +419,11 @@ def document_get(auth, document_id, *, headers=HEADERS, data=None):
return res


def document_thumbnail(auth, document_id, *, headers=HEADERS, data=None):
res = requests.get(url=f"{HOST_ADDRESS}/api/{VERSION}/documents/{document_id}/thumbnail", headers=headers, auth=auth, data=data)
return res


def document_download(auth, attachment_id, *, ext="markdown", headers=HEADERS, data=None):
res = requests.get(
url=f"{HOST_ADDRESS}/api/{VERSION}/documents/{attachment_id}/download",
Expand Down
Loading