Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
123 changes: 50 additions & 73 deletions api/apps/restful_apis/document_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -720,23 +720,25 @@ def list_docs(dataset_id, tenant_id):
logging.error(f"You don't own the dataset {dataset_id}. ")
return get_error_data_result(message=f"You don't own the dataset {dataset_id}. ")

err_code, err_msg, docs, total = _get_docs_with_request(request, dataset_id)
if request.args.get("type") == "filter":
err_code, err_msg, payload, total = _get_doc_filters_with_request(request, dataset_id)
if err_code != RetCode.SUCCESS:
return get_data_error_result(code=err_code, message=err_msg)
return get_json_result(data={"total": total, "filter": payload})
Comment on lines +723 to +727
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor | ⚡ Quick win

Add logs for the new type=filter flow and validation exits.

Lines 723-727 and Lines 847-860 introduce new request flow and early-return branches without any logging. Please add at least debug entry + warning/error logs on validation failures for traceability.

Suggested patch
@@
 def list_docs(dataset_id, tenant_id):
@@
-    if request.args.get("type") == "filter":
+    if request.args.get("type") == "filter":
+        logging.debug("list_docs filter mode: dataset_id=%s, args=%s", dataset_id, dict(request.args))
         err_code, err_msg, payload, total = _get_doc_filters_with_request(request, dataset_id)
         if err_code != RetCode.SUCCESS:
+            logging.warning("list_docs filter mode validation failed: dataset_id=%s, error=%s", dataset_id, err_msg)
             return get_data_error_result(code=err_code, message=err_msg)
         return get_json_result(data={"total": total, "filter": payload})
@@
 def _get_doc_filters_with_request(req, dataset_id: str):
@@
     types = q.getlist("types")
     if types:
         invalid_types = {t for t in types if t not in VALID_FILE_TYPES}
         if invalid_types:
             msg = f"Invalid filter conditions: {', '.join(invalid_types)} type{'s' if len(invalid_types) > 1 else ''}"
+            logging.warning("invalid type filters: dataset_id=%s, invalid_types=%s", dataset_id, sorted(invalid_types))
             return RetCode.DATA_ERROR, msg, {}, 0
@@
     if run_status_converted:
         invalid_status = {s for s in run_status_converted if s not in run_status_text_to_numeric.values()}
         if invalid_status:
             msg = f"Invalid filter run status conditions: {', '.join(invalid_status)}"
+            logging.warning("invalid run filters: dataset_id=%s, invalid_status=%s", dataset_id, sorted(invalid_status))
             return RetCode.DATA_ERROR, msg, {}, 0

As per coding guidelines, **/*.py: Add logging for new flows.

Also applies to: 847-860

🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@api/apps/restful_apis/document_api.py` around lines 723 - 727, Add logging
for the new "type=filter" request flow: at entry to the branch that checks
request.args.get("type") == "filter" emit a debug log (including dataset_id and
request.args) and when calling _get_doc_filters_with_request log before/after
the call; if err_code != RetCode.SUCCESS emit a warning or error log that
includes err_code and err_msg before returning get_data_error_result; also add
analogous debug and validation-failure logs around the other early-return branch
referenced (lines ~847-860) so all validation exits are logged for traceability.
Ensure logs reference the request flow and include identifiers like dataset_id
and any relevant request params, and use the existing logger used elsewhere in
document_api.py.


err_code, err_msg, payload, total = _get_docs_with_request(request, dataset_id)
if err_code != RetCode.SUCCESS:
return get_data_error_result(code=err_code, message=err_msg)

if request.args.get("type") == "filter":
docs_filter = _aggregate_filters(docs)
return get_json_result(data={"total": total, "filter": docs_filter})
else:
renamed_doc_list = [map_doc_keys(doc) for doc in docs]
for doc_item in renamed_doc_list:
if doc_item["thumbnail"] and not doc_item["thumbnail"].startswith(IMG_BASE64_PREFIX):
doc_item["thumbnail"] = f"/api/v1/documents/images/{dataset_id}-{doc_item['thumbnail']}"
if doc_item.get("source_type"):
doc_item["source_type"] = doc_item["source_type"].split("/")[0]
if doc_item["parser_config"].get("metadata"):
doc_item["parser_config"]["metadata"] = turn2jsonschema(doc_item["parser_config"]["metadata"])
return get_json_result(data={"total": total, "docs": renamed_doc_list})
renamed_doc_list = [map_doc_keys(doc) for doc in payload]
for doc_item in renamed_doc_list:
if doc_item["thumbnail"] and not doc_item["thumbnail"].startswith(IMG_BASE64_PREFIX):
doc_item["thumbnail"] = f"/api/v1/documents/images/{dataset_id}-{doc_item['thumbnail']}"
if doc_item.get("source_type"):
doc_item["source_type"] = doc_item["source_type"].split("/")[0]
if doc_item["parser_config"].get("metadata"):
doc_item["parser_config"]["metadata"] = turn2jsonschema(doc_item["parser_config"]["metadata"])
return get_json_result(data={"total": total, "docs": renamed_doc_list})


def _get_docs_with_request(req, dataset_id:str):
Expand Down Expand Up @@ -832,6 +834,40 @@ def _get_docs_with_request(req, dataset_id:str):

return RetCode.SUCCESS, "", docs, total


def _get_doc_filters_with_request(req, dataset_id: str):
    """Get aggregated document filters with request parameters from a dataset.

    Args:
        req: Flask request object. Query parameters read:
            - keywords (str): free-text filter, defaults to "".
            - suffix (multi): file-extension filters.
            - types (multi): file-type filters, validated against VALID_FILE_TYPES.
            - run (multi): run-status filters, either textual names
              (UNSTART/RUNNING/CANCEL/DONE/FAIL) or their numeric codes "0"-"4".
        dataset_id: ID of the dataset (knowledge base) to aggregate over.

    Returns:
        tuple: (ret_code, err_msg, docs_filter, total).
            On success: (RetCode.SUCCESS, "", <filter payload dict>, <total count>).
            On invalid filter input: (RetCode.DATA_ERROR, <message>, {}, 0).
    """
    q = req.args
    logging.debug("get doc filters: dataset_id=%s, args=%s", dataset_id, dict(q))

    keywords = q.get("keywords", "")

    suffix = q.getlist("suffix")

    types = q.getlist("types")
    if types:
        invalid_types = {t for t in types if t not in VALID_FILE_TYPES}
        if invalid_types:
            msg = f"Invalid filter conditions: {', '.join(invalid_types)} type{'s' if len(invalid_types) > 1 else ''}"
            logging.warning("invalid type filters: dataset_id=%s, invalid_types=%s", dataset_id, sorted(invalid_types))
            return RetCode.DATA_ERROR, msg, {}, 0

    run_status = q.getlist("run")
    # Accept both textual status names and raw numeric codes; unknown values
    # pass through unchanged and are rejected by the validation below.
    run_status_text_to_numeric = {"UNSTART": "0", "RUNNING": "1", "CANCEL": "2", "DONE": "3", "FAIL": "4"}
    run_status_converted = [run_status_text_to_numeric.get(v, v) for v in run_status]
    if run_status_converted:
        invalid_status = {s for s in run_status_converted if s not in run_status_text_to_numeric.values()}
        if invalid_status:
            msg = f"Invalid filter run status conditions: {', '.join(invalid_status)}"
            logging.warning("invalid run filters: dataset_id=%s, invalid_status=%s", dataset_id, sorted(invalid_status))
            return RetCode.DATA_ERROR, msg, {}, 0

    # NOTE(review): unlike _get_docs_with_request, this path does not honor
    # metadata_condition / id / name / create_time_* params — confirm intended.
    docs_filter, total = DocumentService.get_filter_by_kb_id(
        dataset_id,
        keywords,
        run_status_converted,
        types,
        suffix,
    )
    return RetCode.SUCCESS, "", docs_filter, total
Comment on lines +838 to +869
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major | 🏗️ Heavy lift

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
set -euo pipefail

echo "1) Inspect filter service API surface"
rg -n --type=py -C3 '\bdef\s+get_filter_by_kb_id\s*\('

echo
echo "2) Compare list-docs parsing logic vs filter parsing logic"
rg -n --type=py -C2 '_get_docs_with_request|_get_doc_filters_with_request|create_time_from|metadata_condition|return_empty_metadata|doc_ids|name=doc_name'

echo
echo "3) Check tests/usages that combine type=filter with metadata/time/id constraints"
rg -n -C2 'type=filter|metadata_condition|create_time_from|create_time_to|\\bids\\b|\\bid\\b'

Repository: infiniflow/ragflow

Length of output: 50375


🏁 Script executed:

# Find where _get_doc_filters_with_request is called from (likely in routes)
rg -n --type=py '_get_doc_filters_with_request' -B 5 -A 10

# Find the endpoint/route definition that calls this function
rg -n --type=py 'def.*filter.*\(|@.*\.route.*filter' -A 5 api/apps/restful_apis/

# Check what parameters the API endpoint actually accepts from request
rg -n --type=py 'q\.get|q\.getlist' api/apps/restful_apis/document_api.py | head -40

Repository: infiniflow/ragflow

Length of output: 5846


🏁 Script executed:

# Check if metadata_condition is parsed in _get_doc_filters_with_request
sed -n '838,870p' api/apps/restful_apis/document_api.py

# Check what _parse_doc_id_filter_with_metadata does and if it's used
rg -n '_parse_doc_id_filter_with_metadata' api/apps/restful_apis/document_api.py -A 30 | head -50

# Check tests that combine type=filter with metadata or time params
rg -n 'type=filter.*create_time|type=filter.*metadata|type=filter.*\bid\b|type=filter.*\bids\b' test/ sdk/ -r 2>/dev/null | head -20

Repository: infiniflow/ragflow

Length of output: 4298


🏁 Script executed:

# Check the full implementation of _parse_doc_id_filter_with_metadata to understand metadata handling
sed -n '871,950p' api/apps/restful_apis/document_api.py

# Check DocumentService.get_filter_by_kb_id full implementation to see if it supports additional params
sed -n '185,250p' api/db/services/document_service.py

# Check if there are any tests that use type=filter with metadata/time/id params
rg -n 'type.*filter' test/testcases/ sdk/python/test/ -A 5 -B 2 | grep -E '(type|filter|metadata|create_time|\.get|\.post)' | head -40

Repository: infiniflow/ragflow

Length of output: 7736


🏁 Script executed:

# Verify if the filter endpoint query parameters are actually accepted/documented
# by checking the route definition for the documents endpoint
rg -n '@.*\.route.*documents' api/apps/restful_apis/document_api.py -B 5 -A 20 | head -80

# Check if there are any tests that combine type=filter with metadata or time constraints
rg -n 'document_filter.*metadata|document_filter.*create_time|document_filter.*\bid\b|document_filter.*\bids\b' test/ -r -A 5 -B 5

Repository: infiniflow/ragflow

Length of output: 3198


🏁 Script executed:

# Check the list_documents endpoint signature and documentation
sed -n '703,750p' api/apps/restful_apis/document_api.py

# Search more carefully for tests using document_filter
rg -n 'document_filter' test/ -A 3 -B 3 | grep -E '(metadata|create_time|id|ids|name)' | head -30

# Check the route documentation/swagger for the documents endpoint
rg -n 'parameters:' api/apps/restful_apis/document_api.py -A 100 | grep -A 100 'def list_documents' | head -80

Repository: infiniflow/ragflow

Length of output: 5877


🏁 Script executed:

# Look for any test that calls document_filter with parameters besides the basic ones
rg -n 'document_filter.*\{' test/ -A 2 -B 1 | head -50

# Check the route documentation to see what parameters are documented as supported
sed -n '650,750p' api/apps/restful_apis/document_api.py

Repository: infiniflow/ragflow

Length of output: 5530


🏁 Script executed:

# Check if _get_doc_filters_with_request is newly added or if it's part of old code
git log -p --all -S '_get_doc_filters_with_request' -- api/apps/restful_apis/document_api.py 2>/dev/null | head -50 || echo "Git history not available"

# Check logging in the file
rg -n 'logging\.' api/apps/restful_apis/document_api.py | wc -l

# Check if there's logging in _get_doc_filters_with_request
sed -n '838,870p' api/apps/restful_apis/document_api.py | grep -i log

Repository: infiniflow/ragflow

Length of output: 65


🏁 Script executed:

# Check if there's logging specifically in _get_doc_filters_with_request and _get_docs_with_request
echo "=== Logging in _get_docs_with_request ==="
sed -n '774,835p' api/apps/restful_apis/document_api.py | grep -i log

echo
echo "=== Logging in _get_doc_filters_with_request ==="
sed -n '838,870p' api/apps/restful_apis/document_api.py | grep -i log

# Check if there's ANY logging about which path is taken
sed -n '720,730p' api/apps/restful_apis/document_api.py

Repository: infiniflow/ragflow

Length of output: 759


🏁 Script executed:

# Verify the test actually passes ids to the filter endpoint and check if it expects them to be processed
sed -n '213,220p' test/testcases/test_web_api/test_document_app/test_document_metadata.py

# Check if there are any comments in the code explaining why filter endpoint doesn't support id/ids filtering
sed -n '720,870p' api/apps/restful_apis/document_api.py | grep -i 'filter\|ignore\|note\|todo\|fixme' -A 2 -B 2

# Verify the guidelines requirement about logging
grep -r 'Add logging' .

Repository: infiniflow/ragflow

Length of output: 4908


type=filter endpoint silently ignores metadata_condition, id/ids, name, and create_time_* query parameters.

The _get_doc_filters_with_request function (lines 838–869) only processes keywords, suffix, types, and run_status, then calls DocumentService.get_filter_by_kb_id() with only these parameters. In contrast, _get_docs_with_request (lines 774–835) respects metadata filtering, document ID constraints, time range filtering, and name filters via _parse_doc_id_filter_with_metadata and direct parameter parsing. This means when a caller passes ?type=filter&ids=... or ?type=filter&create_time_from=X, those constraints are silently dropped and the filter payload returns dataset-wide aggregations instead of the scoped counts—a behavioral inconsistency with the document listing path.

Additionally, this new flow lacks logging as required by coding guidelines (**/*.py: Add logging for new flows).

🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@api/apps/restful_apis/document_api.py` around lines 838 - 869,
_get_doc_filters_with_request currently only reads keywords, suffix, types, and
run status and calls DocumentService.get_filter_by_kb_id, which causes it to
ignore metadata_condition, id/ids, name, and create_time_* filters and drop
scoping used by the document listing flow; update _get_doc_filters_with_request
to parse the same filters as _get_docs_with_request (reuse
_parse_doc_id_filter_with_metadata and the create_time/name parsing logic) and
pass the resulting id list, metadata_condition, name, and time-range parameters
into DocumentService.get_filter_by_kb_id (or extend that method if needed) so
the filter aggregation respects the caller’s constraints, and add appropriate
logging statements for the new flow consistent with the project logging
guidelines.


def _parse_doc_id_filter_with_metadata(req, kb_id):
"""Parse document ID filter based on metadata conditions from the request.

Expand Down Expand Up @@ -1053,65 +1089,6 @@ async def delete_documents(tenant_id, dataset_id):
logging.exception(e)
return get_error_data_result(message="Internal server error")


def _aggregate_filters(docs):
"""Aggregate filter options from a list of documents.

This function processes a list of document dictionaries and aggregates
available filter values for building filter UI on the client side.

Args:
docs (list): List of document dictionaries, each containing:
- id (str): Document ID
- suffix (str): File extension (e.g., "pdf", "docx")
- run (int): Parsing status code (0=UNSTART, 1=RUNNING, 2=CANCEL, 3=DONE, 4=FAIL)

Returns:
tuple: A tuple containing:
- dict: Aggregated filter options with keys:
- suffix: Dict mapping file extensions to document counts
- run_status: Dict mapping status codes to document counts
- metadata: Dict mapping metadata field names to value counts
- int: Total number of documents processed
"""
suffix_counter = {}
run_status_counter = {}
metadata_counter = {}
empty_metadata_count = 0

for doc in docs:
suffix_counter[doc.get("suffix")] = suffix_counter.get(doc.get("suffix"), 0) + 1
key_of_run = str(doc.get("run"))
run_status_counter[key_of_run] = run_status_counter.get(key_of_run, 0) + 1
meta_fields = doc.get("meta_fields", {})

if not meta_fields:
empty_metadata_count += 1
continue
has_valid_meta = False

for key, value in meta_fields.items():
values = value if isinstance(value, list) else [value]
for vv in values:
if vv is None:
continue
if isinstance(vv, str) and not vv.strip():
continue
sv = str(vv)
if key not in metadata_counter:
metadata_counter[key] = {}
metadata_counter[key][sv] = metadata_counter[key].get(sv, 0) + 1
has_valid_meta = True
if not has_valid_meta:
empty_metadata_count += 1

metadata_counter["empty_metadata"] = {"true": empty_metadata_count}
return {
"suffix": suffix_counter,
"run_status": run_status_counter,
"metadata": metadata_counter,
}

@manager.route("/datasets/<dataset_id>/documents/<document_id>/metadata/config", methods=["PUT"]) # noqa: F821
@login_required
@add_tenant_id_to_kwargs
Expand Down
Loading