diff --git a/jeecg-boot/jeecg-boot-module/jeecg-boot-module-airag/src/main/java/org/jeecg/modules/airag/llm/handler/EmbeddingHandler.java b/jeecg-boot/jeecg-boot-module/jeecg-boot-module-airag/src/main/java/org/jeecg/modules/airag/llm/handler/EmbeddingHandler.java
index acf4980d16c..897043395d3 100644
--- a/jeecg-boot/jeecg-boot-module/jeecg-boot-module-airag/src/main/java/org/jeecg/modules/airag/llm/handler/EmbeddingHandler.java
+++ b/jeecg-boot/jeecg-boot-module/jeecg-boot-module-airag/src/main/java/org/jeecg/modules/airag/llm/handler/EmbeddingHandler.java
@@ -140,6 +140,11 @@ public class EmbeddingHandler implements IEmbeddingHandler {
*/
private static final Pattern PATTERN_MD_IMAGE = Pattern.compile("!\\[(.*?)]\\((.*?)\\)");
+    /**
+     * Matches one complete HTML table block ({@code <table ...> ... </table>}) so that
+     * segmentation can keep it in one piece instead of cutting through table/tr/td markup.
+     * Flags: {@code i} makes the tag names case-insensitive, {@code s} (DOTALL) lets a
+     * table span several lines. The quantifier is reluctant, so every match ends at the
+     * first closing tag; nested tables are therefore not captured as a single block.
+     * NOTE(review): the pattern body was missing from this copy of the patch (the string
+     * literal was empty and broken across two lines, which does not compile); it has been
+     * reconstructed from the description above - confirm against the upstream commit.
+     */
+    private static final Pattern PATTERN_HTML_TABLE = Pattern.compile("(?is)<table\\b[^>]*>.*?</table>");
+
/**
* 向量化文档
*
@@ -184,14 +189,6 @@ public Map embeddingDocument(String knowId, AiragKnowledgeDoc do
EmbeddingStore embeddingStore = getEmbedStore(model);
// 删除旧数据
embeddingStore.removeAll(metadataKey(EMBED_STORE_METADATA_DOCID).isEqualTo(doc.getId()));
- // 分段器
- DocumentSplitter splitter = DocumentSplitters.recursive(DEFAULT_SEGMENT_SIZE, DEFAULT_OVERLAP_SIZE);
- // 分段并存储
- EmbeddingStoreIngestor ingestor = EmbeddingStoreIngestor.builder()
- .documentSplitter(splitter)
- .embeddingModel(embeddingModel)
- .embeddingStore(embeddingStore)
- .build();
Metadata metadata = Metadata.metadata(EMBED_STORE_METADATA_DOCID, doc.getId())
.put(EMBED_STORE_METADATA_KNOWLEDGEID, doc.getKnowledgeId())
.put(EMBED_STORE_METADATA_DOCNAME, FilenameUtils.getName(doc.getTitle()))
@@ -215,17 +212,61 @@ public Map embeddingDocument(String knowId, AiragKnowledgeDoc do
}
//update-end---author:wangshuai---date:2025-12-26---for:【QQYUN-14265】【AI】支持记忆---
Document from = Document.from(content, metadata);
- //update-begin---author:jeecg---date:2026-02-26---for:[#9374]【AI知识库】千帆向量报错,添加异常处理防止空指针
- try {
- ingestor.ingest(from);
- } catch (Exception e) {
- log.error("向量存储失败,请检查向量模型配置是否正确", e);
- throw new JeecgBootException("向量存储失败:" + e.getMessage());
- }
- //update-end---author:jeecg---date:2026-02-26---for:[#9374]【AI知识库】千帆向量报错,添加异常处理防止空指针
+ List segments = splitDocumentPreservingHtmlTables(from, DEFAULT_SEGMENT_SIZE, DEFAULT_OVERLAP_SIZE);
+ List embeddings = embeddingModel.embedAll(segments).content();
+ embeddingStore.addAll(embeddings, segments);
return metadata.toMap();
}
+    /**
+     * Splits a document into text segments while keeping every HTML table block intact.
+     * <p>
+     * Text outside of tables is split with the recursive splitter; each block matched by
+     * {@code PATTERN_HTML_TABLE} becomes exactly one segment. When the document contains no
+     * table at all, the whole document is handed to the recursive splitter directly.
+     * In both cases the segments are passed through {@code reindexSegments} before returning.
+     *
+     * @param document    document to split
+     * @param segmentSize maximum segment size passed to the recursive splitter
+     * @param overlapSize overlap size passed to the recursive splitter
+     * @return the segments in document order
+     */
+    static List<TextSegment> splitDocumentPreservingHtmlTables(Document document, int segmentSize, int overlapSize) {
+        String content = document.text();
+        Metadata baseMetadata = document.metadata();
+        // One matcher serves both the "is there any table?" check and the iteration below,
+        // so the content is scanned only once.
+        Matcher tableMatcher = oConvertUtils.isEmpty(content) ? null : PATTERN_HTML_TABLE.matcher(content);
+        if (tableMatcher == null || !tableMatcher.find()) {
+            return reindexSegments(DocumentSplitters.recursive(segmentSize, overlapSize).split(document), baseMetadata);
+        }
+
+        List<TextSegment> result = new ArrayList<>();
+        int currentIndex = 0;
+        do {
+            // Plain text between the previous table (or the document start) and this table.
+            if (tableMatcher.start() > currentIndex) {
+                appendSplitText(result, content.substring(currentIndex, tableMatcher.start()), baseMetadata, segmentSize, overlapSize);
+            }
+            // NOTE(review): a table is never split, so a table longer than segmentSize yields an
+            // oversized segment - confirm the embedding model accepts such input.
+            appendSegment(result, tableMatcher.group(), baseMetadata);
+            currentIndex = tableMatcher.end();
+        } while (tableMatcher.find());
+        // Trailing text after the last table.
+        if (currentIndex < content.length()) {
+            appendSplitText(result, content.substring(currentIndex), baseMetadata, segmentSize, overlapSize);
+        }
+        return reindexSegments(result, baseMetadata);
+    }
+
+    /**
+     * Splits a plain-text fragment with the recursive splitter and appends the resulting
+     * segments to {@code result}. Empty or whitespace-only fragments are ignored.
+     *
+     * @param result      list that receives the new segments
+     * @param text        fragment to split
+     * @param metadata    metadata to attach; a copy is handed to the fragment document
+     * @param segmentSize maximum segment size passed to the recursive splitter
+     * @param overlapSize overlap size passed to the recursive splitter
+     */
+    private static void appendSplitText(List<TextSegment> result, String text, Metadata metadata, int segmentSize, int overlapSize) {
+        boolean nothingToSplit = oConvertUtils.isEmpty(text) || text.trim().isEmpty();
+        if (nothingToSplit) {
+            return;
+        }
+        // The fragment gets its own Metadata instance obtained via copy().
+        Document fragment = Document.from(text, metadata.copy());
+        result.addAll(DocumentSplitters.recursive(segmentSize, overlapSize).split(fragment));
+    }
+
+    /**
+     * Appends {@code text} to {@code result} as a single, unsplit segment.
+     * Empty or whitespace-only text is ignored.
+     *
+     * @param result   list that receives the new segment
+     * @param text     segment text, kept exactly as given
+     * @param metadata metadata to attach; a copy is handed to the segment
+     */
+    private static void appendSegment(List<TextSegment> result, String text, Metadata metadata) {
+        boolean nothingToAdd = oConvertUtils.isEmpty(text) || text.trim().isEmpty();
+        if (nothingToAdd) {
+            return;
+        }
+        result.add(TextSegment.from(text, metadata.copy()));
+    }
+
+    /**
+     * Rebuilds every segment with merged metadata and a consecutive {@code "index"} entry.
+     * <p>
+     * For each segment the metadata starts as a copy of {@code baseMetadata}, is then
+     * overlaid with the segment's own entries, and finally receives its position in the
+     * list under the key {@code "index"} (overwriting any index the segment already carried).
+     *
+     * @param segments     segments in their final order
+     * @param baseMetadata metadata every segment should carry
+     * @return new segments with the same text and the merged, re-indexed metadata
+     */
+    private static List<TextSegment> reindexSegments(List<TextSegment> segments, Metadata baseMetadata) {
+        List<TextSegment> reindexedSegments = new ArrayList<>(segments.size());
+        int position = 0;
+        for (TextSegment segment : segments) {
+            Metadata merged = baseMetadata.copy();
+            merged.putAll(segment.metadata().toMap());
+            // Store the index as a String: the langchain4j splitters write "index" as a String,
+            // so this keeps the stored metadata type unchanged from the previous ingestor path.
+            // NOTE(review): confirm no consumer relies on an Integer value here.
+            merged.put("index", String.valueOf(position++));
+            reindexedSegments.add(TextSegment.from(segment.text(), merged));
+        }
+        return reindexedSegments;
+    }
+
/**
* 向量查询(多知识库)
*
@@ -548,8 +589,11 @@ private EmbeddingStore getEmbedStore(AiragModel model) {
* @date 2025/3/11 17:45
*/
public static AiModelOptions buildModelOptions(AiragModel model) {
+ // OneAPI 网关统一返回 OpenAI 兼容格式,映射为 OPENAI 客户端
+ String provider = LLMConsts.MODEL_PROVIDER_ONEAPI.equals(model.getProvider())
+ ? LLMConsts.MODEL_PROVIDER_OPENAI : model.getProvider();
AiModelOptions.AiModelOptionsBuilder modelOpBuilder = AiModelOptions.builder()
- .provider(model.getProvider())
+ .provider(provider)
.modelName(model.getModelName())
.baseUrl(model.getBaseUrl());
if (oConvertUtils.isObjectNotEmpty(model.getCredential())) {
diff --git a/jeecg-boot/jeecg-boot-module/jeecg-boot-module-airag/src/main/java/org/jeecg/modules/airag/llm/service/impl/AiragKnowledgeDocServiceImpl.java b/jeecg-boot/jeecg-boot-module/jeecg-boot-module-airag/src/main/java/org/jeecg/modules/airag/llm/service/impl/AiragKnowledgeDocServiceImpl.java
index be75bf62b7b..0874a5b9ea0 100644
--- a/jeecg-boot/jeecg-boot-module/jeecg-boot-module-airag/src/main/java/org/jeecg/modules/airag/llm/service/impl/AiragKnowledgeDocServiceImpl.java
+++ b/jeecg-boot/jeecg-boot-module/jeecg-boot-module-airag/src/main/java/org/jeecg/modules/airag/llm/service/impl/AiragKnowledgeDocServiceImpl.java
@@ -398,6 +398,11 @@ private static void unzipFile(Path zipFilePath, Path targetDir, Consumer a
throw new IOException("解压文件数量超限,可能是zip bomb攻击");
}
+ if (shouldSkipZipEntry(entry.getName())) {
+ log.info("跳过压缩包中的隐藏文件: {}", entry.getName());
+ continue;
+ }
+
Path newPath = safeResolve(targetDir, entry.getName());
if (entry.isDirectory()) {
@@ -424,6 +429,21 @@ private static void unzipFile(Path zipFilePath, Path targetDir, Consumer a
}
}
+    /**
+     * Decides whether a zip entry is a system-generated hidden file that should not be
+     * extracted, e.g. the {@code __MACOSX} folder and {@code ._} files created by macOS,
+     * or {@code .DS_Store}.
+     * <p>
+     * The last path segment is extracted with plain string operations rather than
+     * {@code Paths.get(...).getFileName()}: entry names come from an untrusted archive, and
+     * the path API returns {@code null} for a root-only name such as {@code "/"} and may
+     * throw {@code InvalidPathException} for characters the platform does not allow.
+     *
+     * @param entryName entry name as stored in the archive; both {@code /} and {@code \} separators are accepted
+     * @return {@code true} if the entry should be skipped (also for an empty name)
+     */
+    static boolean shouldSkipZipEntry(String entryName) {
+        if (oConvertUtils.isEmpty(entryName)) {
+            return true;
+        }
+        String normalizedName = entryName.replace('\\', '/');
+        if (normalizedName.startsWith("__MACOSX/")) {
+            return true;
+        }
+        // Ignore trailing separators so a directory entry such as "dir/" resolves to "dir".
+        int end = normalizedName.length();
+        while (end > 0 && normalizedName.charAt(end - 1) == '/') {
+            end--;
+        }
+        // lastIndexOf yields -1 when there is no separator, which makes the segment start at 0.
+        String fileName = normalizedName.substring(normalizedName.lastIndexOf('/', end - 1) + 1, end);
+        return fileName.startsWith("._") || fileName.equals(".DS_Store");
+    }
+
/**
* 安全解析路径,防止Zip Slip攻击
*