diff --git a/jeecg-boot/jeecg-boot-module/jeecg-boot-module-airag/src/main/java/org/jeecg/modules/airag/llm/handler/EmbeddingHandler.java b/jeecg-boot/jeecg-boot-module/jeecg-boot-module-airag/src/main/java/org/jeecg/modules/airag/llm/handler/EmbeddingHandler.java index acf4980d16c..897043395d3 100644 --- a/jeecg-boot/jeecg-boot-module/jeecg-boot-module-airag/src/main/java/org/jeecg/modules/airag/llm/handler/EmbeddingHandler.java +++ b/jeecg-boot/jeecg-boot-module/jeecg-boot-module-airag/src/main/java/org/jeecg/modules/airag/llm/handler/EmbeddingHandler.java @@ -140,6 +140,11 @@ public class EmbeddingHandler implements IEmbeddingHandler { */ private static final Pattern PATTERN_MD_IMAGE = Pattern.compile("!\\[(.*?)]\\((.*?)\\)"); + /** + * 匹配完整 HTML 表格块,避免分段时把 table/tr/td 切断。 + */ + private static final Pattern PATTERN_HTML_TABLE = Pattern.compile("(?is)"); + /** * 向量化文档 * @@ -184,14 +189,6 @@ public Map embeddingDocument(String knowId, AiragKnowledgeDoc do EmbeddingStore embeddingStore = getEmbedStore(model); // 删除旧数据 embeddingStore.removeAll(metadataKey(EMBED_STORE_METADATA_DOCID).isEqualTo(doc.getId())); - // 分段器 - DocumentSplitter splitter = DocumentSplitters.recursive(DEFAULT_SEGMENT_SIZE, DEFAULT_OVERLAP_SIZE); - // 分段并存储 - EmbeddingStoreIngestor ingestor = EmbeddingStoreIngestor.builder() - .documentSplitter(splitter) - .embeddingModel(embeddingModel) - .embeddingStore(embeddingStore) - .build(); Metadata metadata = Metadata.metadata(EMBED_STORE_METADATA_DOCID, doc.getId()) .put(EMBED_STORE_METADATA_KNOWLEDGEID, doc.getKnowledgeId()) .put(EMBED_STORE_METADATA_DOCNAME, FilenameUtils.getName(doc.getTitle())) @@ -215,17 +212,61 @@ public Map embeddingDocument(String knowId, AiragKnowledgeDoc do } //update-end---author:wangshuai---date:2025-12-26---for:【QQYUN-14265】【AI】支持记忆--- Document from = Document.from(content, metadata); - //update-begin---author:jeecg---date:2026-02-26---for:[#9374]【AI知识库】千帆向量报错,添加异常处理防止空指针 - try { - ingestor.ingest(from); - } catch (Exception e) { - log.error("向量存储失败,请检查向量模型配置是否正确", e); - throw new JeecgBootException("向量存储失败:" + e.getMessage()); - } - //update-end---author:jeecg---date:2026-02-26---for:[#9374]【AI知识库】千帆向量报错,添加异常处理防止空指针 + List segments = splitDocumentPreservingHtmlTables(from, DEFAULT_SEGMENT_SIZE, DEFAULT_OVERLAP_SIZE); + List embeddings = embeddingModel.embedAll(segments).content(); + embeddingStore.addAll(embeddings, segments); return metadata.toMap(); } + static List splitDocumentPreservingHtmlTables(Document document, int segmentSize, int overlapSize) { + String content = document.text(); + if (oConvertUtils.isEmpty(content) || !PATTERN_HTML_TABLE.matcher(content).find()) { + return reindexSegments(DocumentSplitters.recursive(segmentSize, overlapSize).split(document), document.metadata()); + } + + List result = new ArrayList<>(); + Matcher matcher = PATTERN_HTML_TABLE.matcher(content); + int currentIndex = 0; + while (matcher.find()) { + if (matcher.start() > currentIndex) { + appendSplitText(result, content.substring(currentIndex, matcher.start()), document.metadata(), segmentSize, overlapSize); + } + appendSegment(result, matcher.group(), document.metadata()); + currentIndex = matcher.end(); + } + if (currentIndex < content.length()) { + appendSplitText(result, content.substring(currentIndex), document.metadata(), segmentSize, overlapSize); + } + return reindexSegments(result, document.metadata()); + } + + private static void appendSplitText(List result, String text, Metadata metadata, int segmentSize, int overlapSize) { + if (oConvertUtils.isEmpty(text) || text.trim().isEmpty()) { + return; + } + Document textDocument = Document.from(text, metadata.copy()); + result.addAll(DocumentSplitters.recursive(segmentSize, overlapSize).split(textDocument)); + } + + private static void appendSegment(List result, String text, Metadata metadata) { + if (oConvertUtils.isEmpty(text) || text.trim().isEmpty()) { + return; + } + result.add(TextSegment.from(text, metadata.copy())); + } + + private static List reindexSegments(List segments, Metadata baseMetadata) { + List reindexedSegments = new ArrayList<>(segments.size()); + for (int i = 0; i < segments.size(); i++) { + TextSegment segment = segments.get(i); + Metadata metadata = baseMetadata.copy(); + metadata.putAll(segment.metadata().toMap()); + metadata.put("index", i); + reindexedSegments.add(TextSegment.from(segment.text(), metadata)); + } + return reindexedSegments; + } + /** * 向量查询(多知识库) * @@ -548,8 +589,11 @@ private EmbeddingStore getEmbedStore(AiragModel model) { * @date 2025/3/11 17:45 */ public static AiModelOptions buildModelOptions(AiragModel model) { + // OneAPI 网关统一返回 OpenAPI 格式,映射为 OPENAI 客户端 + String provider = LLMConsts.MODEL_PROVIDER_ONEAPI.equals(model.getProvider()) + ? LLMConsts.MODEL_PROVIDER_OPENAI : model.getProvider(); AiModelOptions.AiModelOptionsBuilder modelOpBuilder = AiModelOptions.builder() - .provider(model.getProvider()) + .provider(provider) .modelName(model.getModelName()) .baseUrl(model.getBaseUrl()); if (oConvertUtils.isObjectNotEmpty(model.getCredential())) { diff --git a/jeecg-boot/jeecg-boot-module/jeecg-boot-module-airag/src/main/java/org/jeecg/modules/airag/llm/service/impl/AiragKnowledgeDocServiceImpl.java b/jeecg-boot/jeecg-boot-module/jeecg-boot-module-airag/src/main/java/org/jeecg/modules/airag/llm/service/impl/AiragKnowledgeDocServiceImpl.java index be75bf62b7b..0874a5b9ea0 100644 --- a/jeecg-boot/jeecg-boot-module/jeecg-boot-module-airag/src/main/java/org/jeecg/modules/airag/llm/service/impl/AiragKnowledgeDocServiceImpl.java +++ b/jeecg-boot/jeecg-boot-module/jeecg-boot-module-airag/src/main/java/org/jeecg/modules/airag/llm/service/impl/AiragKnowledgeDocServiceImpl.java @@ -398,6 +398,11 @@ private static void unzipFile(Path zipFilePath, Path targetDir, Consumer a throw new IOException("解压文件数量超限,可能是zip bomb攻击"); } + if (shouldSkipZipEntry(entry.getName())) { + log.info("跳过压缩包中的隐藏文件: {}", entry.getName()); + continue; + } + Path newPath = safeResolve(targetDir, entry.getName()); if (entry.isDirectory()) { @@ -424,6 +429,21 @@ private static void unzipFile(Path zipFilePath, Path targetDir, Consumer a } } + /** + * 过滤压缩包中的系统隐藏文件,例如 macOS 自动生成的 __MACOSX 和 ._ 文件。 + */ + static boolean shouldSkipZipEntry(String entryName) { + if (oConvertUtils.isEmpty(entryName)) { + return true; + } + String normalizedName = entryName.replace("\\", "/"); + if (normalizedName.startsWith("__MACOSX/")) { + return true; + } + String fileName = Paths.get(normalizedName).getFileName().toString(); + return fileName.startsWith("._") || fileName.equals(".DS_Store"); + } + /** * 安全解析路径,防止Zip Slip攻击 *