Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,11 @@ public class EmbeddingHandler implements IEmbeddingHandler {
*/
private static final Pattern PATTERN_MD_IMAGE = Pattern.compile("!\\[(.*?)]\\((.*?)\\)");

/**
 * Matches one complete HTML table block so the splitter never cuts a
 * table/tr/td in half. Flags: (?i) case-insensitive, (?s) DOTALL so the
 * table may span multiple lines; \b prevents matching tags like &lt;tablex&gt;.
 * NOTE(review): the reluctant .*? stops at the FIRST closing tag, so a
 * nested &lt;table&gt; would be terminated at the inner &lt;/table&gt; — confirm
 * nested tables do not occur in ingested documents.
 */
private static final Pattern PATTERN_HTML_TABLE = Pattern.compile("(?is)<table\\b.*?</table>");

/**
* 向量化文档
*
Expand Down Expand Up @@ -184,14 +189,6 @@ public Map<String, Object> embeddingDocument(String knowId, AiragKnowledgeDoc do
EmbeddingStore<TextSegment> embeddingStore = getEmbedStore(model);
// 删除旧数据
embeddingStore.removeAll(metadataKey(EMBED_STORE_METADATA_DOCID).isEqualTo(doc.getId()));
// 分段器
DocumentSplitter splitter = DocumentSplitters.recursive(DEFAULT_SEGMENT_SIZE, DEFAULT_OVERLAP_SIZE);
// 分段并存储
EmbeddingStoreIngestor ingestor = EmbeddingStoreIngestor.builder()
.documentSplitter(splitter)
.embeddingModel(embeddingModel)
.embeddingStore(embeddingStore)
.build();
Metadata metadata = Metadata.metadata(EMBED_STORE_METADATA_DOCID, doc.getId())
.put(EMBED_STORE_METADATA_KNOWLEDGEID, doc.getKnowledgeId())
.put(EMBED_STORE_METADATA_DOCNAME, FilenameUtils.getName(doc.getTitle()))
Expand All @@ -215,17 +212,61 @@ public Map<String, Object> embeddingDocument(String knowId, AiragKnowledgeDoc do
}
//update-end---author:wangshuai---date:2025-12-26---for:【QQYUN-14265】【AI】支持记忆---
Document from = Document.from(content, metadata);
//update-begin---author:jeecg---date:2026-02-26---for:[#9374]【AI知识库】千帆向量报错,添加异常处理防止空指针
try {
ingestor.ingest(from);
} catch (Exception e) {
log.error("向量存储失败,请检查向量模型配置是否正确", e);
throw new JeecgBootException("向量存储失败:" + e.getMessage());
}
//update-end---author:jeecg---date:2026-02-26---for:[#9374]【AI知识库】千帆向量报错,添加异常处理防止空指针
List<TextSegment> segments = splitDocumentPreservingHtmlTables(from, DEFAULT_SEGMENT_SIZE, DEFAULT_OVERLAP_SIZE);
List<Embedding> embeddings = embeddingModel.embedAll(segments).content();
embeddingStore.addAll(embeddings, segments);
return metadata.toMap();
}

/**
 * Splits a document into segments while keeping every complete HTML
 * {@code <table>} block intact as a single segment, so the recursive
 * splitter never cuts a table apart.
 *
 * @param document    source document (text plus base metadata)
 * @param segmentSize maximum segment size handed to the recursive splitter
 * @param overlapSize overlap size handed to the recursive splitter
 * @return segments covering the whole text, re-indexed with a fresh "index"
 */
static List<TextSegment> splitDocumentPreservingHtmlTables(Document document, int segmentSize, int overlapSize) {
    String content = document.text();
    Matcher matcher = oConvertUtils.isEmpty(content) ? null : PATTERN_HTML_TABLE.matcher(content);
    // No content or no table: a plain recursive split is sufficient.
    if (matcher == null || !matcher.find()) {
        return reindexSegments(DocumentSplitters.recursive(segmentSize, overlapSize).split(document), document.metadata());
    }

    List<TextSegment> result = new ArrayList<>();
    int currentIndex = 0;
    // The first match was already consumed by find() above, so a do/while
    // avoids scanning the content with the regex a second time.
    do {
        // Text before the table goes through the normal recursive splitter.
        if (matcher.start() > currentIndex) {
            appendSplitText(result, content.substring(currentIndex, matcher.start()), document.metadata(), segmentSize, overlapSize);
        }
        // The table itself is emitted as one unsplit segment.
        appendSegment(result, matcher.group(), document.metadata());
        currentIndex = matcher.end();
    } while (matcher.find());
    // Trailing text after the last table.
    if (currentIndex < content.length()) {
        appendSplitText(result, content.substring(currentIndex), document.metadata(), segmentSize, overlapSize);
    }
    return reindexSegments(result, document.metadata());
}

/**
 * Recursively splits a plain-text fragment and appends the resulting
 * segments to {@code result}. Null or whitespace-only fragments are ignored.
 */
private static void appendSplitText(List<TextSegment> result, String text, Metadata metadata, int segmentSize, int overlapSize) {
    boolean blank = oConvertUtils.isEmpty(text) || text.trim().isEmpty();
    if (blank) {
        return;
    }
    // Copy the metadata so each fragment document owns its own instance.
    result.addAll(DocumentSplitters.recursive(segmentSize, overlapSize).split(Document.from(text, metadata.copy())));
}

/**
 * Appends one unsplit text segment (e.g. a whole HTML table) to
 * {@code result}, skipping null or whitespace-only input.
 */
private static void appendSegment(List<TextSegment> result, String text, Metadata metadata) {
    if (!oConvertUtils.isEmpty(text) && !text.trim().isEmpty()) {
        // Copy the metadata so the segment owns its own instance.
        result.add(TextSegment.from(text, metadata.copy()));
    }
}

/**
 * Rebuilds every segment with the document's base metadata merged under
 * the segment's own metadata, then assigns a consecutive "index" so the
 * final ordering is explicit regardless of how the segments were produced.
 */
private static List<TextSegment> reindexSegments(List<TextSegment> segments, Metadata baseMetadata) {
    List<TextSegment> out = new ArrayList<>(segments.size());
    int index = 0;
    for (TextSegment segment : segments) {
        // Segment-level metadata wins over the document-level base metadata.
        Metadata merged = baseMetadata.copy();
        merged.putAll(segment.metadata().toMap());
        merged.put("index", index++);
        out.add(TextSegment.from(segment.text(), merged));
    }
    return out;
}

/**
* 向量查询(多知识库)
*
Expand Down Expand Up @@ -548,8 +589,11 @@ private EmbeddingStore<TextSegment> getEmbedStore(AiragModel model) {
* @date 2025/3/11 17:45
*/
public static AiModelOptions buildModelOptions(AiragModel model) {
// OneAPI 网关统一返回 OpenAPI 格式,映射为 OPENAI 客户端
String provider = LLMConsts.MODEL_PROVIDER_ONEAPI.equals(model.getProvider())
? LLMConsts.MODEL_PROVIDER_OPENAI : model.getProvider();
AiModelOptions.AiModelOptionsBuilder modelOpBuilder = AiModelOptions.builder()
.provider(model.getProvider())
.provider(provider)
.modelName(model.getModelName())
.baseUrl(model.getBaseUrl());
if (oConvertUtils.isObjectNotEmpty(model.getCredential())) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -398,6 +398,11 @@ private static void unzipFile(Path zipFilePath, Path targetDir, Consumer<File> a
throw new IOException("解压文件数量超限,可能是zip bomb攻击");
}

if (shouldSkipZipEntry(entry.getName())) {
log.info("跳过压缩包中的隐藏文件: {}", entry.getName());
continue;
}

Path newPath = safeResolve(targetDir, entry.getName());

if (entry.isDirectory()) {
Expand All @@ -424,6 +429,21 @@ private static void unzipFile(Path zipFilePath, Path targetDir, Consumer<File> a
}
}

/**
 * Filters out OS-generated hidden files inside an archive, such as the
 * {@code __MACOSX} directory, AppleDouble {@code ._*} resource forks, and
 * {@code .DS_Store} files produced by macOS.
 * <p>
 * The file name is extracted manually instead of via {@code Paths.get(...)}:
 * zip entry names are untrusted input and may contain characters that are
 * illegal on the host platform (throwing {@code InvalidPathException}), and
 * {@code Paths.get("/").getFileName()} returns {@code null}, which would NPE.
 *
 * @param entryName raw zip entry name (may use '\' or '/' separators)
 * @return true if the entry should be skipped during extraction
 */
static boolean shouldSkipZipEntry(String entryName) {
    if (oConvertUtils.isEmpty(entryName)) {
        return true;
    }
    String normalizedName = entryName.replace('\\', '/');
    if (normalizedName.equals("__MACOSX") || normalizedName.startsWith("__MACOSX/")) {
        return true;
    }
    // Strip a trailing slash so directory entries are judged by their own name.
    String trimmed = normalizedName.endsWith("/")
            ? normalizedName.substring(0, normalizedName.length() - 1)
            : normalizedName;
    int lastSlash = trimmed.lastIndexOf('/');
    String fileName = lastSlash >= 0 ? trimmed.substring(lastSlash + 1) : trimmed;
    return fileName.startsWith("._") || fileName.equals(".DS_Store");
}

/**
* 安全解析路径,防止Zip Slip攻击
*
Expand Down