diff --git a/.claude/CLAUDE.md b/.claude/CLAUDE.md new file mode 100644 index 0000000000..5382f72554 --- /dev/null +++ b/.claude/CLAUDE.md @@ -0,0 +1,65 @@ + + + +# oh-my-claudecode - Intelligent Multi-Agent Orchestration + +You are running with oh-my-claudecode (OMC), a multi-agent orchestration layer for Claude Code. +Coordinate specialized agents, tools, and skills so work is completed accurately and efficiently. + + +- Delegate specialized work to the most appropriate agent. +- Prefer evidence over assumptions: verify outcomes before final claims. +- Choose the lightest-weight path that preserves quality. +- Consult official docs before implementing with SDKs/frameworks/APIs. + + + +Delegate for: multi-file changes, refactors, debugging, reviews, planning, research, verification. +Work directly for: trivial ops, small clarifications, single commands. +Route code to `executor` (use `model=opus` for complex work). Uncertain SDK usage → `document-specialist` (repo docs first; Context Hub / `chub` when available, graceful web fallback otherwise). + + + +`haiku` (quick lookups), `sonnet` (standard), `opus` (architecture, deep analysis). +Direct writes OK for: `~/.claude/**`, `.omc/**`, `.claude/**`, `CLAUDE.md`, `AGENTS.md`. + + + +Invoke via `/oh-my-claudecode:`. Trigger patterns auto-detect keywords. +Tier-0 workflows include `autopilot`, `ultrawork`, `ralph`, `team`, and `ralplan`. +Keyword triggers: `"autopilot"→autopilot`, `"ralph"→ralph`, `"ulw"→ultrawork`, `"ccg"→ccg`, `"ralplan"→ralplan`, `"deep interview"→deep-interview`, `"deslop"`/`"anti-slop"`→ai-slop-cleaner, `"deep-analyze"`→analysis mode, `"tdd"`→TDD mode, `"deepsearch"`→codebase search, `"ultrathink"`→deep reasoning, `"cancelomc"`→cancel. +Team orchestration is explicit via `/team`. +Detailed agent catalog, tools, team pipeline, commit protocol, and full skills registry live in the native `omc-reference` skill when skills are available, including reference for `explore`, `planner`, `architect`, `executor`, `designer`, and `writer`; this file remains sufficient without skill support. + + + +Verify before claiming completion. Size appropriately: small→haiku, standard→sonnet, large/security→opus. +If verification fails, keep iterating. + + + +Broad requests: explore first, then plan. 2+ independent tasks in parallel. `run_in_background` for builds/tests. +Keep authoring and review as separate passes: writer pass creates or revises content, reviewer/verifier pass evaluates it later in a separate lane. +Never self-approve in the same active context; use `code-reviewer` or `verifier` for the approval pass. +Before concluding: zero pending tasks, tests passing, verifier evidence collected. + + + +Hooks inject `` tags. Key patterns: `hook success: Success` (proceed), `[MAGIC KEYWORD: ...]` (invoke skill), `The boulder never stops` (ralph/ultrawork active). +Persistence: `` (7 days), `` (permanent). +Kill switches: `DISABLE_OMC`, `OMC_SKIP_HOOKS` (comma-separated). + + + +`/oh-my-claudecode:cancel` ends execution modes. Cancel when done+verified or blocked. Don't cancel if work incomplete. + + + +State: `.omc/state/`, `.omc/state/sessions/{sessionId}/`, `.omc/notepad.md`, `.omc/project-memory.json`, `.omc/plans/`, `.omc/research/`, `.omc/logs/` + + +## Setup + +Say "setup omc" or run `/oh-my-claudecode:omc-setup`. 
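+A minimal sketch of the kill switches mentioned above, assuming both are read as environment variables; the hook names passed to `OMC_SKIP_HOOKS` are illustrative placeholders, not a documented list:
+
+```bash
+# Turn the orchestration layer off entirely for one run (sketch; assumes env-var kill switches).
+DISABLE_OMC=1 claude
+
+# Skip selected hooks by name, comma-separated; these hook names are hypothetical.
+OMC_SKIP_HOOKS=session-persistence,keyword-triggers claude
+```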
+ + diff --git a/.claude/rules/project-knowledge/config-pitfalls.mdc b/.claude/rules/project-knowledge/config-pitfalls.mdc new file mode 100644 index 0000000000..2aa3c16b0c --- /dev/null +++ b/.claude/rules/project-knowledge/config-pitfalls.mdc @@ -0,0 +1,41 @@ +--- +description: LoongCollector 采集配置常见陷阱。编写或审查 pipeline config YAML 时参考。 +globs: + - "**/*.feature" + - "**/case.feature" + - "core/config/**" + - "test/e2e/**" +alwaysApply: false +--- +# LoongCollector 采集配置陷阱 + +## ExcutionTimeout 使配置变为一次性(onetime) + +`global.ExcutionTimeout` 存在于配置中时,**整个配置**被标记为 onetime 类型。 +只有注册了 `RegisterOnetimeInputCreator` 的插件才能在 onetime 配置中使用。 + +大部分输入插件(`input_forward`, `input_file`, `input_container_stdio`, `input_prometheus` 等)只注册了 `RegisterContinuousInputCreator`,在 onetime 配置中会报错: + +``` +failed to parse config:unsupported input plugin module:input_forward +``` + +### 判断逻辑 + +``` +global.ExcutionTimeout 存在 + → PipelineConfig::GetExpireTimeIfOneTime → mOnetimeExpireTime 被设置 + → CollectionConfig::IsOnetime() == true + → IsValidNativeInputPlugin(name, true) 在 ONETIME 注册表查找 + → 找不到 → "unsupported input plugin" +``` + +### 支持 onetime 的输入插件 + +查看 `PluginRegistry::LoadStaticPlugins()` 中调用 `RegisterOnetimeInputCreator` 的插件,如 `InputStaticFile`。 + +### 规则 + +- **持续运行的输入插件配置中不要使用 `ExcutionTimeout`** +- E2E 测试不需要 `ExcutionTimeout` 来控制超时,Go test 的 `-timeout` 参数已经提供了保护 +- 如果确实需要一次性采集,使用 `onetime_pipeline_config` 目录 + 支持 onetime 的输入插件 diff --git a/.claude/settings.json b/.claude/settings.json new file mode 100644 index 0000000000..c6802dc44e --- /dev/null +++ b/.claude/settings.json @@ -0,0 +1,5 @@ +{ + "enabledPlugins": { + "oh-my-claudecode@omc": true + } +} diff --git a/.claude/skills/code-review/SKILL.md b/.claude/skills/code-review/SKILL.md new file mode 100644 index 0000000000..76aca88ac1 --- /dev/null +++ b/.claude/skills/code-review/SKILL.md @@ -0,0 +1,451 @@ +--- +name: code-review +description: 在进行 Code Review 时,使用这个技能对 LoongCollector 变更进行安全导向、架构一致性优先的深度代码评审。 +metadata: + requires: + bins: + - python3 + - git + - gh +--- +# Code Review Agent Skill + +你是 LoongCollector 项目的高级代码审查助手。你的核心目标是发现真实缺陷、行为回归和风险点,而不是给出泛泛建议。 + +为避免假阳性,必须遵守: + +- 分析问题时必须包含充分上下文,不能只看局部 diff 就下结论。 +- 结论必须基于实际读取到的代码与变更,不允许基于记忆或猜测。 +- 先理解作者意图和端到端流程,再给出问题判断。 +- 遵循以下执行步骤,以实现代码修改后可以针对增量 Review,检查既有评审的修复情况。 + +## TOC + +- [Preflight(确保依赖工具存在)](#preflight确保依赖工具存在) +- [Local Branch Sync(确保代码新鲜)](#local-branch-sync确保代码新鲜) +- [Review Plan(开始前规划,避免遗漏)](#review-plan开始前规划避免遗漏) +- [脚本失败降级策略](#脚本失败降级策略) +- [Phase 1: Review Workspace & Incremental State(评审工作区与增量状态)](#phase-1-review-workspace--incremental-state评审工作区与增量状态) +- [Phase 2: Context Building(全局认知)](#phase-2-context-building全局认知) +- [Phase 3: Intent Analysis(意图理解)](#phase-3-intent-analysis意图理解) +- [牢记评估标准(无需输出)](#牢记评估标准无需输出) +- [Phase 4: Sub-agent Review(专项检查)](#phase-4-sub-agent-review专项检查) +- [Phase 5: Final Report(最终输出)](#phase-5-final-report最终输出) + +## Preflight(确保依赖工具存在) + +在进入 Phase 1 前,必须先执行以下命令并全部通过: + +- `python3 --version` +- `git rev-parse --is-inside-work-tree` +- `gh auth status` + +若任一命令失败,必须停止后续评审步骤,并按 `references/failure-playbook.md` 修复后重试。 + +## Local Branch Sync(确保代码新鲜) + +当复用本地 PR 分支做评审时,请在正式评审前先同步一次代码,避免使用过期工作副本: + +1. 读取远程 PR 当前 `headRefOid`(或分支当前 `HEAD` SHA)。 +2. 对应本地分支执行同步(如 `git fetch` + `git pull --ff-only` 或等价流程)。 +3. 在 `final-report.md` 顶部记录本轮评审使用的 `head` SHA,便于追溯。 + +## Review Plan(开始前规划,避免遗漏) + +在进入 Phase 1 细节步骤前,先在评审目录生成并维护 `review-plan.md`,用于“逐步执行 + 勾选校验”: + +1. 文件路径: + - PR:`code-review/pr-/review-plan.md` + - 分支:`code-review/branch-/review-plan.md` +2. 
至少包含: + - 本轮评审对象(PR/分支、base/head SHA) + - 本轮待办清单(checkbox),按“**大项 + 子项**”拆分 + - 当前阶段标记(`in_progress`) + - 阻塞项与降级记录(若有) +3. 执行要求: + - 每完成一个步骤,必须同步勾选; + - 若中断或切换策略(如 `incremental -> full`),必须先更新计划再继续。 + - 不允许只写 Phase 名称而不拆子项(例如“Phase 1”必须细分到拉评论、更新状态、映射决策等子项)。 +4. 模板使用: + - `references/review-plan.template.md` 仅提供骨架; + - agent 必须根据本轮实际情况自行填写大项与子项。 + +## 脚本失败降级策略 + +若执行脚本报错,允许进入降级评审模式继续完成代码评审,但必须执行以下动作用于持续优化 skill: + +- 在 `code-review//script-failures.md` 记录失败信息(脚本名、命令、错误摘要、触发时间、回退策略)。 +- 评审继续时一律切换到 `full` 全量评审,并人工核对关键状态文件。 +- 在 `final-report.md` 增加 “Script Failure Feedback” 小节,说明失败影响范围与人工补偿动作。 +- 将失败信息反馈到技能维护通道(可用时使用 `mcp-feedback-enhanced`,不可用时至少落盘到 `script-failures.md` 供后续回收)。 + +## Phase 1: Review Workspace & Incremental State(评审工作区与增量状态) + +开始评审前,先初始化或复用仓库根目录下的评审工作区: + +- PR 评审目录:`code-review/pr-/` +- 分支评审目录:`code-review/branch-/` +- 目录不存在时必须创建,且保留历史评审轮次 + +该目录至少包含以下文件: + +- `meta.json`:评审对象与基线元数据(repo、base/head、review 时间、策略参数) +- `review-plan.md`:本轮执行计划与勾选进度(先计划再执行) +- `reviewed_commits.json`:已评审 commit 集合与映射记录 +- `intent-architecture-notes.md`:代码理解文档(Phase 3) +- `final-report.md`:最终报告(Phase 5) +- `comments/review-comments.json`:PR review comments 原始快照(仅此来源) +- `comments/comment-status.json`:评论状态判定结果(流程状态 + 技术状态) + +输入门禁: + +- 首次运行: + - 允许上述文件不存在; + - 必须先执行初始化脚本生成最小文件骨架,再继续后续步骤。 +- 非首次运行: + - 关键输入文件必须存在且 schema 合法; + - 若不合法,必须按 `references/failure-playbook.md` 执行“全量重建/重抓取”恢复流程,不允许手工拼接 JSON 继续运行。 + +模板与脚本目录(必须使用): + +- JSON 模板:`.claude/skills/code-review/references/` +- 流程脚本:`.claude/skills/code-review/scripts/` + +执行步骤(必须按顺序): + +1. 初始化评审目录与基础文件: + - PR:`python3 .claude/skills/code-review/scripts/init_review_workspace.py --repo-root --target-type pr --target-id --base-ref --head-ref --base-sha --head-sha ` + - 分支:`python3 .claude/skills/code-review/scripts/init_review_workspace.py --repo-root --target-type branch --target-id --base-ref --head-ref --base-sha --head-sha ` +2. 生成/更新 `review-plan.md`(可基于 `references/review-plan.template.md` 骨架,但必须补齐本轮大项/子项),并将当前阶段标记为 `Phase 1 in_progress`。 +3. 拉取 review comments 到 `comments/review-comments.json`: + - PR 评审:必须运行 `python3 .claude/skills/code-review/scripts/fetch_review_comments.py --repo-root --target-type pr --target-id `,仅 `PR review comments` + - 分支评审:可为空,或导入分支评审评论快照 + - `review-comments.json` 必须是标准对象结构(根对象含 `comments` 数组,元素含 `comment_id/path/line/side/body`);若不满足,视为上游脚本错误,必须先修正上游脚本。 + - 评论项必须包含 `thread_resolved` 布尔字段;流程状态仅由该字段决定(`true -> resolved`,`false -> open`)。 + - `snapshot/` 必须保留源码相对路径层级,禁止平铺文件名。示例:`snapshot/round-2/files/core/ebpf/protocol/redis/RedisParser.cpp`。若出现平铺结果,视为快照脚本错误或中途中断,必须重跑修正。 +4. 生成/更新评论状态文件: + - PR:`python3 .claude/skills/code-review/scripts/update_comment_status.py --repo-root --target-type pr --target-id ` + - 分支:`python3 .claude/skills/code-review/scripts/update_comment_status.py --repo-root --target-type branch --target-id ` + - 说明:这一步只同步结构与流程状态(`status_flow`)并保留历史 `status_tech`,不会自动做代码复核判定。 +5. 生成双维状态 Markdown 报告(表格): + - PR:`python3 .claude/skills/code-review/scripts/generate_comment_status_report.py --repo-root --target-type pr --target-id ` + - 分支:`python3 .claude/skills/code-review/scripts/generate_comment_status_report.py --repo-root --target-type branch --target-id ` + - 输出文件固定为:`comments/comment-status.md`(列:评论时间、文件、行号、作者、评论、流程状态、技术状态) +6. 
计算增量映射与回退建议(`--base` 与 `--head` 必须传 commit SHA): + - PR:`python3 .claude/skills/code-review/scripts/incremental_review_mapper.py --repo-root --target-type pr --target-id --base --head --review-round ` + - 分支:`python3 .claude/skills/code-review/scripts/incremental_review_mapper.py --repo-root --target-type branch --target-id --base --head --review-round ` + - 当 `snapshot/latest.json` 存在时,映射脚本会计算 `snapshot_match_rate`,用于 rebase 冲突调整或 squash 合并后的增量决策辅助。 +7. 根据脚本输出中的 `recommendation` 执行: + - `incremental`:只评审 `need_review_commits` + - `partial`:优先评审 `need_review_commits`,并补审低置信 hunk + - `full`:执行全量评审,但必须做历史意见去重 + +8. 技术状态(`status_tech`)必须逐条复核,不允许猜测: + - 必读输入(按顺序): + 1) `comments/review-comments.json` + 2) `comments/comment-status.json` + 3) `reviewed_commits.json` + 4) 当前代码中与 comment `path` 对应文件 + 5) `snapshot/` 中同路径历史快照文件(若存在) + - 逐条处理规则(按 `comment_id`): + - 仅允许更新:`status_tech`、`mapped_finding_id`、`notes` + - `status_tech` 仅可取:`fixed|not-fixed|false-positive|partially-fixed` + - `notes` 必须写明“判定证据”,至少包含:对比文件、关键代码变化、结论原因 + - 每轮必须优先复核上一轮未终态条目(`not-fixed`、`partially-fixed`)。 + - 人工手动订正(支持): + - 若评论作者本人(当前 `gh` 登录账号)在该评论线程回复文本包含 `fixed`,状态同步为 `fixed`。 + - 若回复文本包含 `false-positive`(或 `false positive`),状态同步为 `false-positive`。 + - 手动订正由脚本在更新 `comment-status.json` 时自动吸收,并写入 `notes`。 + - 终态跳过规则(默认开启): + - 当前 `status_tech` 为 `fixed` 或 `false-positive` 的条目,本轮默认跳过技术复核。 + - 仅在以下条件触发时重开复核: + 1) 条目 `path` 在本轮 commit 范围内再次发生修改; + 2) 条目 `status_flow` 从 `resolved` 变为非 `resolved`; + 3) 人工显式指定强制复核(按 `comment_id` 列表)。 + - 输出要求: + - 更新后的 `comments/comment-status.json` + - 重新生成 `comments/comment-status.md` +9. 本轮评审收尾后,必须生成 snapshot 供下一轮增量决策使用: + - PR:`python3 .claude/skills/code-review/scripts/build_snapshot.py --repo-root --target-type pr --target-id --base --head --review-round ` + - 分支:`python3 .claude/skills/code-review/scripts/build_snapshot.py --repo-root --target-type branch --target-id --base --head --review-round ` + - 产物:`snapshot/round-/files/*`、`snapshot/round-/manifest.json`、`snapshot/latest.json` + +状态文件字段约束(必须遵守): + +- `reviewed_commits.json` 记录: + - `commit_sha` + - `patch_id`(用于 rebase 后精确映射) + - `review_round` + - `reviewed_at` + - `hunk_fingerprints`(数组) +- `comments/comment-status.json` 记录: + - `comment_id` + - `path` / `line` / `side` + - `body` + - `snippet`(可读代码片段) + - `snippet_fingerprint`(规范化片段 hash) + - `status_flow`(`open|resolved|wont-fix|deferred`) + - `status_tech`(`fixed|not-fixed|false-positive|partially-fixed`) + - `mapped_finding_id` + +说明: + +- `snippet_fingerprint` 定义为“规范化代码片段 + 文件路径 + 评论定位三元组(line/side/comment_id)”的稳定 hash,不能只用行号。 +- 允许人工修正 `status_flow` 与 `status_tech`,但不得删除历史记录。 + +增量评审策略(必须执行): + +1. 优先读取 `reviewed_commits.json`,只评审未覆盖的新变更。 +2. 若检测到 rebase/force-push,不可直接判定全量重审,先做映射再决策: + - L1(高置信):按 `patch-id` 映射旧 commit -> 新 commit,命中后继承“已评审”状态。 + - L2(中置信):按 `path + 规范化 hunk 片段 + hunk 上下文` 做指纹匹配,仅补审未命中 hunk。 + - L3(低置信):命中率低或冲突改写明显时,回退全量评审。 +3. 置信度门槛默认: + - `commit_map_rate >= 90%`:增量通过 + - `hunk_match_rate >= 80%`:局部补审 + - 否则全量回退 +4. 即使全量回退,也必须复用历史评论与 finding 去重,避免重复意见。 + +snapshot 在增量决策中的职责(必须遵守): + +1. `snapshot` 是增量决策辅助依据,不替代 git 主链路(`patch-id`/`hunk`)。 +2. rebase 且发生冲突改写时,若 commit/hunk 映射不足,可使用 `snapshot_match_rate` 辅助从 `full` 降到 `partial`。 +3. squash 合并导致 commit 边界丢失时,`snapshot_match_rate` 用于判断是否可继续增量评审。 +4. 若 `snapshot_match_rate` 不足阈值,仍必须 `full` 全量评审。 + +## Phase 2: Context Building(全局认知) + +开始评审前,必须先完成以下步骤: + +1. 读取 `../project-knowledge/SKILL.md`,建立系统架构和模块职责认知。 +2. 
读取 `../project-knowledge/SKILL.md`,优先吸收: + - 公共能力入口(必须复用的 common/helper) + - 生命周期与资源释放不变量 + - 配置/环境变量约定(兼容大小写、默认值、废弃参数映射) + - 历史 review 高频问题(作为优先检查清单) +3. 读取并参考以下规范(按变更涉及范围选择): + - `../selfmonitor/SKILL.md`(自监控与告警相关改动必读) + - `../security-check/SKILL.md`(安全与合规相关改动必读) + - `../compile/SKILL.md`(涉及构建/编译链路时必读) +4. 基于 PR/分支变更列表,读取受影响文件的完整上下文(至少覆盖变更函数、调用方、定义处)。 +5. 若改动涉及 pipeline/runner/配置系统,必须先阅读以下代码再下结论: + - `core/application/Application.cpp`(主循环、配置扫描、退出顺序) + - `core/collection_pipeline/CollectionPipelineManager.cpp` + - `core/collection_pipeline/CollectionPipeline.cpp` + - `core/runner/ProcessorRunner.cpp` + - `core/runner/FlusherRunner.cpp` + - `core/config/watcher/PipelineConfigWatcher.cpp` + - `core/config/OnetimeConfigInfoManager.cpp` + - `core/file_server/FileServer.cpp` + - `core/file_server/checkpoint/CheckPointManager.cpp` + - `core/file_server/checkpoint/CheckpointManagerV2.cpp`(改动涉及 exactly-once 时) +6. 通过 MCP/`gh` 工具拉取评审上下文: + - PR 描述、提交历史、PR review comments、CI 状态 + - 最近约 10 个相关 PR 的 review 评论(提炼团队偏好) +7. 若可访问 Code 平台历史评论,优先抽样最近已合入 PR 的 review comments(建议>=30条)并做“模式交叉”: + - 把历史高频问题映射到本次变更文件,标记为“高风险检查项” + - 若与 `codebase-map` 冲突,以“最新代码事实 + 评论证据”更新结论 +8. 若发现历史约束或设计决策冲突,先记录“假设与证据”,后续在报告中显式说明。 + +## Phase 3: Intent Analysis(意图理解) + +完成上下文分析后,必须先产出“理解文档”,再进入问题列表。该文档是给开发者学习和理解代码用的,不能省略。 + +### Phase 3 输出要求(必须输出文档) + +必须输出一个独立文档(建议标题:`Code Review - Intent & Architecture Notes`),至少包含: + +- 作者意图:这个 PR/分支要解决什么问题,为什么现在做。 +- 端到端流程:从入口到出口,这次变更实际改变了哪些关键路径。 +- 影响范围:涉及哪些模块、接口、配置、状态文件、监控指标、告警链路。 +- 预期结果验证:改动是否达到目标,并给出证据与推理过程。 + +### Phase 3 落盘要求(必须写入 code-review 目录) + +必须将 Phase 3 文档写入仓库 `code-review/` 目录,禁止只在聊天中输出。 + +建议路径: + +- PR 评审:`code-review/pr-/intent-architecture-notes.md` +- 分支评审:`code-review/branch-/intent-architecture-notes.md`(`/` 替换为 `-`) + +要求: + +- 若目录不存在必须先创建。 +- 文档顶部必须包含评审对象元信息(PR号/分支名、commit范围、生成时间)。 + +### Mermaid 可视化要求(必须至少 2 张图) + +该理解文档必须包含 Mermaid 图,用于帮助学习与沟通。按改动内容选择,至少输出以下 2 类中的 2 张: + +- 架构图(模块关系 / 依赖边界) +- 流程图(关键执行路径) +- 时序图(组件交互、调用顺序、异步/重试行为) +- 数据结构图(关键状态对象、队列、checkpoint 主从关系) + +建议: + +- 小改动:至少 2 张图(流程 + 时序) +- 中大型改动:3-4 张图(架构 + 流程 + 时序 + 数据结构) + +注意: + +- 图必须与当前变更强相关,禁止画与本次 PR 无关的“百科全图”。 +- 图中节点命名使用代码中的真实组件/类型名称,避免抽象空词。 +- Mermaid 语法请遵循 `../mermaid/SKILL.md`。 + +## 牢记评估标准(无需输出) + +对每个变更文件和差异块,按以下 6 组标准检查: + +1. 业务与架构:目标达成、职责边界、拓扑与依赖、故障传播。 +2. 正确性与安全:边界检查、类型/异常处理、外部输入防御、安全合规。 +3. 并发与生命周期:线程/锁/队列正确退出、资源释放、状态恢复。 +4. 性能与资源:热路径复杂度、拷贝与分配、容量上限、日志开销。 +5. 稳定性与可观测:指标/日志/告警完整性与可定位性。 +6. 
可维护性、兼容性与文档测试:可读性、向后兼容、文档与测试覆盖。 + +注意:以上不是“通用建议列表”,而是必须落到每个 sub-agent 的责任范围中执行(见下一节责任矩阵)。 + +## Phase 4: Sub-agent Review(专项检查) + +并行启动专项 sub-agent(建议 3-4 个并行,避免过度拆分)。每个 sub-agent 独立输出“发现的问题 + 证据”。 +每个 sub-agent 必须引用“牢记评估标准”中对应条目,不得只做口头判断。 +每个问题必须标注来源标准编号(例如:`[S3]` 表示“并发与生命周期”)。 + +### 责任矩阵(主责/次责) + +- Sub-agent A(逻辑与架构):主责 `S1`,次责 `S6` +- Sub-agent B(并发与生命周期):主责 `S3`,次责 `S5` +- Sub-agent C(安全稳定与性能):主责 `S2` + `S4`,次责 `S5` +- Sub-agent D(复用、兼容、文档测试):主责 `S6`,次责 `S1` + `S5` + +规则: + +- 主责标准必须全量覆盖;次责标准只需覆盖与本次改动直接相关的部分。 +- 若某问题跨多个标准,允许多标记(如 `[S2][S4]`)。 +- 不允许多个 agent 报告同一问题的重复结论;若重复,保留证据更完整的一条。 + +### Sub-agent A: 逻辑正确性与架构一致性 + +- 业务逻辑是否完整,是否存在边界漏处理、状态不一致、错误传播断裂。 +- 与 LoongCollector 架构约束是否一致(输入/处理/输出职责、Runner 模式、配置注册模式)。 +- 是否引入隐式依赖、循环依赖或故障传播不可观测的问题。 +- 重点覆盖评估标准:业务与架构、可维护性与兼容性。 + +### Sub-agent B: 并发、异步与生命周期 + +- 锁粒度、锁顺序、数据竞争、线程退出路径是否安全。 +- 回调/异步流程是否存在竞态、悬空引用、未处理失败路径。 +- 新增线程/定时任务是否可控停止,是否符合项目既有模式。 +- 重点覆盖评估标准:并发与生命周期、稳定性与可观测。 +- 生命周期/资源管理必查细则(必须逐项核对,重点是“正确释放与状态恢复”): + - 资源释放闭环: + - 每条路径(启动失败、热更新替换、删除配置、进程退出)都要核对资源闭环: + - 线程/future 可退出并被回收 + - queue pop 被 disable 后不再悬挂 + - 插件/Go pipeline 可停止且不残留引用 + - flush/batch/checkpoint 落盘语义与路径一致 + - 死锁与卡死风险: + - 锁顺序是否跨模块一致(pipeline manager / queue manager / file server)。 + - `WaitAllItemsInProcessFinished`、队列 `Wait/Trigger`、`HoldOn/Resume` 是否可能形成循环等待。 + - 长等待仅告警不终止的路径,是否可能导致永久卡住或退出超时。 + - 状态恢复正确性(核心): + - 热加载后是否恢复到“可继续采集+处理+发送”的一致状态,而非部分组件已恢复。 + - 文件采集 `Pause -> Dump -> ClearCache -> Resume` 后,handler/checkpoint/缓存三者是否一致。 + - 配置失败回滚时,旧 pipeline/task 是否保持可用,不出现半更新状态。 + - 顺序检查作为辅证(不是唯一判据): + - 仍需核对关键顺序(runner init 顺序、pipeline start/stop 顺序),但结论必须落到资源与状态结果。 + +### Sub-agent C: 安全、稳定性与性能 + +- 输入校验、异常处理、重试退避、资源释放(RAII)是否完备。 +- 右值/所有权:核验【调用点-传参-消费点】全链路,防止异常转移或冗余拷贝。 +- 是否存在热路径性能回退(重复计算、拷贝、容器增长失控、高频日志刷屏)。 +- 监控指标/告警是否完整,是否满足自监控规范。 +- 重点覆盖评估标准:正确性与安全、性能与资源、稳定性与可观测。 +- Checkpoint 必查细则(按改动范围选择): + - onetime checkpoint: + - 启动时 `LoadCheckpointFile()`,配置变化后 `DumpCheckpointFile()`。 + - 超时删除、`RemoveConfig()` 与 checkpoint 文件是否保持一致,避免残留条目导致错误恢复。 + - file checkpoint(v1): + - `FileServer::Start()` 是否仍保持 `LoadCheckPoint()` 在前、注册 handler 在后。 + - `Pause/Stop` 是否保证 `DumpCheckPointToLocal()`,以及失败场景是否有可定位日志/告警。 + - exactly-once checkpoint(v2): + - 主 checkpoint 与 range checkpoint 是否成对维护,避免孤儿 key。 + - 扫描与 GC 逻辑是否可能误删活跃 checkpoint,或导致恢复时状态不连续。 + +### Sub-agent D: 复用合规与文档一致性 + +- 是否重复实现了已有公共能力(优先复用 `core/common` 与现有工具函数)。 +- 注释与代码行为是否一致,TODO/FIXME 是否引入新技术债。 +- 插件配置或 `GetXxxParam` 改动是否同步更新 `docs/` 对应文档。 +- 重点覆盖评估标准:可维护性、兼容性与文档测试。 + +## Phase 5: Final Report(最终输出) + +Final Report 偏实用交付,可直接用于落地修复和平台流转。它与 Phase 2 的“理解文档”并行存在、互不替代。 + +### Phase 5 输出要求(实用导向) + +1. 先给 **Findings**,按严重度排序:`Critical` > `High` > `Medium` > `Low`。 +2. 每个问题必须包含可定位证据与可执行建议。 +3. 若未发现问题,明确写出“未发现阻断问题”,并列出残余风险与测试缺口。 +4. 最后补充 **Highlights**(正向实践),简洁即可。 +5. 必须包含 **Lifecycle Verdict**: + - 资源释放:`PASS/FAIL` + - 死锁/卡死风险:`PASS/FAIL` + - 状态恢复正确性:`PASS/FAIL` + - 每项附 1-3 条证据。 +6. 必须包含 **Fix Plan**(按优先级分组): + - 立即修复(阻断合入) + - 合入前修复 + - 可后续改进 +7. 
必须包含 **Validation Plan**(修复后怎么验证): + - 需要跑哪些测试、观察哪些指标、验证哪些告警与恢复路径。 + +### Final Report 落盘要求(必须写入 code-review 目录) + +必须将 Final Report 写入仓库 `code-review/` 目录,禁止只在聊天中输出。 + +建议路径(与 Phase 2 同目录): + +- PR 评审:`code-review/pr-/final-report.md` +- 分支评审:`code-review/branch-/final-report.md`(`/` 替换为 `-`) + +要求: + +- `final-report.md` 必须引用对应的 `intent-architecture-notes.md`(相对路径链接)。 +- 若执行了平台发布(PR评论/Review),在文档末尾记录发布链接;若失败,记录失败原因与重试命令。 + +问题输出格式: + +```markdown +- Severity: + - File: [<路径>:<起始行号>](file://./<路径>#L<起始行号>) + - 问题: <一句话说明问题本质> + - 影响: <可能导致的错误行为/风险> + - 建议: <可直接执行的修复建议,必要时给最小代码片段> +``` + +额外要求: + +- 行号必须在最终输出前重新核对,确保可点击跳转。 +- 仅评论真实变更范围内的问题,避免“顺手重构建议”淹没核心缺陷。 +- 语气专业、直接、简洁,优先给出可验证结论。 + +### 平台发布(可选但推荐) + +若当前评审场景是 PR/分支评审,且工具可用,请在用户要求发布后自动化发布 Final Report: + +- 必须等待用户显式确认后才能执行发布。 +- 发布结构: + 1) **Inline Findings**:将可定位的问题逐条作为代码行内评论发布(不是回复到 PR 主评论)。 + 2) **PR 摘要评论**:将Final Report 摘要回复到 PR 主评论。 + - 必含:Critical/High/Medium/Low 数量统计表、Lifecycle PASS/FAIL 表格、Lifecycle FAIL 证据、总体结论、Highlights。 + - 不含:不重复贴全部 findings。 +- 发布工具: + - 优先使用 `gh` 工具提交结构化评审结果;若环境存在 GitHub MCP,可等价使用 MCP。 + - Inline 评论建议使用 `gh api repos///pulls//comments`(需包含 `commit_id/path/line/side/body`)。 + - 摘要评论建议使用 `gh pr comment --body-file `。 +- 若发布失败,必须在输出中说明失败原因并给出可复制的发布内容。 diff --git a/.claude/skills/code-review/references/comment-status.template.json b/.claude/skills/code-review/references/comment-status.template.json new file mode 100644 index 0000000000..1007f4cd19 --- /dev/null +++ b/.claude/skills/code-review/references/comment-status.template.json @@ -0,0 +1,23 @@ +{ + "version": "1.0", + "generated_at": "", + "review_target": { + "type": "pr", + "id": "" + }, + "status": [ + { + "comment_id": 0, + "path": "", + "line": 0, + "side": "RIGHT", + "body": "", + "snippet": "", + "snippet_fingerprint": "", + "status_flow": "open", + "status_tech": "not-fixed", + "mapped_finding_id": "", + "notes": "" + } + ] +} diff --git a/.claude/skills/code-review/references/failure-playbook.md b/.claude/skills/code-review/references/failure-playbook.md new file mode 100644 index 0000000000..00c2c19b5b --- /dev/null +++ b/.claude/skills/code-review/references/failure-playbook.md @@ -0,0 +1,65 @@ +# Code-Review Failure Playbook + +本文件是故障恢复决策表,目标是让 agent 在异常时做正确分流:自动恢复、回退流程、或请求人工介入。 + +## 总原则 + +- 优先判断当前是首次还是非首次。 +- 不手工拼接 JSON;恢复后必须回到标准流程节点继续执行。 +- Preflight 相关异常默认人工介入,其余优先自动回退到可重建节点。 +- 若脚本失败但不影响代码读取,允许降级继续评审,同时必须输出失败反馈记录。 + +## 场景 1:Preflight 失败(人工介入) + +- 触发信号:`python3 --version` / `git rev-parse --is-inside-work-tree` / `gh auth status` 任一失败 +- 决策:停止自动执行,提示用户介入检查环境与认证 +- 动作级别:`manual_required` +- 返回节点:Preflight(三条检查全部通过后再进入 Phase 1) + +## 场景 2:首次运行缺文件(正常入口,不是失败) + +- 触发信号:`code-review//` 不存在,或缺少 `meta.json` / `reviewed_commits.json` / `comments/*` +- 决策:判定为 Bootstrap,走初始化流程 +- 动作级别:`auto_recover` +- 返回节点:Phase 1-步骤 1(初始化)并顺序继续 + +## 场景 3:非首次运行时输入 schema 非法 + +- 触发信号:`invalid review-comments.json` / `invalid comment-status.json` +- 决策:放弃损坏中间态,回退到 Bootstrap 重建关键输入 +- 动作级别:`auto_recover` +- 返回节点:Phase 1-步骤 1(初始化)-> 步骤 2(拉取 comments)-> 步骤 3(重建状态) + +## 场景 4:commit 对象缺失 / commit 范围构建失败 + +- 触发信号:`missing base/head commit object` 或 `failed to build commit range` +- 决策:先自动同步 git 对象;若仍失败,转人工确认 base/head 选择 +- 动作级别:`auto_then_manual` +- 返回节点: + - 自动恢复成功:Phase 1-步骤 5(增量映射) + - 自动恢复失败:人工确认后重跑步骤 5 + +## 场景 5:snapshot 目录平铺 + +- 触发信号:`snapshot/` 下没有源码相对路径层级(仅平铺文件) +- 决策:视为快照过程异常,清空并重建快照 +- 动作级别:`auto_recover` +- 返回节点:快照生成步骤(完成后继续技术状态复核) + +## 场景 6:脚本运行异常但可继续评审 + +- 触发信号:任意脚本报错,但仓库代码与基础 git/gh 能力仍可读取 +- 决策:允许降级继续评审,避免流程阻塞;并强制记录失败反馈用于迭代 skill +- 
动作级别:`degrade_continue` +- 必做动作: + - 写入 `code-review//script-failures.md`(脚本名、命令、错误、时间、补偿动作) + - 评审策略一律切换到 `full` 全量评审 + - 在 `final-report.md` 增加 “Script Failure Feedback” 小节 +- 返回节点:当前评审阶段(按降级策略继续) + +## 动作级别定义 + +- `manual_required`:必须人工介入后才能继续 +- `auto_recover`:agent 可自动恢复并继续流程 +- `auto_then_manual`:先自动尝试,失败后升级人工 +- `degrade_continue`:允许继续评审,但必须记录失败并反馈 diff --git a/.claude/skills/code-review/references/meta.template.json b/.claude/skills/code-review/references/meta.template.json new file mode 100644 index 0000000000..f1b824918f --- /dev/null +++ b/.claude/skills/code-review/references/meta.template.json @@ -0,0 +1,19 @@ +{ + "version": "1.0", + "repo": "", + "review_target": { + "type": "pr", + "id": "", + "base_ref": "", + "head_ref": "", + "base_sha": "", + "head_sha": "" + }, + "strategy": { + "commit_map_threshold": 0.9, + "hunk_match_threshold": 0.8, + "fallback_on_low_confidence": true + }, + "review_round": 1, + "generated_at": "" +} diff --git a/.claude/skills/code-review/references/review-plan.template.md b/.claude/skills/code-review/references/review-plan.template.md new file mode 100644 index 0000000000..81d4da2d60 --- /dev/null +++ b/.claude/skills/code-review/references/review-plan.template.md @@ -0,0 +1,30 @@ +# Review Plan + +- Review Target: `` +- Base SHA: `` +- Head SHA: `` +- Strategy: `` +- Current Phase: `` + +## Work Items + +> 请按本轮实际任务填写,不要只写 Phase 名称。 +> 建议格式:每个大项下拆 2~5 个子项,并用 checkbox 跟踪进度。 + +### + +- [ ] +- [ ] + +### + +- [ ] +- [ ] + +## Risks / Blockers + +- + +## Notes + +- 若策略切换(例如 `incremental -> full`),先更新本文件再继续执行。 diff --git a/.claude/skills/code-review/references/reviewed_commits.template.json b/.claude/skills/code-review/references/reviewed_commits.template.json new file mode 100644 index 0000000000..3d069b8334 --- /dev/null +++ b/.claude/skills/code-review/references/reviewed_commits.template.json @@ -0,0 +1,19 @@ +{ + "version": "1.0", + "review_rounds": [], + "commits": [ + { + "commit_sha": "", + "patch_id": "", + "review_round": 1, + "reviewed_at": "", + "hunk_fingerprints": [], + "files": [], + "mapping": { + "method": "direct|patch-id|hunk-similarity|none", + "mapped_from_commit": "", + "confidence": 0.0 + } + } + ] +} diff --git a/.claude/skills/code-review/scripts/build_snapshot.py b/.claude/skills/code-review/scripts/build_snapshot.py new file mode 100755 index 0000000000..f14ea55962 --- /dev/null +++ b/.claude/skills/code-review/scripts/build_snapshot.py @@ -0,0 +1,148 @@ +#!/usr/bin/env python3 +import argparse +import hashlib +import json +import re +import subprocess +from datetime import datetime, timezone +from pathlib import Path +from typing import Dict, List, Set, Tuple + + +def utc_now() -> str: + return datetime.now(timezone.utc).replace(microsecond=0).isoformat() + + +def sanitize_branch_name(branch_name: str) -> str: + return branch_name.replace("/", "-") + + +def resolve_target(args: argparse.Namespace) -> Tuple[str, str]: + if args.target_type and args.target_id: + target_type = args.target_type + target_id = args.target_id + elif args.pr_number is not None: + target_type = "pr" + target_id = str(args.pr_number) + elif args.branch_name: + target_type = "branch" + target_id = args.branch_name + else: + raise SystemExit("must provide either --target-type/--target-id or --pr-number or --branch-name") + if target_type not in {"pr", "branch"}: + raise SystemExit("target type must be pr or branch") + return target_type, target_id + + +def run_git(repo_root: Path, args: List[str]) -> str: + proc = subprocess.run(["git", *args], 
cwd=repo_root, text=True, capture_output=True, check=True) + return proc.stdout + + +def run_git_no_check(repo_root: Path, args: List[str]) -> subprocess.CompletedProcess: + return subprocess.run(["git", *args], cwd=repo_root, text=True, capture_output=True, check=False) + + +def normalize_file_content(text: str) -> str: + lines = [re.sub(r"\s+", " ", line.strip()) for line in text.splitlines()] + return "\n".join(lines) + + +def stable_hash(text: str) -> str: + return hashlib.sha256(text.encode("utf-8")).hexdigest() + + +def get_changed_files(repo_root: Path, base_sha: str, head_sha: str) -> List[str]: + out = run_git(repo_root, ["diff", "--name-only", f"{base_sha}..{head_sha}"]) + return sorted({line.strip() for line in out.splitlines() if line.strip()}) + + +def get_file_content_at_commit(repo_root: Path, commit_sha: str, path: str) -> str: + proc = run_git_no_check(repo_root, ["show", f"{commit_sha}:{path}"]) + if proc.returncode != 0: + return "" + return proc.stdout + + +def main() -> None: + parser = argparse.ArgumentParser(description="Build snapshot baseline for incremental review.") + parser.add_argument("--repo-root", required=True) + parser.add_argument("--target-type", choices=["pr", "branch"]) + parser.add_argument("--target-id") + parser.add_argument("--pr-number", type=int, help="PR number (legacy compatible)") + parser.add_argument("--branch-name", help="Branch name (legacy compatible)") + parser.add_argument("--base", required=True, help="Base commit SHA") + parser.add_argument("--head", required=True, help="Head commit SHA") + parser.add_argument("--review-round", required=True, type=int) + args = parser.parse_args() + + repo_root = Path(args.repo_root).resolve() + target_type, target_id_raw = resolve_target(args) + target_id_dir = sanitize_branch_name(target_id_raw) if target_type == "branch" else target_id_raw + review_dir = repo_root / "code-review" / f"{target_type}-{target_id_dir}" + + snapshot_root = review_dir / "snapshot" / f"round-{args.review_round}" + files_root = snapshot_root / "files" + files_root.mkdir(parents=True, exist_ok=True) + + changed_files = get_changed_files(repo_root, args.base, args.head) + manifest_files: List[Dict[str, object]] = [] + + for rel_path in changed_files: + content = get_file_content_at_commit(repo_root, args.head, rel_path) + if content == "": + # Deleted file at head; keep entry for audit but no content snapshot. 
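+            # An existing-but-empty file at head also lands here, since "git show" returns empty output for it.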
+ manifest_files.append( + {"path": rel_path, "exists_in_head": False, "raw_hash": "", "normalized_hash": "", "size": 0} + ) + continue + + out_path = files_root / rel_path + out_path.parent.mkdir(parents=True, exist_ok=True) + out_path.write_text(content, encoding="utf-8") + manifest_files.append( + { + "path": rel_path, + "exists_in_head": True, + "raw_hash": stable_hash(content), + "normalized_hash": stable_hash(normalize_file_content(content)), + "size": len(content.encode("utf-8")), + } + ) + + manifest = { + "version": "1.0", + "review_target": {"type": target_type, "id": target_id_raw}, + "review_round": args.review_round, + "base_sha": args.base, + "head_sha": args.head, + "generated_at": utc_now(), + "files": manifest_files, + } + manifest_path = snapshot_root / "manifest.json" + manifest_path.write_text(json.dumps(manifest, ensure_ascii=False, indent=2) + "\n", encoding="utf-8") + + latest = { + "latest_round": args.review_round, + "manifest": str(manifest_path.relative_to(review_dir)), + "updated_at": utc_now(), + } + latest_path = review_dir / "snapshot" / "latest.json" + latest_path.parent.mkdir(parents=True, exist_ok=True) + latest_path.write_text(json.dumps(latest, ensure_ascii=False, indent=2) + "\n", encoding="utf-8") + + print( + json.dumps( + { + "review_target": {"type": target_type, "id": target_id_raw}, + "review_round": args.review_round, + "files": len(manifest_files), + "manifest": str(manifest_path), + }, + ensure_ascii=False, + ) + ) + + +if __name__ == "__main__": + main() diff --git a/.claude/skills/code-review/scripts/fetch_review_comments.py b/.claude/skills/code-review/scripts/fetch_review_comments.py new file mode 100755 index 0000000000..2bca6fca0b --- /dev/null +++ b/.claude/skills/code-review/scripts/fetch_review_comments.py @@ -0,0 +1,204 @@ +#!/usr/bin/env python3 +import argparse +import json +import subprocess +from datetime import datetime, timezone +from pathlib import Path +from typing import Any, Dict, List, Tuple + + +def utc_now() -> str: + return datetime.now(timezone.utc).replace(microsecond=0).isoformat() + + +def run_cmd(args: List[str], cwd: Path) -> str: + proc = subprocess.run(args, cwd=cwd, text=True, capture_output=True, check=False) + if proc.returncode != 0: + raise SystemExit(f"command failed: {' '.join(args)}\n{proc.stderr.strip()}") + return proc.stdout + + +def sanitize_branch_name(branch_name: str) -> str: + return branch_name.replace("/", "-") + + +def resolve_target(args: argparse.Namespace) -> Tuple[str, str]: + if args.target_type and args.target_id: + target_type = args.target_type + target_id = args.target_id + elif args.pr_number is not None: + target_type = "pr" + target_id = str(args.pr_number) + elif args.branch_name: + target_type = "branch" + target_id = args.branch_name + else: + raise SystemExit("must provide either --target-type/--target-id or --pr-number or --branch-name") + if target_type not in {"pr", "branch"}: + raise SystemExit("target type must be pr or branch") + return target_type, target_id + + +def parse_name_with_owner(repo_root: Path) -> Tuple[str, str]: + out = run_cmd(["gh", "repo", "view", "--json", "nameWithOwner", "--jq", ".nameWithOwner"], repo_root).strip() + if "/" not in out: + raise SystemExit(f"invalid repository nameWithOwner: {out}") + owner, name = out.split("/", 1) + return owner, name + + +def get_viewer_login(repo_root: Path) -> str: + out = run_cmd(["gh", "api", "user", "--jq", ".login"], repo_root).strip() + return out + + +def run_graphql(repo_root: Path, owner: str, name: str, 
pr_number: int, cursor: str) -> Dict[str, Any]: + # Query review threads instead of plain review comments so we can + # persist thread-level resolution state deterministically. + query = """ +query($owner:String!, $name:String!, $number:Int!, $endCursor:String) { + repository(owner:$owner, name:$name) { + pullRequest(number:$number) { + reviewThreads(first:100, after:$endCursor) { + pageInfo { hasNextPage endCursor } + nodes { + isResolved + comments(first:100) { + nodes { + databaseId + body + path + line + originalLine + createdAt + updatedAt + author { login } + originalCommit { oid } + replyTo { databaseId } + } + } + } + } + } + } +} +""" + cmd = [ + "gh", + "api", + "graphql", + "-f", + f"query={query}", + "-F", + f"owner={owner}", + "-F", + f"name={name}", + "-F", + f"number={pr_number}", + ] + if cursor: + cmd.extend(["-F", f"endCursor={cursor}"]) + out = run_cmd(cmd, repo_root) + return json.loads(out) + + +def main() -> None: + parser = argparse.ArgumentParser(description="Fetch PR review comments to stable schema file.") + parser.add_argument("--repo-root", required=True) + parser.add_argument("--target-type", choices=["pr", "branch"]) + parser.add_argument("--target-id") + parser.add_argument("--pr-number", type=int, help="PR number (legacy compatible)") + parser.add_argument("--branch-name", help="Branch name (legacy compatible)") + args = parser.parse_args() + + repo_root = Path(args.repo_root).resolve() + target_type, target_id_raw = resolve_target(args) + target_id_dir = sanitize_branch_name(target_id_raw) if target_type == "branch" else target_id_raw + review_dir = repo_root / "code-review" / f"{target_type}-{target_id_dir}" + comments_path = review_dir / "comments" / "review-comments.json" + comments_path.parent.mkdir(parents=True, exist_ok=True) + + if target_type != "pr": + payload = { + "version": "1.0", + "source": "branch_review_comments", + "fetched_at": utc_now(), + "review_target": {"type": target_type, "id": target_id_raw}, + "comments": [], + } + comments_path.write_text(json.dumps(payload, ensure_ascii=False, indent=2) + "\n", encoding="utf-8") + print(json.dumps({"target": f"{target_type}-{target_id_raw}", "threads": 0, "comments": 0, "resolved_threads": 0})) + return + + owner, name = parse_name_with_owner(repo_root) + viewer_login = get_viewer_login(repo_root) + pr_number = int(target_id_raw) + + cursor = "" + has_next = True + comments: List[Dict[str, Any]] = [] + total_threads = 0 + resolved_threads = 0 + + while has_next: + # Paginate until all review threads are collected. + data = run_graphql(repo_root, owner, name, pr_number, cursor) + threads_obj = data["data"]["repository"]["pullRequest"]["reviewThreads"] + page_info = threads_obj["pageInfo"] + threads = threads_obj["nodes"] or [] + total_threads += len(threads) + for thread in threads: + is_resolved = bool(thread.get("isResolved", False)) + if is_resolved: + resolved_threads += 1 + thread_comments = thread.get("comments", {}).get("nodes", []) or [] + for c in thread_comments: + author = (c.get("author") or {}).get("login", "") + original_commit = (c.get("originalCommit") or {}).get("oid", "") + reply_to = (c.get("replyTo") or {}).get("databaseId") + # Use originalLine as a stable anchor because line can be null + # after code evolves on newer commits. 
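+                    # Fall back to 0 when originalLine is absent so "line" stays an integer for downstream consumers.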
+ original_line = c.get("originalLine") + line = original_line if isinstance(original_line, int) else 0 + comments.append( + { + "comment_id": c.get("databaseId"), + "author": author, + "created_at": c.get("createdAt", ""), + "updated_at": c.get("updatedAt", ""), + "path": c.get("path", ""), + "line": line, + "side": "RIGHT", + "commit_id": original_commit, + "in_reply_to_id": reply_to, + "body": c.get("body", ""), + "thread_resolved": is_resolved, + } + ) + has_next = bool(page_info.get("hasNextPage")) + cursor = page_info.get("endCursor") if has_next else "" + + payload = { + "version": "1.0", + "source": "github_pr_review_comments", + "fetched_at": utc_now(), + "review_target": {"type": "pr", "id": target_id_raw}, + "viewer_login": viewer_login, + "comments": comments, + } + comments_path.write_text(json.dumps(payload, ensure_ascii=False, indent=2) + "\n", encoding="utf-8") + print( + json.dumps( + { + "target": f"pr-{target_id_raw}", + "threads": total_threads, + "comments": len(comments), + "resolved_threads": resolved_threads, + }, + ensure_ascii=False, + ) + ) + + +if __name__ == "__main__": + main() diff --git a/.claude/skills/code-review/scripts/generate_comment_status_report.py b/.claude/skills/code-review/scripts/generate_comment_status_report.py new file mode 100755 index 0000000000..68e3aca8f5 --- /dev/null +++ b/.claude/skills/code-review/scripts/generate_comment_status_report.py @@ -0,0 +1,110 @@ +#!/usr/bin/env python3 +import argparse +import json +from pathlib import Path +from typing import Dict, List, Tuple + + +def sanitize_branch_name(branch_name: str) -> str: + return branch_name.replace("/", "-") + + +def resolve_target(args: argparse.Namespace) -> Tuple[str, str]: + if args.target_type and args.target_id: + target_type = args.target_type + target_id = args.target_id + elif args.pr_number is not None: + target_type = "pr" + target_id = str(args.pr_number) + elif args.branch_name: + target_type = "branch" + target_id = args.branch_name + else: + raise SystemExit("must provide either --target-type/--target-id or --pr-number or --branch-name") + if target_type not in {"pr", "branch"}: + raise SystemExit("target type must be pr or branch") + return target_type, target_id + + +def read_json(path: Path) -> Dict: + if not path.exists(): + raise SystemExit(f"missing file: {path}") + return json.loads(path.read_text(encoding="utf-8")) + + +def esc_cell(text: str) -> str: + return (text or "").replace("\n", " ").replace("|", "\\|").replace("`", "").strip() + + +def build_comment_meta_map(review_comments_payload: Dict) -> Dict[int, Dict]: + comments = review_comments_payload.get("comments", []) + meta_map: Dict[int, Dict] = {} + if not isinstance(comments, list): + return meta_map + for c in comments: + cid = c.get("comment_id") + if isinstance(cid, int): + meta_map[cid] = c + return meta_map + + +def build_markdown(target_type: str, target_id: str, items: List[Dict], comment_meta: Dict[int, Dict]) -> str: + lines = [] + lines.append(f"# Comment Status Report ({target_type}-{target_id})") + lines.append("") + lines.append(f"- Total: {len(items)}") + lines.append("") + lines.append("| 评论时间 | File | Line | 作者 | Comment | Flow | Tech |") + lines.append("|---|---|---:|---|---|---|---|") + for item in items: + cid = item.get("comment_id", "") + meta = comment_meta.get(cid, {}) + created_at = esc_cell(str(meta.get("created_at", ""))) + author = esc_cell(str(meta.get("author", ""))) + path = esc_cell(str(item.get("path", ""))) + line = item.get("line", 0) + body = 
esc_cell(str(item.get("body", ""))) + if len(body) > 160: + body = body[:157] + "..." + status_flow = esc_cell(str(item.get("status_flow", ""))) + status_tech = esc_cell(str(item.get("status_tech", ""))) + lines.append( + f"| {created_at} | `{path}` | {line} | {author} | {body} | {status_flow} | {status_tech} |" + ) + lines.append("") + return "\n".join(lines) + + +def main() -> None: + parser = argparse.ArgumentParser(description="Generate markdown report from comment-status.json.") + parser.add_argument("--repo-root", required=True) + parser.add_argument("--target-type", choices=["pr", "branch"]) + parser.add_argument("--target-id") + parser.add_argument("--pr-number", type=int, help="PR number (legacy compatible)") + parser.add_argument("--branch-name", help="Branch name (legacy compatible)") + args = parser.parse_args() + + repo_root = Path(args.repo_root).resolve() + target_type, target_id_raw = resolve_target(args) + target_id_dir = sanitize_branch_name(target_id_raw) if target_type == "branch" else target_id_raw + review_dir = repo_root / "code-review" / f"{target_type}-{target_id_dir}" + + status_path = review_dir / "comments" / "comment-status.json" + review_comments_path = review_dir / "comments" / "review-comments.json" + report_path = review_dir / "comments" / "comment-status.md" + + payload = read_json(status_path) + if not isinstance(payload, dict) or not isinstance(payload.get("status"), list): + raise SystemExit("invalid comment-status.json: root must be object and `status` must be list") + review_comments_payload = read_json(review_comments_path) + if not isinstance(review_comments_payload, dict): + raise SystemExit("invalid review-comments.json: root must be object") + + comment_meta = build_comment_meta_map(review_comments_payload) + markdown = build_markdown(target_type, target_id_raw, payload["status"], comment_meta) + report_path.write_text(markdown + "\n", encoding="utf-8") + print(str(report_path)) + + +if __name__ == "__main__": + main() diff --git a/.claude/skills/code-review/scripts/incremental_review_mapper.py b/.claude/skills/code-review/scripts/incremental_review_mapper.py new file mode 100755 index 0000000000..d8d0a558d6 --- /dev/null +++ b/.claude/skills/code-review/scripts/incremental_review_mapper.py @@ -0,0 +1,450 @@ +#!/usr/bin/env python3 +import argparse +import hashlib +import json +import re +import subprocess +from pathlib import Path +from dataclasses import dataclass +from datetime import datetime, timezone +from typing import Dict, List, Optional, Set, Tuple + + +def run_git(repo_root: Path, args: List[str]) -> str: + result = subprocess.run( + ["git", *args], + cwd=repo_root, + text=True, + capture_output=True, + check=True, + ) + return result.stdout + + +def run_git_no_check(repo_root: Path, args: List[str]) -> subprocess.CompletedProcess: + return subprocess.run( + ["git", *args], + cwd=repo_root, + text=True, + capture_output=True, + check=False, + ) + + +def utc_now() -> str: + return datetime.now(timezone.utc).replace(microsecond=0).isoformat() + + +def normalize_code_line(line: str) -> str: + return re.sub(r"\s+", " ", line.strip()) + + +def stable_hash(text: str) -> str: + return hashlib.sha256(text.encode("utf-8")).hexdigest() + + +def normalize_file_content(text: str) -> str: + # Keep line boundaries but normalize whitespace noise for robust matching. 
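+    # Keep this in sync with normalize_file_content in build_snapshot.py; snapshot hashes are only comparable if both normalize identically.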
+ lines = [re.sub(r"\s+", " ", line.strip()) for line in text.splitlines()] + return "\n".join(lines) + + +def compute_patch_id(repo_root: Path, commit_sha: str) -> str: + patch_text = run_git(repo_root, ["show", "--pretty=format:", "--no-color", commit_sha]) + proc = subprocess.run( + ["git", "patch-id", "--stable"], + cwd=repo_root, + text=True, + input=patch_text, + capture_output=True, + check=True, + ) + output = proc.stdout.strip() + return output.split()[0] if output else "" + + +def get_commit_files(repo_root: Path, commit_sha: str) -> List[str]: + out = run_git(repo_root, ["show", "--pretty=format:", "--name-only", "--no-color", commit_sha]) + return sorted({line.strip() for line in out.splitlines() if line.strip()}) + + +def get_file_content_at_commit(repo_root: Path, commit_sha: str, path: str) -> Optional[str]: + proc = run_git_no_check(repo_root, ["show", f"{commit_sha}:{path}"]) + if proc.returncode != 0: + return None + return proc.stdout + + +def load_latest_snapshot_map(review_dir: Path) -> Dict[str, str]: + latest_path = review_dir / "snapshot" / "latest.json" + if not latest_path.exists(): + return {} + try: + latest = json.loads(latest_path.read_text(encoding="utf-8")) + except Exception: + return {} + manifest_rel = latest.get("manifest") + if not isinstance(manifest_rel, str) or not manifest_rel: + return {} + manifest_path = review_dir / manifest_rel + if not manifest_path.exists(): + return {} + try: + manifest = json.loads(manifest_path.read_text(encoding="utf-8")) + except Exception: + return {} + mapping: Dict[str, str] = {} + for item in manifest.get("files", []): + path = item.get("path") + n_hash = item.get("normalized_hash") + if isinstance(path, str) and isinstance(n_hash, str): + mapping[path] = n_hash + return mapping + + +def compute_snapshot_match_rate( + repo_root: Path, head_sha: str, changed_files: Set[str], snapshot_map: Dict[str, str] +) -> Optional[float]: + if not snapshot_map or not changed_files: + return None + overlap = [p for p in changed_files if p in snapshot_map] + if not overlap: + return None + matched = 0 + for path in overlap: + content = get_file_content_at_commit(repo_root, head_sha, path) + if content is None: + continue + current_hash = stable_hash(normalize_file_content(content)) + if current_hash == snapshot_map[path]: + matched += 1 + return matched / len(overlap) + + +def parse_hunk_fingerprints(repo_root: Path, commit_sha: str) -> List[str]: + patch = run_git(repo_root, ["show", "--pretty=format:", "--no-color", "-U3", commit_sha]) + lines = patch.splitlines() + file_path = "" + hunk_header = "" + hunk_lines: List[str] = [] + fps: List[str] = [] + + def flush() -> None: + nonlocal hunk_lines, hunk_header + if not hunk_lines: + return + key = file_path + "\n" + hunk_header + "\n" + "\n".join(hunk_lines) + fps.append(stable_hash(key)) + hunk_lines = [] + hunk_header = "" + + for line in lines: + if line.startswith("diff --git "): + flush() + m = re.search(r" b/(.+)$", line) + file_path = m.group(1) if m else "" + continue + if line.startswith("@@"): + flush() + hunk_header = line + continue + if line.startswith("+") or line.startswith("-"): + if line.startswith("+++") or line.startswith("---"): + continue + hunk_lines.append(normalize_code_line(line[1:])) + + flush() + return sorted(set(fps)) + + +def jaccard(a: Set[str], b: Set[str]) -> float: + if not a and not b: + return 1.0 + if not a or not b: + return 0.0 + return len(a & b) / len(a | b) + + +@dataclass +class CommitRecord: + commit_sha: str + patch_id: str + 
hunk_fingerprints: List[str] + review_round: int + reviewed_at: str + mapping: Dict[str, object] + + +def load_json(path: Path) -> Dict: + if not path.exists(): + return {} + return json.loads(path.read_text(encoding="utf-8")) + + +def save_json(path: Path, payload: Dict) -> None: + path.write_text(json.dumps(payload, ensure_ascii=False, indent=2) + "\n", encoding="utf-8") + + +def sanitize_branch_name(branch_name: str) -> str: + return branch_name.replace("/", "-") + + +def resolve_target(args: argparse.Namespace) -> Tuple[str, str]: + if args.target_type and args.target_id: + target_type = args.target_type + target_id = args.target_id + elif args.pr_number is not None: + target_type = "pr" + target_id = str(args.pr_number) + elif args.branch_name: + target_type = "branch" + target_id = args.branch_name + else: + raise SystemExit("must provide either --target-type/--target-id or --pr-number or --branch-name") + if target_type not in {"pr", "branch"}: + raise SystemExit("target type must be pr or branch") + return target_type, target_id + + +def ensure_commit_exists(repo_root: Path, sha: str, target_type: str, target_id: str) -> bool: + exists = run_git_no_check(repo_root, ["cat-file", "-e", f"{sha}^{{commit}}"]) + if exists.returncode == 0: + return True + + # First generic fetch to cover normal branch updates. + run_git_no_check(repo_root, ["fetch", "--all", "--prune", "--tags"]) + exists = run_git_no_check(repo_root, ["cat-file", "-e", f"{sha}^{{commit}}"]) + if exists.returncode == 0: + return True + + # Then PR-specific fetch for detached PR heads. + if target_type == "pr": + run_git_no_check(repo_root, ["fetch", "origin", f"pull/{target_id}/head"]) + exists = run_git_no_check(repo_root, ["cat-file", "-e", f"{sha}^{{commit}}"]) + if exists.returncode == 0: + return True + return False + + +def main() -> None: + parser = argparse.ArgumentParser(description="Map reviewed commits for incremental PR/branch review.") + parser.add_argument("--repo-root", required=True) + parser.add_argument("--target-type", choices=["pr", "branch"]) + parser.add_argument("--target-id") + parser.add_argument("--pr-number", type=int, help="PR number (legacy compatible)") + parser.add_argument("--branch-name", help="Branch name (legacy compatible)") + parser.add_argument("--base", required=True, help="Base commit SHA for comparison") + parser.add_argument("--head", required=True, help="Head commit SHA for comparison") + parser.add_argument("--review-round", required=True, type=int) + parser.add_argument("--commit-map-threshold", type=float, default=0.9) + parser.add_argument("--hunk-match-threshold", type=float, default=0.8) + parser.add_argument("--snapshot-match-threshold", type=float, default=0.9) + args = parser.parse_args() + + repo_root = Path(args.repo_root).resolve() + target_type, target_id_raw = resolve_target(args) + target_id_dir = sanitize_branch_name(target_id_raw) if target_type == "branch" else target_id_raw + review_dir = repo_root / "code-review" / f"{target_type}-{target_id_dir}" + reviewed_path = review_dir / "reviewed_commits.json" + reviewed = load_json(reviewed_path) or {"version": "1.0", "review_rounds": [], "commits": []} + old_commits = reviewed.get("commits", []) + snapshot_map = load_latest_snapshot_map(review_dir) + + if not ensure_commit_exists(repo_root, args.base, target_type, target_id_raw): + raise SystemExit( + f"missing base commit object: {args.base}. " + "Please fetch the base branch history, then retry." 
+ ) + if not ensure_commit_exists(repo_root, args.head, target_type, target_id_raw): + raise SystemExit( + f"missing head commit object: {args.head}. " + "For PR review, try: git fetch origin pull//head" + ) + + try: + rev_list_output = run_git(repo_root, ["rev-list", "--reverse", f"{args.base}..{args.head}"]) + except subprocess.CalledProcessError as e: + stderr = (e.stderr or "").strip() + raise SystemExit( + f"failed to build commit range {args.base}..{args.head}: {stderr or 'unknown git error'}" + ) + + current_commits = [sha for sha in rev_list_output.splitlines() if sha] + current_set = set(current_commits) + commit_files_map: Dict[str, List[str]] = {} + current_changed_files: Set[str] = set() + for sha in current_commits: + files = get_commit_files(repo_root, sha) + commit_files_map[sha] = files + current_changed_files.update(files) + + old_by_sha = {c.get("commit_sha"): c for c in old_commits if c.get("commit_sha")} + old_by_patch_id: Dict[str, Dict] = {} + for c in old_commits: + pid = c.get("patch_id") + if pid and pid not in old_by_patch_id: + old_by_patch_id[pid] = c + + mapped: Dict[str, CommitRecord] = {} + unchanged_by_sha = 0 + for sha in current_commits: + if sha in old_by_sha: + oc = old_by_sha[sha] + mapped[sha] = CommitRecord( + commit_sha=sha, + patch_id=oc.get("patch_id", ""), + hunk_fingerprints=oc.get("hunk_fingerprints", []), + review_round=oc.get("review_round", args.review_round), + reviewed_at=oc.get("reviewed_at", utc_now()), + mapping={"method": "direct", "mapped_from_commit": sha, "confidence": 1.0}, + ) + unchanged_by_sha += 1 + + for sha in current_commits: + if sha in mapped: + continue + pid = compute_patch_id(repo_root, sha) + if pid and pid in old_by_patch_id: + oc = old_by_patch_id[pid] + mapped[sha] = CommitRecord( + commit_sha=sha, + patch_id=pid, + hunk_fingerprints=oc.get("hunk_fingerprints", []), + review_round=oc.get("review_round", args.review_round), + reviewed_at=oc.get("reviewed_at", utc_now()), + mapping={"method": "patch-id", "mapped_from_commit": oc.get("commit_sha", ""), "confidence": 0.98}, + ) + + old_unmapped = [c for c in old_commits if c.get("commit_sha") not in current_set] + old_hunk_sets = { + c.get("commit_sha", ""): set(c.get("hunk_fingerprints", [])) for c in old_unmapped if c.get("commit_sha") + } + + for sha in current_commits: + if sha in mapped: + continue + new_hunks = set(parse_hunk_fingerprints(repo_root, sha)) + best_score = 0.0 + best_old = "" + for old_sha, old_hunks in old_hunk_sets.items(): + score = jaccard(new_hunks, old_hunks) + if score > best_score: + best_score = score + best_old = old_sha + if best_old and best_score >= args.hunk_match_threshold: + mapped[sha] = CommitRecord( + commit_sha=sha, + patch_id=compute_patch_id(repo_root, sha), + hunk_fingerprints=sorted(new_hunks), + review_round=args.review_round, + reviewed_at=utc_now(), + mapping={"method": "hunk-similarity", "mapped_from_commit": best_old, "confidence": round(best_score, 4)}, + ) + + need_review: List[str] = [sha for sha in current_commits if sha not in mapped] + + # commit_map_rate measures "how many OLD commits are accounted for in the + # new commit set", NOT "what fraction of current commits are mapped". + # Denominator = old commit count (the baseline we reviewed before). + # This way appending new commits doesn't penalise the rate, while rebase + # that loses old commits correctly lowers it. 
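+    # Direct SHA hits map to themselves, so unchanged commits also count toward old-commit coverage.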
+ old_commits_covered: Set[str] = set() + for rec in mapped.values(): + from_sha = rec.mapping.get("mapped_from_commit", "") + if from_sha: + old_commits_covered.add(from_sha) + old_commit_count = len(old_commits) + commit_map_rate = (len(old_commits_covered) / old_commit_count) if old_commit_count > 0 else 1.0 + + if need_review: + hunk_scores: List[float] = [] + for sha in need_review: + new_hunks = set(parse_hunk_fingerprints(repo_root, sha)) + best = 0.0 + for old_hunks in old_hunk_sets.values(): + best = max(best, jaccard(new_hunks, old_hunks)) + hunk_scores.append(best) + hunk_match_rate = (sum(hunk_scores) / len(hunk_scores)) if hunk_scores else 1.0 + else: + hunk_match_rate = 1.0 + + if commit_map_rate >= args.commit_map_threshold: + recommendation = "incremental" + elif hunk_match_rate >= args.hunk_match_threshold: + recommendation = "partial" + else: + recommendation = "full" + + snapshot_match_rate = compute_snapshot_match_rate(repo_root, args.head, current_changed_files, snapshot_map) + if recommendation == "full" and snapshot_match_rate is not None and snapshot_match_rate >= args.snapshot_match_threshold: + # For squash/rebase-conflict scenarios, snapshot evidence can safely + # downgrade from full to partial. + recommendation = "partial" + + round_record = { + "review_round": args.review_round, + "generated_at": utc_now(), + "base": args.base, + "head": args.head, + "stats": { + "total_commits": len(current_commits), + "mapped_commits": len(mapped), + "direct_sha_hits": unchanged_by_sha, + "commit_map_rate": round(commit_map_rate, 4), + "hunk_match_rate": round(hunk_match_rate, 4), + "snapshot_match_rate": round(snapshot_match_rate, 4) if snapshot_match_rate is not None else None, + "recommendation": recommendation, + }, + "need_review_commits": need_review, + } + + merged_commits = [c for c in old_commits if c.get("commit_sha") not in current_set] + for sha in current_commits: + if sha in mapped: + c = mapped[sha] + merged_commits.append( + { + "commit_sha": c.commit_sha, + "patch_id": c.patch_id, + "review_round": c.review_round, + "reviewed_at": c.reviewed_at, + "hunk_fingerprints": c.hunk_fingerprints, + "files": commit_files_map.get(sha, []), + "mapping": c.mapping, + } + ) + else: + merged_commits.append( + { + "commit_sha": sha, + "patch_id": compute_patch_id(repo_root, sha), + "review_round": args.review_round, + "reviewed_at": "", + "hunk_fingerprints": parse_hunk_fingerprints(repo_root, sha), + "files": commit_files_map.get(sha, []), + "mapping": {"method": "none", "mapped_from_commit": "", "confidence": 0.0}, + } + ) + + reviewed["commits"] = merged_commits + reviewed.setdefault("review_rounds", []).append(round_record) + save_json(reviewed_path, reviewed) + + print( + json.dumps( + { + "review_target": {"type": target_type, "id": target_id_raw}, + "total_commits": len(current_commits), + "need_review_commits": need_review, + "commit_map_rate": round(commit_map_rate, 4), + "hunk_match_rate": round(hunk_match_rate, 4), + "snapshot_match_rate": round(snapshot_match_rate, 4) if snapshot_match_rate is not None else None, + "recommendation": recommendation, + }, + ensure_ascii=False, + ) + ) + + +if __name__ == "__main__": + main() diff --git a/.claude/skills/code-review/scripts/init_review_workspace.py b/.claude/skills/code-review/scripts/init_review_workspace.py new file mode 100755 index 0000000000..806c3ea481 --- /dev/null +++ b/.claude/skills/code-review/scripts/init_review_workspace.py @@ -0,0 +1,127 @@ +#!/usr/bin/env python3 +import argparse +import json +from 
datetime import datetime, timezone +from pathlib import Path +from typing import Any, Dict, Tuple + + +SCRIPT_DIR = Path(__file__).resolve().parent +SKILL_DIR = SCRIPT_DIR.parent +REF_DIR = SKILL_DIR / "references" + + +def utc_now() -> str: + return datetime.now(timezone.utc).replace(microsecond=0).isoformat() + + +def read_json(path: Path) -> Dict[str, Any]: + return json.loads(path.read_text(encoding="utf-8")) + + +def write_json(path: Path, data: Dict[str, Any]) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(json.dumps(data, ensure_ascii=False, indent=2) + "\n", encoding="utf-8") + + +def ensure_file_from_template(target: Path, template: Path, mutate=None) -> None: + if target.exists(): + return + payload = read_json(template) + if mutate: + mutate(payload) + write_json(target, payload) + + +def sanitize_branch_name(branch_name: str) -> str: + return branch_name.replace("/", "-") + + +def resolve_target(args: argparse.Namespace) -> Tuple[str, str]: + if args.target_type and args.target_id: + target_type = args.target_type + target_id = args.target_id + elif args.pr_number is not None: + target_type = "pr" + target_id = str(args.pr_number) + elif args.branch_name: + target_type = "branch" + target_id = args.branch_name + else: + raise SystemExit("must provide either --target-type/--target-id or --pr-number or --branch-name") + if target_type not in {"pr", "branch"}: + raise SystemExit("target type must be pr or branch") + return target_type, target_id + + +def main() -> None: + parser = argparse.ArgumentParser(description="Initialize code-review workspace for PR or branch.") + parser.add_argument("--repo-root", required=True, help="Repository root path") + parser.add_argument("--target-type", choices=["pr", "branch"], help="Review target type") + parser.add_argument("--target-id", help="Review target id (PR number or branch name)") + parser.add_argument("--pr-number", type=int, help="PR number (legacy compatible)") + parser.add_argument("--branch-name", help="Branch name (legacy compatible)") + parser.add_argument("--base-ref", default="", help="PR base ref") + parser.add_argument("--head-ref", default="", help="PR head ref") + parser.add_argument("--base-sha", default="", help="PR base sha") + parser.add_argument("--head-sha", default="", help="PR head sha") + args = parser.parse_args() + + repo_root = Path(args.repo_root).resolve() + target_type, target_id_raw = resolve_target(args) + target_id_dir = sanitize_branch_name(target_id_raw) if target_type == "branch" else target_id_raw + review_dir = repo_root / "code-review" / f"{target_type}-{target_id_dir}" + comments_dir = review_dir / "comments" + + comments_dir.mkdir(parents=True, exist_ok=True) + + meta_path = review_dir / "meta.json" + reviewed_commits_path = review_dir / "reviewed_commits.json" + review_comments_path = comments_dir / "review-comments.json" + comment_status_path = comments_dir / "comment-status.json" + + def mutate_meta(payload: Dict[str, Any]) -> None: + payload["repo"] = str(repo_root) + payload["review_target"]["type"] = target_type + payload["review_target"]["id"] = target_id_raw + payload["review_target"]["base_ref"] = args.base_ref + payload["review_target"]["head_ref"] = args.head_ref + payload["review_target"]["base_sha"] = args.base_sha + payload["review_target"]["head_sha"] = args.head_sha + payload["generated_at"] = utc_now() + + def create_review_comments_payload() -> Dict[str, Any]: + return { + "version": "1.0", + "source": "github_pr_review_comments" if target_type == 
"pr" else "branch_review_comments", + "fetched_at": utc_now(), + "review_target": {"type": target_type, "id": target_id_raw}, + "viewer_login": "", + "comments": [], + } + + def mutate_comment_status(payload: Dict[str, Any]) -> None: + payload["review_target"]["type"] = target_type + payload["review_target"]["id"] = target_id_raw + payload["generated_at"] = utc_now() + payload["status"] = [] + + def mutate_reviewed_commits(payload: Dict[str, Any]) -> None: + payload["review_rounds"] = [] + payload["commits"] = [] + + ensure_file_from_template(meta_path, REF_DIR / "meta.template.json", mutate_meta) + ensure_file_from_template( + reviewed_commits_path, REF_DIR / "reviewed_commits.template.json", mutate_reviewed_commits + ) + if not review_comments_path.exists(): + write_json(review_comments_path, create_review_comments_payload()) + ensure_file_from_template( + comment_status_path, REF_DIR / "comment-status.template.json", mutate_comment_status + ) + + print(str(review_dir)) + + +if __name__ == "__main__": + main() diff --git a/.claude/skills/code-review/scripts/update_comment_status.py b/.claude/skills/code-review/scripts/update_comment_status.py new file mode 100755 index 0000000000..007490eb76 --- /dev/null +++ b/.claude/skills/code-review/scripts/update_comment_status.py @@ -0,0 +1,195 @@ +#!/usr/bin/env python3 +import argparse +import hashlib +import json +import re +from datetime import datetime, timezone +from pathlib import Path +from typing import Dict, List, Tuple + + +def utc_now() -> str: + return datetime.now(timezone.utc).replace(microsecond=0).isoformat() + + +def stable_hash(text: str) -> str: + return hashlib.sha256(text.encode("utf-8")).hexdigest() + + +def normalize_text(text: str) -> str: + text = re.sub(r"\s+", " ", text.strip()) + return text + + +def infer_flow_status_from_comment(comment: Dict) -> str: + # Single deterministic rule from upstream schema. + return "resolved" if comment.get("thread_resolved") is True else "open" + + +def read_json(path: Path) -> Dict: + if not path.exists(): + return {} + return json.loads(path.read_text(encoding="utf-8")) + + +def write_json(path: Path, payload: Dict) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(json.dumps(payload, ensure_ascii=False, indent=2) + "\n", encoding="utf-8") + + +def default_status(comment: Dict) -> Dict: + body = comment.get("body", "") + snippet = normalize_text(body)[:300] + fingerprint_seed = "|".join( + [ + str(comment.get("path", "")), + str(comment.get("line", 0)), + str(comment.get("side", "RIGHT")), + snippet, + ] + ) + return { + "comment_id": comment.get("comment_id"), + "path": comment.get("path", ""), + "line": comment.get("line", 0), + "side": comment.get("side", "RIGHT"), + "body": body, + "snippet": snippet, + "snippet_fingerprint": stable_hash(fingerprint_seed), + "status_flow": infer_flow_status_from_comment(comment), + # status_tech is owned by model review in later phase. 
+ "status_tech": "not-fixed", + "mapped_finding_id": "", + "notes": "", + } + + +def sanitize_branch_name(branch_name: str) -> str: + return branch_name.replace("/", "-") + + +def resolve_target(args: argparse.Namespace) -> Tuple[str, str]: + if args.target_type and args.target_id: + target_type = args.target_type + target_id = args.target_id + elif args.pr_number is not None: + target_type = "pr" + target_id = str(args.pr_number) + elif args.branch_name: + target_type = "branch" + target_id = args.branch_name + else: + raise SystemExit("must provide either --target-type/--target-id or --pr-number or --branch-name") + if target_type not in {"pr", "branch"}: + raise SystemExit("target type must be pr or branch") + return target_type, target_id + + +def validate_payload(raw: Dict) -> List[Dict]: + if not isinstance(raw, dict): + raise SystemExit("invalid review-comments.json: root must be object") + comments = raw.get("comments") + if not isinstance(comments, list): + raise SystemExit("invalid review-comments.json: `comments` must be list") + return comments + + +def validate_comment(comment: Dict) -> None: + required = ["comment_id", "path", "line", "side", "body", "thread_resolved"] + missing = [k for k in required if k not in comment] + if missing: + raise SystemExit( + "invalid review comment record: missing required fields " + + ",".join(missing) + ) + + +def infer_manual_tech_override(replies: List[Dict], viewer_login: str) -> str: + if not viewer_login: + return "" + # Prefer the latest explicit override from current reviewer account. + for reply in reversed(replies): + if str(reply.get("author", "")).lower() != viewer_login.lower(): + continue + text = normalize_text(str(reply.get("body", ""))).lower() + if "false-positive" in text or "false positive" in text or "假阳性" in text or "误判" in text: + return "false-positive" + if re.search(r"\bfixed\b", text) or "已修复" in text: + return "fixed" + return "" + + +def main() -> None: + parser = argparse.ArgumentParser(description="Build comment-status.json from review comments.") + parser.add_argument("--repo-root", required=True) + parser.add_argument("--target-type", choices=["pr", "branch"]) + parser.add_argument("--target-id") + parser.add_argument("--pr-number", type=int, help="PR number (legacy compatible)") + parser.add_argument("--branch-name", help="Branch name (legacy compatible)") + args = parser.parse_args() + + repo_root = Path(args.repo_root).resolve() + target_type, target_id_raw = resolve_target(args) + target_id_dir = sanitize_branch_name(target_id_raw) if target_type == "branch" else target_id_raw + review_dir = repo_root / "code-review" / f"{target_type}-{target_id_dir}" + comments_path = review_dir / "comments" / "review-comments.json" + status_path = review_dir / "comments" / "comment-status.json" + + comments_payload = read_json(comments_path) + if not comments_payload: + raise SystemExit(f"missing comments file: {comments_path}") + comments = validate_payload(comments_payload) + viewer_login = str(comments_payload.get("viewer_login", "")).strip() + + replies_by_parent: Dict[int, List[Dict]] = {} + root_comments: List[Dict] = [] + for c in comments: + parent = c.get("in_reply_to_id") + if parent is None: + root_comments.append(c) + else: + replies_by_parent.setdefault(parent, []).append(c) + + previous = read_json(status_path) + previous_map = {item.get("comment_id"): item for item in previous.get("status", [])} + + status: List[Dict] = [] + seen_fp = set() + for comment in root_comments: + validate_comment(comment) + cid 
= comment.get("comment_id") + if cid in previous_map: + item = previous_map[cid] + # Preserve manual/model edits on status_tech/notes, always sync flow status from source. + item["status_flow"] = infer_flow_status_from_comment(comment) + else: + item = default_status(comment) + + fp = item.get("snippet_fingerprint", "") + if fp and fp in seen_fp: + item["notes"] = (item.get("notes", "") + " duplicate-fingerprint").strip() + manual_override = infer_manual_tech_override(replies_by_parent.get(cid, []), viewer_login) + if manual_override: + item["status_tech"] = manual_override + item["notes"] = (item.get("notes", "") + f" manual-tech-override:{manual_override}").strip() + seen_fp.add(fp) + status.append(item) + + payload = { + "version": "1.0", + "generated_at": utc_now(), + "review_target": {"type": target_type, "id": target_id_raw}, + "status": status, + } + write_json(status_path, payload) + + print( + json.dumps( + {"review_target": {"type": target_type, "id": target_id_raw}, "status_count": len(status)}, + ensure_ascii=False, + ) + ) + + +if __name__ == "__main__": + main() diff --git a/.claude/skills/commit/SKILL.md b/.claude/skills/commit/SKILL.md new file mode 100644 index 0000000000..046d2af45b --- /dev/null +++ b/.claude/skills/commit/SKILL.md @@ -0,0 +1,40 @@ +--- +name: commit +description: Write commit messages following to Conventional Commits standards. +--- +# Commit Skill + +Generate commit messages that follow the Conventional Commits specification. + +## Format + +``` +type(scope): verb + object + +{why is this change needed, what user/system impact it brings} + +Fixes #{ISSUE_ID} +``` + +## Fields + +- **type**: `feat | fix | docs | style | refactor | perf | test | chore | revert` +- **scope**: Optional. File/module/subsystem, e.g. `api`, `ui`, `auth`, `deps` +- **subject**: <= 50 characters, imperative mood, lowercase first letter, no period +- **body**: Each line <= 72 characters. Explain "what" and "why" +- **footer**: Optional. Link Issue / PR / Breaking Change + +## Steps + +1. Collect information by reading `git diff`. Skip if user already provided context. +2. Determine the commit type based on changes. +3. If changes span multiple scopes, use the core module as scope. +4. Extract added/modified/deleted functions, classes, interfaces for the subject. +5. If breaking change, add `BREAKING CHANGE:` to footer. +6. Present the complete commit message for user confirmation before executing `git commit`. + +## Prohibited + +- No meaningless descriptions like "update code", "fix bug", "wip" +- No subject or body lines exceeding 72 characters +- No issue links in the subject line diff --git a/.claude/skills/compile/SKILL.md b/.claude/skills/compile/SKILL.md new file mode 100644 index 0000000000..26d6bc8405 --- /dev/null +++ b/.claude/skills/compile/SKILL.md @@ -0,0 +1,83 @@ +--- +name: compile +description: Building LoongCollector C++ and Go components. Use when compiling any part of the project. +--- +# Compile Skill + +## How to Compile This Project + +This project has both C++ and Go components. Use the appropriate build method based on what you modified. + +### C++ Build + +**IMPORTANT: All CMake and make commands must run from inside the `build/` directory.** Running from repo root will reconfigure incorrectly. + +**Prerequisites** — Git submodules must be populated before first build: +```bash +git submodule update --init --recursive +``` +Two submodules live under `core/_thirdparty/`: +- `DCGM` — NVIDIA DCGM headers (`dcgm_agent.h` etc.) 
+- `coolbpf` — eBPF framework + +If either is empty, compilation fails with `No such file or directory`. + +#### Build Steps + +```bash +mkdir -p build && cd build +cmake -DCMAKE_BUILD_TYPE=Debug -DLOGTAIL_VERSION=0.0.1 \ + -DCMAKE_EXPORT_COMPILE_COMMANDS=1 \ + -DCMAKE_CXX_FLAGS="-I/opt/rh/devtoolset-9/root/usr/lib/gcc/x86_64-redhat-linux/9/include -I/opt/logtail -I/opt/logtail_spl" \ + -DBUILD_LOGTAIL=ON -DBUILD_LOGTAIL_UT=ON -DWITHOUTGDB=ON -DENABLE_STATIC_LINK_CRT=ON -DWITHSPL=OFF ../core +make -sj$(nproc) +``` + +**Key CMake flags:** +| Flag | Purpose | +|------|---------| +| `BUILD_LOGTAIL` | Build LoongCollector binary. Required. | +| `BUILD_LOGTAIL_UT` | Build unit tests. Enable when modifying tests. | +| `WITHSPL` | SPL support. Set `OFF` unless working on SPL files. | + +#### C++ Unit Tests + +Each test directory under `core/unittest/*/` produces its own executable. + +**Build tests** (from inside `build/`): +```bash +make yaml_util_unittest app_config_unittest safe_queue_unittest -j$(nproc) +``` + +**Run tests** (from inside `build/`): +```bash +./unittest/common/yaml_util_unittest +./unittest/app_config/app_config_unittest +``` + +Tests must run from `build/` because some rely on relative paths for config files and temporary output. + +### Go Plugin Build + +```bash +make plugin_local +``` + +### Docker Build + +```bash +make image +``` + +### Cross-Compilation + +For ARM64: +```bash +make image ARCH=arm64 +``` + +### Common Issues + +- If CMake complains about missing dependencies, install them via `apt` or `yum` +- If linking fails, try `make clean` then rebuild +- For SPL-related builds, change `WITHSPL=OFF` to `WITHSPL=ON` in the cmake command diff --git a/.claude/skills/design-document/SKILL.md b/.claude/skills/design-document/SKILL.md new file mode 100644 index 0000000000..d62cebdfa9 --- /dev/null +++ b/.claude/skills/design-document/SKILL.md @@ -0,0 +1,98 @@ +--- +name: design-document +description: Design document writing conventions. Use when writing or reviewing technical design documents. +--- +# Design Document Conventions + +## 1. Background / Problem Statement + +### 1.1 Background and Pain Points +- Describe current system/module limitations and deficiencies +- List specific scenarios, metrics, or incident cases that triggered this design + +### 1.2 Impact Scope +- Affected modules, microservices, APIs, data stores, third-party dependencies +- Potential impact on performance, reliability, cost, maintainability +- Forward/backward compatibility analysis + +### 1.3 Constraints +- Compliance/security/performance/resource restrictions +- External system or infrastructure dependencies + +--- + +## 2. Design Goals + +### 2.1 Functional Goals +- List Must/Should/Could core capabilities by priority + +### 2.2 Non-Functional Goals +- Performance (throughput, latency, concurrency, resource usage) +- Scalability, maintainability, testability, observability +- Reliability (fault tolerance, HA, degradation, rollback strategies) + +### 2.3 Constraint Goals +- Backward compatibility, API stability +- Security and compliance requirements + +--- + +## 3. 
Technical Design + +### 3.1 Architecture Diagram +- Use Mermaid for high-level component diagrams with data/control flow + +### 3.2 Detailed Flowcharts +- Key business flows, exception flows, retry/compensation with timing and triggers + +### 3.3 Thread/Concurrency Model +- Thread lifecycle, inter-thread communication (locks, condition variables, queues, Actor patterns) +- Sequence diagrams for concurrency interactions + +### 3.4 Core Classes and Data Structures +- Class diagrams showing main classes, interfaces, inheritance/composition relationships +- Key data structure fields, lifecycle, thread-safety strategy + +### 3.5 Key Algorithms or Protocols +- Pseudocode or flow for pub/sub, load balancing, retry backoff, etc. +- State machine / protocol state transition diagrams + +### 3.6 Error Handling and Recovery +- Error classification, exception stack, retry strategies, degradation plans +- Monitoring metrics, alert trigger conditions and levels + +### 3.7 Deployment and Operations +- Configuration items, hot-update mechanisms, canary and rollback strategies +- CI/CD, container, Service Mesh, Kubernetes resource considerations + +--- + +## 4. Unit Testing + +### 4.1 Test Scope and Goals +- Cover core logic, boundary conditions, concurrency scenarios, exception paths + +### 4.2 Test Environment and Tools +- Google Test/Mock version, necessary third-party stubs/fakes + +### 4.3 Test Scenarios and Cases +| Case ID | Scenario | Input | Expected Output/Behavior | Mock Dependencies | +|---------|----------|-------|--------------------------|-------------------| +| TC-01 | Normal single log push | Single valid LogRecord | Returns SUCCESS, buffer size +1 | None | +| TC-02 | Buffer full | capacity=N filled | Throws BufferOverflowException | None | +| TC-03 | Concurrent push | Multi-thread simultaneous push | No data loss, order/final consistency matches design | MutexMock | +| TC-04 | flush clears | M items exist, then flush | Returns M items, buffer size=0 | TimeProviderMock | + +### 4.4 Boundary and Exception Testing +- Empty input, invalid input, extreme capacity, network/disk fault injection + +### 4.5 Performance Benchmarking (optional) +- Throughput, latency, CPU/Memory profile; comparison with baseline + +--- + +## Notes + +- **Do not** include project management info (estimates, schedules, milestones, Gantt charts) +- Code examples must follow team C++ coding standards (see `.claude/skills/project-knowledge/`) +- Test case naming: `__` for CI coverage tracking diff --git a/.claude/skills/e2e/SKILL.md b/.claude/skills/e2e/SKILL.md new file mode 100644 index 0000000000..92a8c4df6b --- /dev/null +++ b/.claude/skills/e2e/SKILL.md @@ -0,0 +1,209 @@ +--- +name: e2e +description: LoongCollector E2E 测试全流程指南:设计、编写、运行和调试。当需要编写新 E2E 测试、运行现有测试、或排查 E2E 测试失败时使用此 skill。 +--- +# LoongCollector E2E 测试指南 + +> 详细步骤模板见 [reference.md](reference.md) | 可复用脚本见 [scripts/](scripts/) + +## 目录 + +1. [概览](#1-概览) +2. [设计测试用例](#2-设计测试用例) +3. [编写测试用例](#3-编写测试用例) +4. [本地运行(docker-compose)](#4-本地运行) +5. [调试](#5-调试) +6. 
[已知陷阱](#6-已知陷阱) + +--- + +## 1 概览 + +基于 **BDD Godog** 框架,通过 `.feature` 文件描述场景,引擎正则匹配步骤函数并传参。 + +``` +test/e2e/ + test_cases// + case.feature # 场景描述 + docker-compose.yaml # 可选,外部依赖服务 + engine/ + steps.go # 所有可用步骤(权威来源) + setup/ control/ trigger/ verify/ cleanup/ +``` + +**环境 tag**:`@host`、`@k8s`、`@docker-compose`(三选一,加 `@e2e`) + +--- + +## 2 设计测试用例 + +编写 feature 文件前,先确定测试矩阵。按以下维度逐项评估是否需要覆盖: + +### 2.1 场景维度清单 + +| 维度 | 典型场景 | 何时需要 | +|------|----------|----------| +| **基础功能** | 单配置、单数据类型端到端 | 必须 | +| **多数据类型** | logs / metrics / traces 分别验证 | 插件支持多类型时 | +| **多配置共存** | 同时加载多个 pipeline 配置 | 涉及端口/资源竞争时 | +| **配置热加载** | 运行中增/删/改配置 | 持续运行的 input 插件 | +| **配置类型变更** | 从 A 类型切换到 B 类型 | 插件支持多协议/格式时 | +| **反压与恢复** | 下游不可达 → 恢复后数据不丢 | flusher 插件 | +| **外部依赖失效** | 依赖服务重启/不可达 | 有外部依赖时 | +| **大数据量** | 高吞吐压力下不 OOM/不丢数据 | 性能敏感路径 | + +### 2.2 设计产出 + +确定要覆盖的场景后,明确每个 Scenario 的: +- **输入**:什么数据、什么格式、多少条 +- **流经路径**:input → processor → flusher 的具体插件 +- **预期输出**:在哪里验证、验证什么 +- **外部依赖**:需要什么辅助服务(OTel Collector、Kafka 等) + +--- + +## 3 编写测试用例 + +### 3.1 目录结构 + +``` +test/e2e/test_cases/my_feature/ +├── case.feature +├── docker-compose.yaml # 外部依赖 +└── otel-collector-config.yaml # 如果用 OTel Collector +``` + +### 3.2 Feature 文件模板 + +```gherkin +@flusher +Feature: my feature name + Brief description + + @e2e @docker-compose + Scenario: TestMyFeatureLogs + Given {docker-compose} environment + Given {my-config} local config as below + """ + enable: true + inputs: + - Type: input_forward + Protocol: OTLP + Endpoint: "0.0.0.0:4320" + flushers: + - Type: flusher_otlp_native + Endpoint: "otel-collector:4317" + """ + When start docker-compose {my_feature} + Then wait {10} seconds + When generate {1} OTLP {logs} via otelgen to endpoint {loongcollectorC:4320}, protocol {grpc} + Then wait {5} seconds + Then otlp collector received at least {1} logs from file {/tmp/otel-export/logs.json} +``` + +### 3.3 强制规则 + +- 配置中必须含 `enable: true` +- **只使用** `test/engine/steps.go` 中已注册的步骤 +- `wait {N} seconds` 是 **Then** 类型,不是 When +- 命名格式:`Test${功能名}${场景描述}` +- **不要**在持续运行插件的配置中使用 `global.ExcutionTimeout`(见 §6.1) + +### 3.4 扩展步骤 + +如需新步骤,参考 [reference.md §扩展步骤](reference.md) 中的开发和注册流程。 + +--- + +## 4 本地运行 + +### 4.1 前置条件 + +```bash +docker --version && docker compose version +``` + +如修改了 C++ 代码,需重新编译并更新镜像。两种方式: + +**方式一:完整构建**(慢,但保证一致) +```bash +make e2e_image # 从源码构建完整 Docker 镜像 aliyun/loongcollector:0.0.1 +``` + +**方式二:增量更新**(快,适合迭代调试) +```bash +cd build && make -sj$(nproc) && cd .. +# 替换镜像中的二进制 +docker create --name tmp-lc aliyun/loongcollector:0.0.1 +docker cp build/loongcollector tmp-lc:/usr/local/loongcollector/loongcollector +docker commit tmp-lc aliyun/loongcollector:0.0.1 +docker rm tmp-lc +``` + +### 4.2 运行 + +```bash +cd test/e2e + +# 运行整个测试用例(所有 Scenario) +TEST_CASE=flusher_otlp_native go test -v -run "TestE2EOnDockerCompose$" \ + -timeout 600s -count=1 ./... + +# 只运行指定 Scenario +TEST_CASE=flusher_otlp_native go test -v \ + -run "TestE2EOnDockerCompose/TestFlusherOTLPNativeLogs$" \ + -timeout 600s -count=1 ./... +``` + +### 4.3 清理(测试失败后必做) + +可以直接运行脚本 `bash .cursor/skills/e2e/scripts/e2e-cleanup.sh`,或手动执行: + +```bash +docker rm -f $(docker ps -aq) 2>/dev/null +docker network prune -f +rm -rf test/e2e/config test/e2e/onetime_pipeline_config +sudo rm -rf test/e2e/report +rm -f test/e2e/test_cases//testcase-compose.yaml +``` + +--- + +## 5 调试 + +```bash +# 1. 查看容器日志 +docker ps | grep loongcollectorC +docker exec cat /usr/local/loongcollector/log/loongcollector.LOG + +# 2. 
检查配置是否加载 +docker exec ls /usr/local/loongcollector/conf/continuous_pipeline_config/local/ + +# 3. 检查端口是否监听 +docker exec ss -tlnp | grep + +# 4. 手动复现 compose 环境 +cd test/e2e/test_cases/ +docker compose -f testcase-compose.yaml up -d +docker compose -f testcase-compose.yaml logs -f loongcollectorC +``` + +--- + +## 6 已知陷阱 + +### 6.1 ExcutionTimeout 使配置变为一次性 + +**绝对不要**在 `input_forward`、`input_file` 等持续插件的配置中使用 `global.ExcutionTimeout`。 + +它会使 `IsOnetime()` 返回 true,导致 `IsValidNativeInputPlugin(name, true)` 在 onetime 注册表中查找,而大部分 input 只注册了 continuous,结果报 `unsupported input plugin`。 + +详见 `.cursor/rules/project-knowledge/config-pitfalls.mdc`。 + +### 6.2 FlusherFile 必须是文件 + +e2e 模板将 `report/default_flusher.json` bind-mount 到容器。若宿主机路径不存在,Docker 会创建为**目录**。已在 `BootController.Start()` 中自动处理。 + +### 6.3 测试间残留 + +多 Scenario 共享进程,`Clean()` 会删除 config/report。异常退出后手动清理(§4.3)。 diff --git a/.claude/skills/e2e/reference.md b/.claude/skills/e2e/reference.md new file mode 100644 index 0000000000..d1c662ae35 --- /dev/null +++ b/.claude/skills/e2e/reference.md @@ -0,0 +1,134 @@ +# E2E 测试详细参考 + +## 可用步骤速查 + +> 权威来源:`test/engine/steps.go` + +### Given(环境准备) + +| 步骤模板 | 说明 | +|----------|------| +| `{docker-compose} environment` | 初始化 docker-compose 环境 | +| `{host} environment` | 初始化主机环境 | +| `{daemonset} environment` | 初始化 K8s 环境 | +| `{name} local config as below` | 写入持续采集配置 | +| `{name} onetime pipeline local config as below` | 写入一次性采集配置 | +| `subcribe data from {sls} with config` | 订阅 SLS 数据源 | +| `loongcollector depends on containers {name}` | 设置容器依赖 | +| `loongcollector container mount {src} to {dst}` | 挂载卷 | +| `loongcollector expose port {host} to {container}` | 暴露端口 | +| `docker-compose boot type {type}` | 设置 boot 类型 | +| `mkdir {path}` | 创建目录 | + +### When(触发动作) + +| 步骤模板 | 说明 | +|----------|------| +| `start docker-compose {case_name}` | 启动 docker-compose 环境 | +| `begin trigger` | 标记触发开始时间(生成日志前必须调用) | +| `generate {N} regex logs to file {path}, with interval {M}ms` | 生成正则日志 | +| `generate {N} json logs to file {path}, with interval {M}ms` | 生成 JSON 日志 | +| `generate {N} apsara logs to file {path}, with interval {M}ms` | 生成 Apsara 日志 | +| `generate {N} OTLP {logs\|metrics\|traces} via otelgen to endpoint {ep}, protocol {grpc\|http}` | 生成 OTLP 数据 | +| `generate {N} http logs, with interval {M}ms, url: {url}, method: {method}, body:` | 生成 HTTP 日志 | +| `execute {N} commands {cmd} in sequence` | 顺序执行命令 | +| `execute {N} commands {cmd} in parallel` | 并行执行命令 | +| `create the shell script file {name} with the following content` | 创建 shell 脚本 | +| `execute {N} the shell script file {name} in parallel` | 并行执行 shell 脚本 | +| `restart agent` | 重启 Agent | +| `force restart agent` | 强制重启 Agent | + +### Then(结果验证) + +| 步骤模板 | 说明 | +|----------|------| +| `there is {N} logs` | 精确验证日志数(上限 100) | +| `there is at least {N} logs` | 最少日志数验证 | +| `there is less than {N} logs` | 最多日志数验证 | +| `the log fields match kv` | KV 字段匹配(文档内容跟 `"""..."""`) | +| `the log fields match as below` | 日志字段模式匹配 | +| `the log tags match kv` | Tag KV 匹配 | +| `the log is in order` | 日志顺序验证 | +| `wait {N} seconds` | 等待 N 秒 | +| `otlp collector received at least {N} (logs\|metrics\|traces) from file {path}` | OTel Collector 数据验证 | + +> 注意:日志数量验证上限 100。超过 100 用 `When query through` + `Then the log fields match kv` 方式。 + +--- + +## 扩展步骤 + +### 1. 
编写函数 + +在 `test/engine/` 对应子目录下: + +```go +func MyVerification(ctx context.Context, expected int) (context.Context, error) { + // 实现逻辑 + return ctx, nil +} +``` + +签名要求:第一个参数 `context.Context`,返回 `(context.Context, error)`。 + +### 2. 注册 + +在 `test/engine/steps.go` 中: + +```go +ctx.Then(`^my verification expects \{(\d+)\}$`, verify.MyVerification) +``` + +### 3. 使用 + +```gherkin +Then my verification expects {42} +``` + +--- + +## docker-compose.yaml 示例 + +### OTel Collector(OTLP 测试用) + +```yaml +services: + otel-collector: + image: otel/opentelemetry-collector-contrib:latest + hostname: otel-collector + user: "0:0" + ports: + - "4317" + volumes: + - ./otel-collector-config.yaml:/etc/otelcol-contrib/config.yaml + - ./otel-export:/tmp/otel-export + healthcheck: + test: ["CMD", "wget", "--spider", "-q", "http://localhost:13133/"] + interval: 5s + timeout: 3s + retries: 5 + start_period: 10s +``` + +--- + +## eBPF 进程安全测试示例 + +```gherkin +@e2e @host @ebpf_input +Scenario: TestEBPFProcessSecurityByNormalStart + Given {host} environment + Given subcribe data from {sls} with config + """ + """ + Given {ebpf_process_security_default} local config as below + """ + enable: true + inputs: + - Type: input_process_security + """ + When begin trigger + When execute {1} commands {/bin/echo 1} in sequence + When query through {* | select * from e2e where call_name = 'execve' and binary = '/bin/echo' and arguments = '1'} + Then there is {1} logs +``` diff --git a/.claude/skills/e2e/scripts/e2e-cleanup.sh b/.claude/skills/e2e/scripts/e2e-cleanup.sh new file mode 100755 index 0000000000..65502faff3 --- /dev/null +++ b/.claude/skills/e2e/scripts/e2e-cleanup.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash +# E2E 测试环境清理脚本 +# 用法: bash .cursor/skills/e2e/scripts/e2e-cleanup.sh [case_name] +set -euo pipefail + +REPO_ROOT="$(cd "$(dirname "$0")/../../.." && pwd)" +E2E_DIR="$REPO_ROOT/test/e2e" +CASE_NAME="${1:-}" + +echo "==> 停止并删除所有 Docker 容器..." +docker rm -f $(docker ps -aq) 2>/dev/null || true + +echo "==> 清理 Docker 网络..." +docker network prune -f 2>/dev/null || true + +echo "==> 清理运行时目录..." +rm -rf "$E2E_DIR/config" "$E2E_DIR/onetime_pipeline_config" +sudo rm -rf "$E2E_DIR/report" 2>/dev/null || rm -rf "$E2E_DIR/report" 2>/dev/null || true + +if [[ -n "$CASE_NAME" ]]; then + CASE_DIR="$E2E_DIR/test_cases/$CASE_NAME" + if [[ -d "$CASE_DIR" ]]; then + echo "==> 清理测试用例 $CASE_NAME..." + rm -f "$CASE_DIR/testcase-compose.yaml" + rm -f "$CASE_DIR/otel-export/"*.json 2>/dev/null || true + fi +else + echo "==> 清理所有测试用例的 testcase-compose.yaml..." + find "$E2E_DIR/test_cases" -name "testcase-compose.yaml" -delete 2>/dev/null || true +fi + +echo "==> 清理完成" diff --git a/.claude/skills/mermaid/SKILL.md b/.claude/skills/mermaid/SKILL.md new file mode 100644 index 0000000000..3cccf610d8 --- /dev/null +++ b/.claude/skills/mermaid/SKILL.md @@ -0,0 +1,42 @@ +--- +name: mermaid +description: Mermaid diagram conventions. Use whenever diagrams are needed in documentation or code review. +--- +# Mermaid Diagram Conventions + +## Rules for Creating Mermaid Diagrams + +1. **Use Correct Fenced Code Block**: Always use ````mermaid ... ```` + +2. **Stick to Well-Supported Diagram Types**: + - `graph` (flowcharts, `TD` preferred for readability) + - `sequenceDiagram` + - `classDiagram` + - `stateDiagram-v2` (prefer v2) + - `erDiagram` + - `pie`, `gantt`, `mindmap` (basic only) + - Avoid very new or uncommon types + +3. **Simple Standard Syntax**: + - **Node IDs**: Use simple alphanumeric IDs (`node1`, `processA`). No spaces or special chars. 
+ - **Labels**: **Use quotes** for labels with spaces/punctuation/keywords. + - Good: `A["User Input"] --> B["Validate Data"];` + - Bad: `A[User Input] --> B[Validate Data];` + - Use standard arrows (`-->`, `---`, `==>`) + - Comments: `%%` + +4. **Mindmap (GitHub compatible)**: + - Use basic indentation structure only + - NO `::icon()` syntax (causes rendering errors) + - Each node on its own line with correct indentation + +5. **Prefer Vertical Layouts**: `graph TD` or `graph TB` for flowcharts (easier to read in Markdown) + +6. **Let GitHub Handle Styling**: + - DO NOT set themes (`%%{init: ...}`) + - DO NOT use `classDef` or `style` + - GitHub auto-adapts to light/dark mode + +7. **Keep Diagrams Focused**: Break complex diagrams into multiple simpler ones + +8. **Always Review Automated Edits**: Tools may break Mermaid syntax, especially with indentation-heavy formats like mindmap diff --git a/.claude/skills/omc-reference/SKILL.md b/.claude/skills/omc-reference/SKILL.md new file mode 100644 index 0000000000..cc02915c07 --- /dev/null +++ b/.claude/skills/omc-reference/SKILL.md @@ -0,0 +1,141 @@ +--- +name: omc-reference +description: OMC agent catalog, available tools, team pipeline routing, commit protocol, and skills registry. Auto-loads when delegating to agents, using OMC tools, orchestrating teams, making commits, or invoking skills. +user-invocable: false +--- + +# OMC Reference + +Use this built-in reference when you need detailed OMC catalog information that does not need to live in every `CLAUDE.md` session. + +## Agent Catalog + +Prefix: `oh-my-claudecode:`. See `agents/*.md` for full prompts. + +- `explore` (haiku) — fast codebase search and mapping +- `analyst` (opus) — requirements clarity and hidden constraints +- `planner` (opus) — sequencing and execution plans +- `architect` (opus) — system design, boundaries, and long-horizon tradeoffs +- `debugger` (sonnet) — root-cause analysis and failure diagnosis +- `executor` (sonnet) — implementation and refactoring +- `verifier` (sonnet) — completion evidence and validation +- `tracer` (sonnet) — trace gathering and evidence capture +- `security-reviewer` (sonnet) — trust boundaries and vulnerabilities +- `code-reviewer` (opus) — comprehensive code review +- `test-engineer` (sonnet) — testing strategy and regression coverage +- `designer` (sonnet) — UX and interaction design +- `writer` (haiku) — documentation and concise content work +- `qa-tester` (sonnet) — runtime/manual validation +- `scientist` (sonnet) — data analysis and statistical reasoning +- `document-specialist` (sonnet) — SDK/API/framework documentation lookup +- `git-master` (sonnet) — commit strategy and history hygiene +- `code-simplifier` (opus) — behavior-preserving simplification +- `critic` (opus) — plan/design challenge and review + +## Model Routing + +- `haiku` — quick lookups, lightweight inspection, narrow docs work +- `sonnet` — standard implementation, debugging, and review +- `opus` — architecture, deep analysis, consensus planning, and high-risk review + +## Tools Reference + +### External AI / orchestration +- `/team N:executor "task"` +- `omc team N:codex|gemini "..."` +- `omc ask ` +- `/ccg` + +### OMC state +- `state_read`, `state_write`, `state_clear`, `state_list_active`, `state_get_status` + +### Team runtime +- `TeamCreate`, `TeamDelete`, `SendMessage`, `TaskCreate`, `TaskList`, `TaskGet`, `TaskUpdate` + +### Notepad +- `notepad_read`, `notepad_write_priority`, `notepad_write_working`, `notepad_write_manual` + +### Project memory +- 
`project_memory_read`, `project_memory_write`, `project_memory_add_note`, `project_memory_add_directive` + +### Code intelligence +- LSP: `lsp_hover`, `lsp_goto_definition`, `lsp_find_references`, `lsp_diagnostics`, and related helpers +- AST: `ast_grep_search`, `ast_grep_replace` +- Utility: `python_repl` + +## Skills Registry + +Invoke built-in workflows via `/oh-my-claudecode:`. + +### Workflow skills +- `autopilot` — full autonomous execution from idea to working code +- `ralph` — persistence loop until completion with verification +- `ultrawork` — high-throughput parallel execution +- `visual-verdict` — structured visual QA verdicts +- `team` — coordinated team orchestration +- `ccg` — Codex + Gemini + Claude synthesis lane +- `ultraqa` — QA cycle: test, verify, fix, repeat +- `omc-plan` — planning workflow and `/plan`-safe alias +- `ralplan` — consensus planning workflow +- `sciomc` — science/research workflow +- `external-context` — external docs/research workflow +- `deepinit` — hierarchical AGENTS.md generation +- `deep-interview` — Socratic ambiguity-gated requirements workflow +- `ai-slop-cleaner` — regression-safe cleanup workflow + +### Utility skills +- `ask`, `cancel`, `note`, `learner`, `omc-setup`, `mcp-setup`, `hud`, `omc-doctor`, `trace`, `release`, `project-session-manager`, `skill`, `writer-memory`, `configure-notifications` + +### Keyword triggers kept compact in CLAUDE.md +- `"autopilot"→autopilot` +- `"ralph"→ralph` +- `"ulw"→ultrawork` +- `"ccg"→ccg` +- `"ralplan"→ralplan` +- `"deep interview"→deep-interview` +- `"deslop" / "anti-slop"→ai-slop-cleaner` +- `"deep-analyze"→analysis mode` +- `"tdd"→TDD mode` +- `"deepsearch"→codebase search` +- `"ultrathink"→deep reasoning` +- `"cancelomc"→cancel` +- Team orchestration is explicit via `/team`. + +## Team Pipeline + +Stages: `team-plan` → `team-prd` → `team-exec` → `team-verify` → `team-fix` (loop). + +- Use `team-fix` for bounded remediation loops. +- `team ralph` links the team pipeline with Ralph-style sequential verification. +- Prefer team mode when independent parallel lanes justify the coordination overhead. + +## Commit Protocol + +Use git trailers to preserve decision context in every commit message. + +### Format +- Intent line first: why the change was made +- Optional body with context and rationale +- Structured trailers when applicable + +### Common trailers +- `Constraint:` active constraint shaping the decision +- `Rejected:` alternative considered | reason for rejection +- `Directive:` forward-looking warning or instruction +- `Confidence:` `high` | `medium` | `low` +- `Scope-risk:` `narrow` | `moderate` | `broad` +- `Not-tested:` known verification gap + +### Example +```text +feat(docs): reduce always-loaded OMC instruction footprint + +Move reference-only orchestration content into a native Claude skill so +session-start guidance stays small while detailed OMC reference remains available. 
+ +Constraint: Preserve CLAUDE.md marker-based installation flow +Rejected: Sync all built-in skills in legacy install | broader behavior change than issue requires +Confidence: high +Scope-risk: narrow +Not-tested: End-to-end plugin marketplace install in a fresh Claude profile +``` diff --git a/.claude/skills/project-knowledge/SKILL.md b/.claude/skills/project-knowledge/SKILL.md new file mode 100644 index 0000000000..44127422e6 --- /dev/null +++ b/.claude/skills/project-knowledge/SKILL.md @@ -0,0 +1,220 @@ +--- +name: project-knowledge +description: LoongCollector project knowledge: architecture, terminology, codebase map, and coding standards (C++/Go). +--- +# LoongCollector Project Knowledge + +## Architecture Overview + +The LoongCollector architecture is based on a plugin system with the following key components: + +1. **Core Application**: Main entry point in `core/logtail.cpp`, initializes `Application` class in `core/application/Application.cpp`. Follows singleton pattern, manages overall lifecycle. + +2. **Plugin System**: Supports plugins for data collection, processing, and flushing: + - **Inputs**: Collect data from various sources (files, network, system metrics, etc.) + - **Processors**: Transform and process collected data + - **Flushers**: Send processed data to various backends + +3. **Pipeline Management**: Collection pipelines managed by `CollectionPipelineManager` handle data flow from inputs through processors to flushers. + +4. **Configuration**: Supports both local and remote configuration management with watchers that monitor for configuration changes. + +5. **Queuing System**: Implements various queue types including bounded queues, circular queues, and exactly-once delivery queues for reliable data transmission. + +6. **Monitoring**: Built-in monitoring and metrics collection for tracking the collector's own performance and health. + +## Project Structure + +``` +core/ # Core C++ code + plugin/ # Plugin system + input/ # Data collection input plugins + processor/ # Data processing plugins + flusher/ # Data output plugins (SLS, file, etc.) + collection_pipeline/ # Main pipeline flow (queue, batch, serialization) + config/ # Configuration (loading, providers, feedback) + provider/ # Config providers (Enterprise, Legacy) + common/ # Common utilities, data structures, network, string, crypto + monitor/ # Monitoring, metrics collection, alerting + logger/ # Logging system + checkpoint/ # Checkpoint, state management + app_config/ # Global configuration + models/ # Core data structures (events, logs, metrics) + parser/ # Log parsers + task_pipeline/ # Task scheduling + go_pipeline/ # Go plugin integration + ebpf/ # eBPF collection and plugins + host_monitor/ # Host-level monitoring + shennong/ # Shennong metrics + prometheus/ # Prometheus collection + file_server/ # File collection and management + container_manager/ # Container environment management + application/ # Main application entry + protobuf/ # Protobuf protocol definitions + metadata/ # K8s and other metadata collection + constants/ # Constants + tools/ # Internal utility scripts + unittest/ # Unit tests + legacy_test/ # Historical test cases + +pkg/ # Go packages + helper/ # Go helper functions + containercenter/ # Go container-related functions + +plugin_main/ # Plugin main entry +pluginmanager/ # Go plugin manager (lifecycle, registration) +plugins/ # Go plugin packages + input/ # Go input plugins (docker, etc.) 
+ processor/ # Go processor plugins + flusher/ # Go flusher plugins + aggregator/ # Go aggregator plugins + extension/ # Go extension plugins + all/ # Plugin registration and init + test/ # Go plugin tests + +test/ # Integration tests +e2e/ # E2E test cases (open source Go plugins) +e2e_enterprise/ # E2E enterprise test cases (host + K8s) +docs/ # Project documentation +scripts/ # Build, deploy, test scripts +docker/ # Docker-related files +rpm/ # RPM packaging +external/ # External dependencies +``` + +## Key Dependencies + +### Header-Only Libraries +- `spdlog` - Logging +- `rapidjson` - JSON parsing + +### Compiled Libraries +- **Testing**: `gtest`, `gmock` +- **Serialization**: `protobuf` +- **Regex**: `re2` +- **Hash**: `cityhash` +- **Config**: `jsoncpp`, `yamlcpp` +- **Compression**: `lz4`, `zlib`, `zstd` +- **Network**: `curl`, `ssl`, `crypto` +- **System**: `boost`, `gflags`, `leveldb`, `uuid` +- **Memory**: `tcmalloc` (optional) + +### Tech Stack +- C++ (main implementation, C++17/20) +- Protobuf (data serialization) +- eBPF (kernel-level data collection) +- Prometheus (metrics collection) +- Go (plugin adaptation) +- Shell/Python (build and test scripts) + +## Terminology Glossary + +| Term | Description | +|------|-------------| +| LoongCollector | The observability data collection agent (formerly iLogtail) | +| Pipeline | A data processing chain: Input -> Processor(s) -> Flusher | +| Plugin | A modular component that performs specific data operations | +| Input Plugin | Collects data from a source (file, network, metric, etc.) | +| Processor Plugin | Transforms data (parse, filter, enrich, etc.) | +| Flusher Plugin | Sends data to a destination (SLS, stdout, Prometheus, etc.) | +| Config | Collection configuration defining pipeline behavior | +| Checkpoint | Persistent state tracking for exactly-once delivery | +| Runner | Execution wrapper for a specific plugin instance | +| Queue | Data buffer between pipeline stages | +| Batch | Group of events processed/sent together | +| SLS | Alibaba Cloud Simple Log Service | +| eBPF | Extended Berkeley Packet Filter (kernel tracing) | +| SPL | Structured Processing Language | + +## Codebase Map + +### Key Entry Points and Core Flows + +| Path | Purpose | +|------|---------| +| `core/logtail.cpp` | Main entry point | +| `core/application/Application.cpp` | Application singleton, lifecycle management | +| `core/collection_pipeline/CollectionPipelineManager.cpp` | Pipeline lifecycle | +| `core/collection_pipeline/CollectionPipeline.cpp` | Pipeline execution | +| `core/runner/ProcessorRunner.cpp` | Processor execution | +| `core/runner/FlusherRunner.cpp` | Flusher execution | +| `core/config/watcher/PipelineConfigWatcher.cpp` | Config change detection | +| `core/file_server/FileServer.cpp` | File collection management | +| `core/file_server/checkpoint/CheckpointManagerV2.cpp` | Exactly-once checkpoint | + +### Invariant Rules + +- **Lifecycle**: All plugins follow Init -> Start -> Stop -> Close lifecycle +- **Resource Release**: Every thread/future/queue must be properly cleaned up on stop +- **Config**: Environment variables are case-insensitive with default fallbacks +- **Queue**: Bounded queue with backpressure; pop on disabled queue should not hang +- **Hot Reload**: After config change, system must return to consistent "collect+process+send" state + +### Common Patterns + +- RAII for resource management +- Smart pointers over raw pointers +- Singleton pattern for managers (Application, AlarmManager, WriteMetrics) +- Thread-safe queues 
with condition variables +- Plugin registration via static initialization + +## C++ Coding Standards + +### Naming +- **PascalCase** for class names, global functions, public methods +- **camelCase** for variable names and private methods +- **SCREAMING_SNAKE_CASE** for macros and constants +- **m** prefix for member variables (e.g., `mUserId`) +- **k** prefix for constants (e.g., `kMaxSendBufferSize`) + +### Modern C++ +- Prefer C++17/20 features (auto, range-based loops, smart pointers) +- Use `std::unique_ptr` / `std::shared_ptr` for memory management +- Prefer `std::optional`, `std::variant`, `std::any` for type-safe alternatives +- Use `constexpr` and `const` for compile-time computations +- Use `std::string_view` for read-only string operations + +### Error Handling +- Use exceptions for error handling (`std::runtime_error`, `std::invalid_argument`) +- RAII for resource management to avoid memory leaks +- Validate inputs at function boundaries +- Log errors using spdlog + +### Performance +- Avoid unnecessary heap allocations; prefer stack-based objects +- Use `std::move` for move semantics +- Optimize loops with `` (e.g., `std::sort`, `std::for_each`) +- Use `std::array` or `std::vector` over raw arrays + +### Security +- Avoid C-style casts; use `static_cast`, `dynamic_cast`, `reinterpret_cast` +- Enforce const-correctness +- Avoid global variables; use singletons sparingly +- Use `enum class` for strongly typed enumerations + +### Testing +- Unit tests using Google Test (GTest) / Google Mock +- Integration tests for system components + +## Go Coding Standards + +### Naming +- **PascalCase** for exported types and functions +- **camelCase** for unexported types and functions +- **snake_case** for variables and constants +- Package names use lowercase + +### Error Handling +- Return errors explicitly, do not panic +- Use `fmt.Errorf` with `%w` for error wrapping +- Check errors at every call site + +### Concurrency +- Use goroutines for concurrent operations +- Use channels or sync primitives for communication +- Avoid goroutine leaks; always provide exit paths + +### Testing +- Use standard `testing` package +- Table-driven tests for function coverage +- Integration tests via E2E framework diff --git a/.claude/skills/review-standards/SKILL.md b/.claude/skills/review-standards/SKILL.md new file mode 100644 index 0000000000..3fafef5dd8 --- /dev/null +++ b/.claude/skills/review-standards/SKILL.md @@ -0,0 +1,255 @@ +--- +name: review-standards +description: Code review behavioral standards. Reference during code review to ensure consistent quality checks from a QA perspective. +--- +# Code Review Rule + +你是一个高级代码审查助手,审查代码时要从QA角度仔细检查问题,以批判的眼光看待代码,以发现潜在问题为目的。 + +为了避免得到假阳性的检查结果,请注意: + +* 分析具体代码片段时要包含足够上下文,不要仅基于局部信息做出判断。 + +* 避免基于记忆进行代码分析,必须基于实际查看的代码。 + +* 在指出问题前,先理解业务逻辑的完整流程,考虑代码设计的合理性和必要性。 + + +请按下面的步骤进行Code Review + +## 1. 获取评审内容,无需输出 + +用户会提供分支或PR信息,请根据以下指示获取评审文件列表和内容 + +1. 如果提供两个分支名称(例如 "fork/feature" 和 "main")。获取评审文件列表和内容的方法是: + + * 运行 `git branch` 和 `git remote` 了解分支是origin分支还是其他远程分支。 + + * 需要使用 `git fetch` 检出分支(如 `fork/feature`、`origin/main`),确保获取最新代码。 + + * 运行 `git checkout fork/feature && git pull` 将内容拉取到本地,以便review时查询完整上下文。 + + * 执行 `git diff --name-only --diff-filter=M origin/main...fork/feature` 来列出被修改的文件。 + + * 对于上述列表中的每个文件,运行 `git diff --quiet origin/main...fork/feature -- `获取变更内容。 + +2. 如果仅提供一个分支名称(例如 "fork/feature"),那么另一分支名称就是"main",然后和提供两个分支名称一样处理。 + +3. 如果提供的是一个PR号,那么两个分支分别为 "origin/pull/{PR号}/head" 和 "main",然后和提供两个分支名称一样处理。 + + +## 2. 
高层次摘要,需要输出 + +对评审内容用 2–3 句话概括描述: + +* **产品影响**:这项变更对用户或客户带来了什么价值? + +* **工程实现方式**:使用了哪些关键数据结构、算法、模式、框架或最佳实践? + +## 3. PR代码理解 + +请以代码作者视角向Reviewer解释当前PR想干什么。必要时使用mermaid画出关键逻辑、数据结构和交互时序图。 +首先,从全局视角梳理这个PR涉及到数据采集、处理、发送的整体流程(不涉及的部分无需说明),关键组件数据流怎么串联的,用的什么数据结构。 +然后,说明这个PR想扩展什么,应该怎么扩展。 +最后,详解PR实际怎么做的,包括解析、错误处理、重试等关键逻辑。 + +## 4. 牢记评估标准,无需输出 + +针对每个有变更的文件及其差异块,评估这些行是否符合以下方面的要求: + +1. **业务逻辑深度理解** + + * 分析组件的实际作用和预期行为 + + * 识别可能导致功能失效的边缘情况 + + * 质疑现有的设计是否满足业务目标 + + * 考虑故障模式和容错机制 + +2. **设计与架构** + + * 模块职责:确保单一职责原则,检查设计是否符合 SOLID 原则,将可测试性作为重要标准 + + * 依赖管理:识别组件间的调用链和依赖关系,检查是否存在循环依赖或隐含依赖 + + * 分析故障传播路径,确保故障的上下文信息正确 + + * Input和Flusher采用总线Runner模式,配置通过注册应用,线程数不随配置数量增加 + + * 自监控涉及重启的功能应该由LogtailMonitor统一管理 + +3. **正确性与安全** + + * 边界检查:数组/容器访问前验证索引,如`if (index < container.size())` + + * 空指针防护:公共方法必须检查指针参数,如`if (!ptr) return false;` + + * 类型安全:JSON解析先验证类型,如`if (json.isString()) value = json.asString();` + + * 资源管理:使用RAII和智能指针,避免内存泄漏,如`std::unique_ptr`、`std::shared_ptr`。优先使用现成的RAII封装,如需自定义清理逻辑可使用unique\_ptr + lambda构建。 + + * 错误处理:外部输入防御式编程,包括读配置(如`std::ios_base::failure`、`std::filesystem::filesystem_error`、`boost::regex_error`)、文件、数据库、网络,必须有异常处理和完备日志 + + * 错误传播:检查错误是否正确传播到上层,避免静默失败 + + * 外部接口调用容错:对外部API调用、网络请求等失败场景,必须实现指数退避重试机制,避免因瞬时故障导致外部接口过载。 + + * 类型转换:检查类型转换的安全性,特别是缩窄转换(narrowing conversion) + +4. **性能与效率** + + * 内存优化: + + * 容器预分配大小,如`vector.reserve(expected_size)` + + * 避免不必要拷贝,优先移动语义和引用传递,如`map.emplace(args)`,`auto& val = map[key]`。 + + * 字符串操作优先使用 `StringView`数据结构避免复制,优先使用core/common/StringTools.h已有的工具函数如,字符串切分`StringViewSplitter`,字符串修剪`Trim`,字符串解析`StringTo`。 + + * 限制容器最大大小防止内存爆炸,如`if (queue.size() > MAX_QUEUE_SIZE)` + + * 计算效率: + + * 缓存重复计算结果,避免热点路径中的重复工作,例如通过sysconf获取的值仅需在初始化时获取一次。 + + * 确保已使用业界最优的数据结构和算法,尽量避免非线性性能退化 + + * 批处理操作减少系统、网络调用开销,如批量发送 + + * 热路径性能审查: + + * 特别关注循环内部、事件处理循环中的性能变化 + + * 对比新旧实现的时间复杂度差异 + + * 质疑任何在高频路径中引入额外数据结构查找的变更 + + * 主机监控指标:添加指标应该在SystemInterface中同时添加缓存,确保同一时间点获取的指标一致。 + +5. **并发与线程安全** + + * 锁策略:最小化锁范围,优先无锁数据结构如`boost::concurrent_flat_map` + + * 死锁预防:多锁时统一加锁顺序,避免嵌套锁 + + * 线程复用:使用线程池而非频繁创建线程 + + * 事件驱动:IO操作优先考虑事件驱动而非多线程 + + * 数据竞争:共享数据必须同步保护,原子操作优于锁 + + * 异步数据高效传递,例如优先使用epoll的`event.data.ptr`,curl的`CURLOPT_PRIVATE`直接携带上下文数据。 + + * 新增线程:应使用`std::future`、`std::mutex`、`std::condition_variable`配套模式,以便快速停止,参考core/common/timer/Timer.h。 + +6. **动态链接库** + + * 使用core/common/DynamicLibHelper.cpp中定义的工具加载动态链接库,避免直接依赖导致的兼容性问题。 + + * 动态链接库中的代码中不允许自己分配线程资源,必须由主程序控制。 + + * 动态链接库中的内存申请和释放方法必须配对,不允许跨主程序和动态链接库进行内存申请和释放。 + +7. **可读性与规范** + + * 标准: + + * 复用C++17标准库,避免重复轮子 + + * 尽可能使用`constexpr`、`auto`、范围for循环(`for (auto& elem : container) {}`) + + * 使用`std::optional`安全地表示可能为空的返回值,使用`std::variant`处理几种固定不同类型的值。 + + * 调用linter工具,发现违反规范的新增代码 + + * 命名约定: + + * 类名PascalCase:`InputContainerStdio` + + * 成员变量m前缀:`mProject`, `mLogstore` + + * 常量变量k前缀:`kMaxSendLogGroupSize` + + * 代码组织: + + * 保持控制流简洁,降低圈复杂度,抽象重复逻辑(DRY原则),将密集逻辑重构为可测试的辅助方法 + + * 彻底移除无用或不可达代码,包括注释掉的废弃代码。 + + * 魔法数字抽成常量或gflag。 + + * 优先使用结构体数组,而不是平行的多个数组。 + + * 变量和方法应该声明在header文件中,实现在cpp文件,除非是模版类或者有强烈的inline需要。 + + * 避免全局变量,应该使用类、命名空间进行范围限定。 + + * 注释质量: + + * 解释"为什么"而非"什么",复杂算法必须注释 + + * 对代码修改附近的注释检查注释是否需要同步修改 + + * 禁止使用不安全的C函数,例如`strcpy`, `strcat`, `strcmp`, `strlen`, `strchr`, `strrchr`, `strstr`, `sprintf`, `strtok`, `sscanf`, `strspn`, `strcspn`, `strpbrk`, `strncat`, `strncmp`, `strncpy`, `strcoll`, `strxfrm`, `strdup`, `strndup` + +8. 
**稳定性与监控** + + * 容量控制:所有缓冲区/队列设置上限,如`INT32_FLAG(max_send_log_group_size)` + + * 可观测性:缓存大小、延时、丢弃数等关键指标记录,异常情况使用日志记录,导致延时、丢数据的关键异常使用SendAlarm上报远程服务器。同时检查`LOG_INFO`/`LOG_WARNING`/`LOG_ERROR`日志是否有高频调用刷屏的风险。 + + * 自监控指标、告警:参考`../selfmonitor/SKILL.md`中的内容和规范进行检查。 + +9. **兼容性与部署** + + * 平台兼容:路径分隔符、字节序、系统调用差异处理 + + * 向后兼容:配置格式变更需要兼容旧版配置,新增参数应避免改变原有默认行为 + + * 配置默认值:新增配置项必须有合理的默认值,并在文档中说明 + + * 本地状态兼容:禁止使用Protobuf的`TextFormat`,避免新增参数dump后无法读取。旧版本dump的状态文件,新版本应该正常读取恢复。 + +10. **测试与质量** + + * 覆盖策略:单元测试应涵盖成功和失败路径,核心逻辑100%覆盖,边界条件必测 + + * 测试命名准确描述行为。 + + * 性能测试:对性能敏感的代码路径,应提供基准测试(benchmark) + +11. **安全与合规性**: + + * 检查配置和输入验证与清理以防注入攻击。 + + * 检查新增依赖库是否必要,新增时必须将License添加到licenses目录。 + + * 新文件包含Copyright和Apache License声明。 + + * 代码中严禁出现密钥泄露。 + +12. **文档:** + + * 对于新增的input、processor、flusher插件,检查是否新建了对应的使用文档。 + + * 对于改写的input、processor、flusher插件,如果GetXxxParam的参数有改动,需要对应修改使用文档。 + + +## 5. 按评估标准报告问题,需要输出 + +对发现的每个问题请按如下格式输出一个嵌套项: + +```markdown +- 文件: [<路径>:<起始行号>](file://./<路径>#L<起始行号>) + - 问题: <问题本质的一句话总结> + - 建议: <简明的修改建议或代码示例> +``` + +注意在输出行号前再次检索被review代码,确保使用精确的行号,以便 IDE 可以直接跳转。 + +## 6. 亮点总结,需要输出 + +在报告之后,用简短的列表形式总结你在差异中观察到的正面实践或良好实现。 + +整体过程中,请保持礼貌、专业的语气;保持评论尽可能简洁,同时不失清晰;并且确保仅分析真正发生变更的文件。 \ No newline at end of file diff --git a/.claude/skills/riper5-protocol/SKILL.md b/.claude/skills/riper5-protocol/SKILL.md new file mode 100644 index 0000000000..aea0984866 --- /dev/null +++ b/.claude/skills/riper5-protocol/SKILL.md @@ -0,0 +1,71 @@ +--- +name: riper5-protocol +description: RIPER-5 workflow protocol for complex software engineering tasks: Research, Innovate, Plan, Execute, Review. +--- +# RIPER-5 Protocol + +RIPER-5 is a 5-phase workflow designed for complex software engineering tasks: system design, architectural refactoring, bug diagnosis, performance optimization, multi-component integration. + +## Core Principle + +Start every new conversation in RESEARCH mode. Do not jump to solutions. Progress through phases only with explicit signals. + +## Modes + +### Mode 1: RESEARCH `[MODE: RESEARCH]` +**Purpose**: Information collection and deep understanding +**Allowed**: Read files, ask clarifying questions, analyze architecture, identify constraints, create task files +**Forbidden**: Suggestions, implementation, planning, any solution hints +**Output**: Start with `[MODE: RESEARCH]`, then only observations and questions. + +### Mode 2: INNOVATE `[MODE: INNOVATE]` +**Purpose**: Brainstorm potential approaches +**Allowed**: Discuss solution ideas, evaluate pros/cons, explore alternatives, document findings +**Forbidden**: Specific planning, implementation details, writing code, committing to solutions +**Output**: Start with `[MODE: INNOVATE]`, then only possibilities and considerations. + +### Mode 3: PLAN `[MODE: PLAN]` +**Purpose**: Create exhaustive technical specification +**Allowed**: Detailed plans with file paths, function signatures, data structure changes, error handling, dependency management, test approach +**Forbidden**: Any implementation or code writing, even "example code" that could be executed +**Required**: Convert entire plan into a numbered sequential checklist +**Output**: Start with `[MODE: PLAN]`, then only specifications and implementation details. 
+ +### Mode 4: EXECUTE `[MODE: EXECUTE]` +**Purpose**: Implement exactly what was planned in Mode 3 +**Allowed**: Only implement what the approved plan explicitly details, follow checklist exactly, mark completed items, update task progress +**Forbidden**: Any deviation from plan, un-planned improvements, creative additions +**Quality**: Always show full code context, specify language and path, proper error handling +**Deviation**: If any deviation needed, immediately return to PLAN mode +**Entry**: Only enter on explicit "ENTER EXECUTE MODE" command + +### Mode 5: REVIEW `[MODE: REVIEW]` +**Purpose**: Ruthlessly verify implementation matches plan +**Required**: Line-by-line comparison, technical verification, check for bugs/unexpected behavior, verify against original requirements +**Report**: Must state if implementation matches plan exactly or deviates +**Format**: `Detected deviation: [exact description]` or `Implementation matches plan exactly` +**Output**: Start with `[MODE: REVIEW]`, then systematic comparison and clear judgment. + +## Critical Rules + +- Cannot transition between modes without explicit permission +- Must declare current mode at start of every response +- In EXECUTE: must follow plan 100% faithfully +- In REVIEW: must mark even the smallest deviation +- No independent decision authority outside declared mode +- Disable emoji output unless specifically requested +- If no explicit mode transition signal, stay in current mode +- Default: Start in RESEARCH mode + +## Mode Transition Signals + +Only transition on exact signals: +- "ENTER RESEARCH MODE" +- "ENTER INNOVATE MODE" +- "ENTER PLAN MODE" +- "ENTER EXECUTE MODE" +- "ENTER REVIEW MODE" + +**Auto-transitions**: +- If EXECUTE needs plan deviation -> return to PLAN mode +- After all implementation confirmed by user -> move to REVIEW mode diff --git a/.claude/skills/security-check/SKILL.md b/.claude/skills/security-check/SKILL.md new file mode 100644 index 0000000000..e9115105bb --- /dev/null +++ b/.claude/skills/security-check/SKILL.md @@ -0,0 +1,44 @@ +--- +name: security-check +description: Security scanning before commit/push. Checks for sensitive information like API keys and tokens. +--- +# Security Check Rules + +Before committing or pushing code, must check for sensitive information, especially API Keys and access tokens. + +## What to Check + +### API Keys and Access Tokens +- API Keys starting with `sk-` (OpenAI, Anthropic, Alibaba Cloud, etc.) +- Google API Keys starting with `AIzaSy` +- Public keys starting with `pk_` +- Other common API token formats + +## Before Commit + +### Run Check First +Run `bash .claude/skills/security-check/scripts/security_check.sh commit` to check the staging area for sensitive information. If it does NOT output `staging area is clear`, sensitive information was found. + +### If Sensitive Information Found +1. **Immediately delete or replace**: Replace real API Keys with placeholders +2. **Use environment variables**: Move sensitive info to environment variables +3. **Add to .gitignore**: Ensure files with sensitive info are not committed +4. **Must refuse the commit/push action** + +## Before Push + +### Run Check First +Run `bash .claude/skills/security-check/scripts/security_check.sh push` to check each commit for sensitive information. If it does NOT output `all commits are clear`, sensitive information was found. The commit hashes are written to `task/sensitive_commits.txt`. + +### If Sensitive Information Found +1. 
**Immediately delete or replace**: Replace real API Keys with placeholders +2. **Use environment variables**: Move sensitive info to environment variables +3. **Add to .gitignore**: Ensure files with sensitive info are not committed +4. **Must use the script below to clean history** + +```bash +# Reset based on results in task/sensitive_commits.txt to avoid leaking commits +bash .claude/skills/security-check/scripts/security_reset.sh +``` + +5. **Must refuse the commit/push action** diff --git a/.claude/skills/security-check/scripts/security_check.sh b/.claude/skills/security-check/scripts/security_check.sh new file mode 100755 index 0000000000..455023c35f --- /dev/null +++ b/.claude/skills/security-check/scripts/security_check.sh @@ -0,0 +1,40 @@ +#!/bin/bash +set -euo pipefail + +SENSITIVE_PATTERNS="(sk-[a-zA-Z0-9]|AIzaSy[a-zA-Z0-9]|pk_[a-zA-Z0-9]|ghp_[a-zA-Z0-9]|gho_[a-zA-Z0-9]|ghu_[a-zA-Z0-9]|ghs_[a-zA-Z0-9]|ghr_[a-zA-Z0-9])" +MODE="${1:-}" + +if [ "$MODE" != "commit" ] && [ "$MODE" != "push" ]; then + echo "Usage: $0 [commit|push]" + exit 2 +fi + +if [ "$MODE" == "commit" ]; then + # 检查暂存区中的 API Keys + echo "checking staging area" + if git diff --cached --no-prefix | grep '^+' | grep -E "$SENSITIVE_PATTERNS"; then + echo "⚠️ staging area contains SENSITIVE information" + else + echo "✅ staging area is clear" + fi +elif [ "$MODE" == "push" ]; then + # 检查所有要推送的 commit + is_clear=true + upstream=$(git rev-parse --abbrev-ref --symbolic-full-name @{u} 2>/dev/null) || upstream="origin/main" + mkdir -p task + > task/sensitive_commits.txt # 清空文件 + + while read -r commit; do + commit_hash=$(echo "$commit" | cut -d' ' -f1) + echo "checking commit: $commit" + if git show "$commit_hash" --no-commit-id --unified=0 | grep '^+' | grep -E "$SENSITIVE_PATTERNS"; then + echo "⚠️ commit $commit contains SENSITIVE information" + echo "$commit_hash" >> task/sensitive_commits.txt + is_clear=false + fi + echo "---" + done < <(git log "${upstream}"..HEAD --oneline) + if [ "$is_clear" = true ]; then + echo "✅ all commits are clear" + fi +fi \ No newline at end of file diff --git a/.claude/skills/security-check/scripts/security_reset.sh b/.claude/skills/security-check/scripts/security_reset.sh new file mode 100755 index 0000000000..7b93706899 --- /dev/null +++ b/.claude/skills/security-check/scripts/security_reset.sh @@ -0,0 +1,108 @@ +#!/bin/bash +# 智能squash脚本 - 自动检测并清理包含敏感信息的commits +echo "🔍 开始清理包含敏感信息的commits..." + +# 1. 检查task/sensitive_commits.txt文件是否存在且非空 +if [ ! -f "task/sensitive_commits.txt" ] || [ ! -s "task/sensitive_commits.txt" ]; then + echo "❌ 未找到敏感commits列表,请先运行push前检查" + exit 1 +fi + +# 读取敏感commits列表 +readarray -t sensitive_commits < task/sensitive_commits.txt + +# 2. 如果发现敏感信息,进行智能squash +if [ ${#sensitive_commits[@]} -gt 0 ]; then + echo "🚨 发现 ${#sensitive_commits[@]} 个包含敏感信息的commits,开始清理..." + + # 检查工作区是否干净 + git status --porcelain | read -r _ && { + echo "⚠️ 工作区或暂存区有未提交的更改,先进行stash..." 
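+        # -u also stashes untracked files, so no secret-bearing new files are left in the working tree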
+ git stash push -u -m "security-cleanup-backup-$(date +%Y%m%d-%H%M%S)" + stashed=true + } || stashed=false + + # 获取要reset的目标commit + # 找到最早的敏感commit(数组最后一个),并获取其父commit + earliest_sensitive="${sensitive_commits[${#sensitive_commits[@]}-1]}" + parent_commit=$(git rev-parse --quiet "${earliest_sensitive}^") + + # 获取所有需要被squash的commits(从最早的敏感commit的parent到HEAD) + if [ -n "$parent_commit" ]; then + commits_to_squash=($(git rev-list --reverse "${parent_commit}..HEAD")) + else + # 如果没有parent,说明最早的敏感commit是root commit + echo "⚠️ 最早的敏感commit是仓库的第一个commit" + commits_to_squash=($(git rev-list --reverse HEAD)) + fi + + if [ ${#commits_to_squash[@]} -eq 0 ]; then + echo "❌ 无法确定要squash的commit范围" + if [ "$stashed" = true ]; then + git stash pop + fi + exit 1 + fi + + # 获取所有要重新提交的commits的信息 + echo "📝 提取所有commit messages..." + all_commit_details="" + main_subject="" + + for commit_hash in "${commits_to_squash[@]}"; do + # 获取commit信息 + subject=$(git log --format=%s -n 1 "$commit_hash") + body=$(git log --format=%b -n 1 "$commit_hash") + + # 主题行用第一个commit的主题 + if [ -z "$main_subject" ]; then + main_subject="$subject" + fi + + subject_marker="$subject" + + # 按GitHub squash格式添加commit详情 + if [ -n "$body" ]; then + all_commit_details="${all_commit_details}* ${subject_marker}\n\n${body}\n\n" + else + all_commit_details="${all_commit_details}* ${subject_marker}\n\n" + fi + done + + # 创建GitHub风格的squash commit message + new_message="${main_subject}\n\n${all_commit_details}" + + # 执行squash + echo "🔄 执行squash操作..." + if [ -n "$parent_commit" ]; then + git reset --soft "$parent_commit" + else + echo "❌ 检测到最早敏感 commit 为 root commit,自动清理会涉及高风险历史重写,已中止。" + echo "请手动执行更安全流程(例如 orphan 分支重建)后再提交。" + if [ "$stashed" = true ]; then + echo "⚠️ 已为你恢复之前的工作区更改。" + git stash pop + fi + exit 1 + fi + + # 显示需要手动清理的文件 + echo "📋 需要手动清理的文件:" + git status --porcelain | grep '^[AM]' | cut -c4- + + echo "" + echo "✅ Squash完成!请执行以下步骤:" + echo "1. 手动清理上述文件中的敏感信息" + echo "2. 运行: git add ." + echo "3. 运行: git commit" + if [ "$stashed" = true ]; then + echo "4. 如需恢复之前的工作区更改: git stash pop" + fi + echo "" + echo "📝 新的commit message预览:" + echo "────────────────────────────────────────" + echo -e "$new_message" + echo "────────────────────────────────────────" +else + echo "✅ 未发现包含敏感信息的commits" +fi diff --git a/.claude/skills/selfmonitor/SKILL.md b/.claude/skills/selfmonitor/SKILL.md new file mode 100644 index 0000000000..b146df7bdc --- /dev/null +++ b/.claude/skills/selfmonitor/SKILL.md @@ -0,0 +1,138 @@ +--- +name: selfmonitor +description: Self-monitoring metrics, alarm code standards for LoongCollector. Read when changes involve metrics, alarms, or observability. +--- +# Self-Monitoring Code Standards + +You are a self-monitoring code quality expert, responsible for ensuring LoongCollector code correctly uses self-monitoring features including metrics, alarms, code style, and implementation logic. 
+ +## Metric Naming Conventions + +### Format + +**Variable name**: `{MODULE}_{METRIC_CONTENT_DESCRIPTION}_{UNIT}` (ALL CAPS) +**Variable content**: `{metric_content_description}_{unit}` (all lowercase) + +Example: +```cpp +const string METRIC_RUNNER_FLUSHER_IN_RAW_SIZE_BYTES = "in_raw_size_bytes"; +``` + +### Module Prefix Categories + +- **`agent_`**: Process-level metrics, describing entire Agent state +- **`pipeline_`**: Pipeline-level metrics, describing data pipeline state +- **`plugin_`**: Plugin-level metrics, describing specific plugin state +- **`component_`**: Component-level metrics, describing internal component state +- **`runner_`**: Runner-level metrics, describing runner state + +### Unit Categories + +#### Counter metrics +- **`_total`**: Cumulative count (default), e.g. `input_records_total`, `send_success_total` + +#### Size metrics +- **`_bytes`**: Bytes, e.g. `input_size_bytes`, `memory_used_bytes` +- **`_mb`**: Megabytes (memory), e.g. `agent_memory_used_mb` + +#### Time metrics +- **`_ms`**: Milliseconds (processing time, latency), e.g. `process_time_ms` +- **`_s`**: Seconds (long intervals), e.g. `uptime_s` + +#### Ratio metrics +- **`_percent`**: Percentage, e.g. `cpu_usage_percent` +- **`_ps`**: Per second (rate), e.g. `send_bytes_ps` + +#### State metrics +- **`_flag`**: Flag (0 or 1), e.g. `enabled_flag` +- **`_state`**: State value, e.g. `register_state` + +### Label Naming Conventions + +**Label Key format**: `METRIC_LABEL_KEY_{description}` + +Common keys: `METRIC_LABEL_KEY_PROJECT`, `METRIC_LABEL_KEY_LOGSTORE`, `METRIC_LABEL_KEY_PIPELINE_NAME`, `METRIC_LABEL_KEY_PLUGIN_TYPE`, `METRIC_LABEL_KEY_PLUGIN_ID`, `METRIC_LABEL_KEY_FILE_NAME`, `METRIC_LABEL_KEY_FILE_DEV`, `METRIC_LABEL_KEY_FILE_INODE`, `METRIC_LABEL_KEY_REGION`, `METRIC_LABEL_KEY_RUNNER_NAME` + +## Alarm Level Conventions + +Based on PR #2319 design, alarm levels: + +| Level | Severity | Description | Typical Scenario | +|-------|----------|-------------|------------------| +| 1 | warning | Single point error, doesn't affect overall flow | Data parse failure; single collection/send failure | +| 2 | error | Affects main flow, risk if not optimized | Queue busy; monitor exceeded; unsuccessful init | +| 3 | critical | Severe impact: config/module unusable; affects agent stability; causes customer loss | Config load failure; unsuccessful module init; data drop; crash | + +### C++ Alarm Usage + +**Correct**: +```cpp +AlarmManager::GetInstance()->SendAlarmWarning(LOGTAIL_CONFIG_ALARM, "配置解析失败"); +AlarmManager::GetInstance()->SendAlarmError(PROCESS_QUEUE_BUSY_ALARM, "处理队列繁忙"); +AlarmManager::GetInstance()->SendAlarmCritical(CATEGORY_CONFIG_ALARM, "配置加载失败"); +``` + +**Wrong**: Don't use old `SendAlarm` interface. + +### Go Alarm Usage + +**Correct**: +```go +logger.Warning(ctx, selfmonitor.CategoryConfigAlarm, "配置解析失败") +logger.Error(ctx, selfmonitor.ProcessQueueBusyAlarm, "处理队列繁忙") +logger.Critical(ctx, selfmonitor.CategoryConfigAlarm, "配置加载失败") +``` + +## Adding New Metrics + +### C++ Steps + +1. **Define metric constants**: Add to `core/monitor/metric_constants/MetricConstants.h` +2. **Create MetricsRecordRef** with labels in Init() +3. **Create metric objects** (CounterPtr, IntGaugePtr) BEFORE commit +4. **Update values** using macros: `ADD_COUNTER()`, `SET_GAUGE()`, `ADD_GAUGE()` + +**Critical**: MetricsRecordRef must create all metric objects BEFORE commit. After commit, no new metrics can be created. Use `IsCommitted()` to check state. If a Gauge default is non-zero, set it once during Init. 
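+
+To make the ordering concrete, here is a minimal, illustrative sketch of the four steps above. It is not a verified implementation: `FlusherFoo`, the `METRIC_PLUGIN_FOO_*` constants, and the record-ref creation/commit helpers (shown on a `WriteMetrics` singleton) are placeholders, so copy the actual registration calls from an existing plugin under `core/` rather than from this block. What it shows is the required order: define constants, create the labelled `MetricsRecordRef`, create every `CounterPtr`/`IntGaugePtr`, commit, then update only through the macros.
+
+```cpp
+// Illustrative sketch only — names marked as placeholders are assumptions,
+// not the verified project API; mirror an existing plugin's registration code.
+
+// Step 1 (placeholder constants, normally in core/monitor/metric_constants/MetricConstants.h):
+// const std::string METRIC_PLUGIN_FOO_OUT_EVENTS_TOTAL = "out_events_total";
+// const std::string METRIC_PLUGIN_FOO_QUEUE_SIZE_BYTES = "queue_size_bytes";
+
+class FlusherFoo { // hypothetical plugin used only for illustration
+public:
+    void Init(const std::string& pipelineName) {
+        // Step 2: create the MetricsRecordRef together with its labels.
+        // CreateMetricsRecordRef is a placeholder for the real registration helper.
+        WriteMetrics::GetInstance()->CreateMetricsRecordRef(
+            mMetricsRecordRef,
+            {{METRIC_LABEL_KEY_PIPELINE_NAME, pipelineName},
+             {METRIC_LABEL_KEY_PLUGIN_TYPE, "flusher_foo"}});
+
+        // Step 3: create ALL metric objects before commit; nothing can be added afterwards
+        // (IsCommitted() can be used to assert this in debug paths).
+        mOutEventsTotal = mMetricsRecordRef.CreateCounter(METRIC_PLUGIN_FOO_OUT_EVENTS_TOTAL);
+        mQueueSizeBytes = mMetricsRecordRef.CreateIntGauge(METRIC_PLUGIN_FOO_QUEUE_SIZE_BYTES);
+
+        // Commit once every metric object exists (CommitMetricsRecordRef is also a placeholder).
+        WriteMetrics::GetInstance()->CommitMetricsRecordRef(mMetricsRecordRef);
+    }
+
+    void OnFlush(size_t eventCount, size_t queuedBytes) {
+        // Step 4: update values only through the safe macros.
+        ADD_COUNTER(mOutEventsTotal, eventCount);
+        SET_GAUGE(mQueueSizeBytes, queuedBytes);
+    }
+
+private:
+    MetricsRecordRef mMetricsRecordRef;
+    CounterPtr mOutEventsTotal;   // value name: "out_events_total"
+    IntGaugePtr mQueueSizeBytes;  // value name: "queue_size_bytes"
+};
+```
+
+The invariant this protects is the one stated above: after commit the record is frozen, so a metric forgotten in Init() cannot be added later from a hot path.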
+ +### Go Steps + +1. **Define constants**: Add to `pkg/selfmonitor/metrics_constants_*.go` +2. **Register metrics** in `InitMetricRecord()`: + ```go + p.MetricRecord = p.Config.Context.RegisterMetricRecord(labels) + p.metricCounter = selfmonitor.NewCounterMetricAndRegister(p.MetricRecord, selfmonitor.MetricPluginInEventsTotal) + ``` +3. **Update values**: Check nil before updating. + +## Adding New Alarm Types + +### C++ Steps + +1. Add to `core/monitor/AlarmManager.h` enum `AlarmType` +2. Add to `mMessageType` vector in `AlarmManager.cpp` constructor +3. Use leveled interfaces: `SendAlarmWarning`, `SendAlarmError`, `SendAlarmCritical` + +### Go Steps + +1. Add to `pkg/selfmonitor/alarm_constants.go` +2. Use leveled interfaces: `logger.Warning`, `logger.Error`, `logger.Critical` + +## Best Practices + +1. Create metric objects once during initialization, not per-call +2. Use safe update macros that check for null +3. Choose alarm level matching severity +4. Provide meaningful alarm messages with context +5. Avoid alarm storms - limit frequency of same alarm +6. Metrics should not impact main flow performance + +## Checklist + +Before submitting self-monitoring code: +- [ ] Metric names follow naming convention with correct module prefix and unit +- [ ] Labels follow naming convention +- [ ] Correct alarm level interface used, matching severity +- [ ] No deprecated interfaces used +- [ ] Metrics created once, updated safely +- [ ] Alarm storms avoided +- [ ] Error handling complete diff --git a/.claude/skills/testing-standards/SKILL.md b/.claude/skills/testing-standards/SKILL.md new file mode 100644 index 0000000000..f1c3618e89 --- /dev/null +++ b/.claude/skills/testing-standards/SKILL.md @@ -0,0 +1,99 @@ +--- +name: testing-standards +description: Testing standards for LoongCollector: unit tests, e2e tests, benchmarks. Reference when writing or reviewing tests. +--- +# LoongCollector Testing Standards + +## Test Categories + +### 1. Unit Tests (C++) +- Use Google Test (GTest) / Google Mock +- Place in `core/unittest/` +- Cover success and failure paths +- Core logic must have 100% coverage +- Test boundary conditions explicitly +- Test naming: accurately describe behavior being tested +- Each `core/unittest/*/` directory produces one executable +- Build and run tests from inside `build/` to ensure relative paths and temp files work correctly +- See `.claude/skills/compile/SKILL.md` for build & run instructions + +### 2. Unit Tests (Go) +- Use standard `testing` package +- Table-driven tests for function coverage +- Integration tests via E2E framework + +### 3. E2E Tests +- BDD Godog framework +- Configuration-driven via `.feature` files +- See `.claude/skills/e2e/SKILL.md` for complete guide (design → write → run → debug) + +### 4. 
Benchmarks
+- Required for performance-sensitive code paths
+- Compare against baseline versions
+- Measure throughput, latency, CPU/Memory usage
+
+## E2E Test Quick Reference
+
+### Feature File Structure
+```
+@input
+Feature: input file
+  Test input file
+
+  @e2e @host
+  Scenario: TestInputFileWithRegexSingle
+    Given {host} environment
+    Given subcribe data from {sls} with config
+    """
+    """
+    Given {regex_single} local config as below
+    """
+    enable: true
+    inputs:
+      - Type: input_file
+        FilePaths:
+          - /tmp/loongcollector/regex_single.log
+    """
+    When generate {100} regex logs to file {/tmp/loongcollector/regex_single.log}, with interval {100}ms
+    Then there is {100} logs
+```
+
+### Behavior Types
+| Type | Purpose |
+|------|---------|
+| `Given` | Setup/prepare test conditions |
+| `When` | Trigger test actions (e.g., log generation) |
+| `Then` | Verify test results |
+
+### Environment Tags
+- `@host` - Host environment
+- `@k8s` - Kubernetes environment
+- `@docker-compose` - Docker Compose environment
+- `@e2e` - E2E test marker
+- `@regression` - Regression test marker
+
+### Adding New Test Behaviors
+1. Write the Go function in the appropriate directory:
+   - `cleanup/` - Post-test cleanup (auto-executed)
+   - `control/` - Control operations (init, config)
+   - `setup/` - Environment setup
+   - `trigger/` - Data generation
+   - `verify/` - Result verification
+2. Function signature: `func Name(ctx context.Context, params...) (context.Context, error)`
+3. Register in `test/e2e_enterprise/main_test.go` via `scenarioInitializer`
+4. Use in feature files with `{param}` syntax
+
+### Strict Rules
+- Do NOT change behavior of the method being tested
+- Do NOT modify existing test behaviors in engine
+- Always run `When begin trigger` BEFORE generating logs
+- Only use registered behaviors from `test/engine/steps.go`
+- Verify behavior type matches (Given/When/Then)
+
+### Test Naming
+- Format: `Test${FunctionName}${CaseBriefDescription}`
+- Examples: `TestInputFileWithBlackListDir`, `TestInputFileWithRegexSingle`
+- Must include `@e2e` and environment tags
+
+## Benchmark Testing
+
+For performance-sensitive code:
+1. Provide baseline comparison
+2. Measure: throughput, latency, CPU profile, memory profile
+3. Run under realistic load conditions
+4.
Document methodology and results diff --git a/.cursor/rules/project-knowledge/config-pitfalls.mdc b/.cursor/rules/project-knowledge/config-pitfalls.mdc new file mode 100644 index 0000000000..2aa3c16b0c --- /dev/null +++ b/.cursor/rules/project-knowledge/config-pitfalls.mdc @@ -0,0 +1,41 @@ +--- +description: LoongCollector 采集配置常见陷阱。编写或审查 pipeline config YAML 时参考。 +globs: + - "**/*.feature" + - "**/case.feature" + - "core/config/**" + - "test/e2e/**" +alwaysApply: false +--- +# LoongCollector 采集配置陷阱 + +## ExcutionTimeout 使配置变为一次性(onetime) + +`global.ExcutionTimeout` 存在于配置中时,**整个配置**被标记为 onetime 类型。 +只有注册了 `RegisterOnetimeInputCreator` 的插件才能在 onetime 配置中使用。 + +大部分输入插件(`input_forward`, `input_file`, `input_container_stdio`, `input_prometheus` 等)只注册了 `RegisterContinuousInputCreator`,在 onetime 配置中会报错: + +``` +failed to parse config:unsupported input plugin module:input_forward +``` + +### 判断逻辑 + +``` +global.ExcutionTimeout 存在 + → PipelineConfig::GetExpireTimeIfOneTime → mOnetimeExpireTime 被设置 + → CollectionConfig::IsOnetime() == true + → IsValidNativeInputPlugin(name, true) 在 ONETIME 注册表查找 + → 找不到 → "unsupported input plugin" +``` + +### 支持 onetime 的输入插件 + +查看 `PluginRegistry::LoadStaticPlugins()` 中调用 `RegisterOnetimeInputCreator` 的插件,如 `InputStaticFile`。 + +### 规则 + +- **持续运行的输入插件配置中不要使用 `ExcutionTimeout`** +- E2E 测试不需要 `ExcutionTimeout` 来控制超时,Go test 的 `-timeout` 参数已经提供了保护 +- 如果确实需要一次性采集,使用 `onetime_pipeline_config` 目录 + 支持 onetime 的输入插件 diff --git a/.cursor/skills/compile/SKILL.md b/.cursor/skills/compile/SKILL.md index 1e7c938e62..a9b9c7af02 100644 --- a/.cursor/skills/compile/SKILL.md +++ b/.cursor/skills/compile/SKILL.md @@ -8,44 +8,89 @@ description: Building ### C++ 部分编译方法 -1. 判断是否进行增量编译。如果已有 `build` 目录,并且其中有内容,并且你的修改没有涉及到 CMake 相关文件,那么跳转到第5步进行增量编译。 +**重要:所有 CMake 和 make 命令必须在 `build/` 目录内执行。** -2. 创建编译目录 - -``` bash -mkdir -p build +**前置条件** — 首次编译前需初始化 Git 子模块: +```bash +git submodule update --init --recursive ``` +两个子模块位于 `core/_thirdparty/`: +- `DCGM` — NVIDIA DCGM 头文件 +- `coolbpf` — eBPF 框架 + +如果子模块目录为空,编译会报 `No such file or directory` 错误。 + +#### 编译步骤 -3. 进入编译目录 +1. 判断是否进行增量编译。如果已有 `build` 目录,并且其中有内容,并且你的修改没有涉及到 CMake 相关文件,那么跳转到第 4 步进行增量编译。 -``` bash -cd build +2. 创建并进入编译目录 + +```bash +mkdir -p build && cd build ``` -4. 构建 CMake 命令 +3. 构建 CMake 命令 -``` bash +```bash cmake -DCMAKE_BUILD_TYPE=Debug -DLOGTAIL_VERSION=0.0.1 \ -DCMAKE_EXPORT_COMPILE_COMMANDS=1 \ -DCMAKE_CXX_FLAGS="-I/opt/rh/devtoolset-9/root/usr/lib/gcc/x86_64-redhat-linux/9/include -I/opt/logtail -I/opt/logtail_spl" \ - -DBUILD_LOGTAIL=ON -DBUILD_LOGTAIL_UT=ON -DWITHOUTGDB=ON -DENABLE_STATIC_LINK_CRT=ON -DWITHSPL=ON ../core + -DBUILD_LOGTAIL=ON -DBUILD_LOGTAIL_UT=ON -DWITHOUTGDB=ON -DENABLE_STATIC_LINK_CRT=ON -DWITHSPL=OFF ../core ``` -注意其中的几个开关: - - BUILD_LOGTAIL:表示编译 LoongCollector 二进制。必选 - - BUILD_LOGTAIL_UT:表示编译 LoongCollector 单测。仅当你修改了 LoongCollector 单测时才打开。 - - WITHSPL:表示编译 LoongCollector SPL 相关内容。仅当你修改了 LoongCollector SPL 相关文件时才打开。 +关键 CMake 开关: + +| 开关 | 用途 | +|------|------| +| `BUILD_LOGTAIL` | 编译 LoongCollector 二进制。必选。 | +| `BUILD_LOGTAIL_UT` | 编译单元测试。修改了测试代码时打开。 | +| `WITHSPL` | SPL 支持。除非修改了 SPL 相关文件,否则设为 `OFF`。 | -5. 编译 +4. 
编译 -``` bash +```bash make -sj$(nproc) ``` -### Go 部分编译方法 +#### C++ 单元测试 + +每个 `core/unittest/*/` 下的测试目录会生成独立的可执行文件。 + +**编译指定测试**(在 `build/` 目录内): +```bash +make yaml_util_unittest app_config_unittest safe_queue_unittest -j$(nproc) +``` + +**运行测试**(在 `build/` 目录内): +```bash +./unittest/common/yaml_util_unittest +./unittest/app_config/app_config_unittest +``` -执行 +测试必须在 `build/` 目录内运行,因为部分测试依赖相对路径加载配置文件。 -``` bash +### Go 部分编译方法 + +```bash make plugin_local ``` + +### Docker 构建 + +```bash +make image +``` + +### 交叉编译 + +ARM64 架构: +```bash +make image ARCH=arm64 +``` + +### 常见问题 + +- 如果 CMake 报缺少依赖,通过 `apt` 或 `yum` 安装 +- 如果链接失败,尝试 `make clean` 后重新构建 +- 需要 SPL 相关功能时,将 `WITHSPL=OFF` 改为 `WITHSPL=ON` diff --git a/.cursor/skills/e2e-develop-guide/SKILL.md b/.cursor/skills/e2e-develop-guide/SKILL.md deleted file mode 100644 index a4defd9409..0000000000 --- a/.cursor/skills/e2e-develop-guide/SKILL.md +++ /dev/null @@ -1,213 +0,0 @@ ---- -name: e2e-develop-guide -description: LoongCollector E2E Testing Framework Development Guide. Use this rule when you need to understand E2E testing framework architecture, develop new test behavior functions, or extend testing capabilities. Contains detailed BDD testing framework usage, test behavior function development and registration workflows, and comprehensive reference of existing testing capabilities. Example: When adding custom test behaviors for new plugins, use this rule to understand how to write and register new test functions. Example: When developing E2E tests, use this rule to understand the e2e framework. ---- -# LoongCollector E2E 本地开发手册 - -## 背景 - -E2E测试采用行为驱动开发(Behavior-Driven Development)的设计思路,通过定义一系列测试行为,并通过配置文件的方式来描述测试场景,从而实现对插件的集成测试。测试引擎会根据配置文件中的内容,正则匹配对应的函数,并解析配置文件中的参数,传递给对应的函数。从而完成自动创建测试环境、启动iLogtail、触发日志生成、验证日志内容等一系列操作,最终输出测试报告。 - -## 开发(编写配置文件) - -对于每一个新的功能,您都需要在 test 不同目录下创建一个新的目录,其中包含了 feature 格式的配置文件。每个配置文件中可以包含多个测试场景,每个测试场景由一个或多个步骤组成。 - -* e2e/test\_cases:开源 Go 插件测试(Docker Compose 环境运行) - -* e2e\_enterprise/test\_cases:商业版测试(主机+K8s 环境运行,静态,不涉及环境变化) - -* e2e\_enterprise/regression:商业版测试(主机+K8s 环境运行,动态,涉及环境变化) - -```protobuf -test - e2e - ... - e2e_enterprise - test_cases - input_file - case.feature - regression - test_cases - ... 
-``` - -配置文件的基本框架如下所示: - -```plaintext -@input -Feature: input file - Test input file - - @e2e @host - Scenario: TestInputFileWithRegexSingle - Given {host} environment - Given subcribe data from {sls} with config - """ - """ - Given {regex_single} local config as below - """ - enable: true - inputs: - - Type: input_file - FilePaths: - - /tmp/ilogtail/**/regex_single.log* - processors: - - Type: processor_parse_regex_native - SourceKey: content - Regex: (\S+)\s(\w+):(\d+)\s(\S+)\s-\s\[([^]]+)]\s"(\w+)\s(\S+)\s([^"]+)"\s(\d+)\s(\d+)\s"([^"]+)"\s(.*) - Keys: - - mark - - file - - logNo - - ip - - time - - method - - url - - http - - status - - size - - userAgent - - msg - """ - When generate {100} regex logs to file {/tmp/ilogtail/regex_single.log}, with interval {100}ms - Then there is {100} logs - Then the log fields match regex single -``` - -* `Feature`定义了一个测试功能,下面为这个功能的描述信息。在`Feature`下可以定义多个测试场景。 - -* `Scenario`定义了一个测试场景。在`Scenario`下可以定义多个行为。 - -* 行为定义分为三类: - - * `Given`:定义了一些准备测试条件的行为。 - - * `When`:定义了一些触发测试条件的行为。 - - * `Then`:定义了一些验证测试条件的行为。 - - * 行为中使用`{}`作为标识符,该部分内容将作为参数,传递给对应的Go函数。 - -* `@`表示一个tag,在运行测试时,会根据tag的不同,分别运行。除了自定义的tag外,测试框架定义了一些默认的tag: - - * `@e2e`:表示该测试场景为E2E测试。 - - * `@regression`:表示该测试场景为回归测试。 - - * `@host`:表示该测试场景在host环境下运行。 - - * `@k8s`:表示该测试场景在k8s环境下运行。 - - * `@docker-compose`:表示该测试场景在本地启动docker-compose运行 - -现有的测试能力可以参考附录中的表格。如果现有的行为不能满足需求,可以参考后续的"如何添加新的测试行为"添加自己需要的函数。 - -## 如何添加新的测试行为 - -在某些情况下,需要对engine中的测试行为进行拓展,可以参考下面的添加指南。 - -### 1. 编写行为函数 - -如果您需要添加新的行为函数,可以在`engine`目录下添加一个Go函数。不同目录下的行为函数的职责有所不同: - -* `cleanup`:清理测试环境,其中的测试函数会默认在测试结束后执行。无需在配置文件中显式声明使用。 - -* `control`:管控相关的行为函数,如初始化环境、添加配置等。 - -* `setup`:初始化测试环境,并提供远程调用的相关功能。 - -* `trigger`:数据生成相关的行为函数,如生成日志等。 - -* `verify`:验证相关的行为函数,如验证日志数量、验证日志内容等。 - -每个行为函数的接口定义如下所示: - -```go -func LogCount(ctx context.Context, expect int) (context.Context, error) { - // your code here -} -``` - -函数的第一个参数必须为`context.Context`。除此之外,后续可添加任意多个参数。函数的返回值为`context.Context`和`error`,其中`context.Context`为传递给下一个行为函数的参数,`error`为错误信息。一些需要在多个行为函数之间传递的参数,可以通过`context.Context`来传递。 - -```go -return context.WithValue(ctx, key, value), nil -``` - -### 2. 注册行为函数 - -在`test/e2e_enterprise/main_test.go`中,您需要注册您的行为函数。注册函数的格式如下所示: - -```go -func scenarioInitializer(ctx *godog.ScenarioContext) { - // Given - - // When - - // Then - ctx.Then(`^there is \{(\d+)\} logs$`, verify.LogCount) -} -``` - -您需要根据行为的类型,将行为函数注册到对应的位置。在`Given`中注册`setup`中的行为函数,在`When`中注册`trigger`中的行为函数,在`Then`中注册`verify`中的行为函数。`control`中的行为函数比较灵活,根据函数职责不同,注册到不同的类型中。 - -在注册时,您需要定义一个正则表达式,用于匹配配置文件中的行为。在正则表达式中,您可以使用\`{}\`来标识参数,这些参数将会传递给行为函数。例如: - -```go -ctx.Then(`^there is \{(\d+)\} logs$`, verify.LogCount) -``` - -能够从配置文件中匹配到`there is {100} logs`这样的行为,并将`100`作为参数传递给`LogCount`函数。 - -### 3. 在配置文件中使用 - -在 feature 配置文件中,您可以直接使用您定义的行为函数。例如: - -```protobuf -Then there is {100} logs -``` - -在运行测试时,测试框架会根据配置文件中的行为,调用对应的行为函数,并传递参数。 - -## 附录 - -现有测试行为(部分)。可以参考 `test/engine/steps.go` 中的定义,目前支持的测试行为如下: - -| 行为类型 | 模板 | 参数 | 说明 | -| --- | --- | --- | --- | -| Given | ^{(\S+)} environment$ | 环境类型 | 初始化远程测试环境 | -| Given | ^{(.*)} local config as below | 1. 配置名 2. 配置文件内容 | 添加本地配置 | -| Given | ^subcribe data from {(\S+)} with config | 1. 数据源 2. 配置文件内容 | 订阅数据源 | -| When | ^begin trigger | 无 | 触发日志采集,记录开始时间 | -| When | ^generate {(\d+)} regex logs, with interval {(\d+)}ms$ | 1. 生成日志数量 2. 生成日志间隔 | 生成正则文本日志(路径为/tmp/ilogtail/regex\_single.log) | -| When | ^generate {(\d+)} http logs, with interval {(\d+)}ms, url: {(.)}, method: {(.)}, body: | 1. 生成日志数量 2. 生成日志间隔 3. url 4. method 5. 
body | 生成http日志,发送到LoongCollector input\_http\_server | -| Then | ^there is {(\d+)} logs$ | 日志数量 | 验证日志数量 | -| Then | ^there is at least {(\d+)} logs$ | 日志数量 | 验证日志数量 | -| Then | wait {(\d+)} seconds | 等待时间 | 等待时间 | -| When | ^execute \{(\d+)\} commands \{(.*)\} in sequence | 1.执行次数 2.执行命令 | 在数据来源机器上,执行指定次数的命令 | -| When | ^create the shell script file \{(\S+)\} with the following content | 1.shell文件名称(会自动添加.sh后缀)2.shell文件内容 | 创建shell文件,并赋权可执行 | -| When | delete the shell script file \{(\S+)\} | shell文件名称 | 删除shell文件 | -| When | execute \{(\d+)\} the shell script file \{(\S+)\} in parallel | 1.执行次数 2.执行的shell文件名称 | 执行shell脚本 | - -⚠️注意: - -(1)日志数量验证 - -对于日志数量的验证,最高为100。 - -| Then | ^there is {(\d+)} logs$ | 日志数量 | 验证日志数量 | -| --- | --- | --- | --- | - -如果超过100,请用以下行为函数: - -在查询的时候用count(1),然后在结果验证的时候`Then the log fields match kv`。 - -```plaintext - When query through {* | select count(1) as cnt from e2e where call_name = 'execve' and arguments = 'loongcollector-e2e-test' and binary = '/usr/bin/grep'} - Then wait {10} seconds - Then there is {1} logs - Then the log fields match kv - """ - cnt: "1000" - """ -``` diff --git a/.cursor/skills/e2e-manual/SKILL.md b/.cursor/skills/e2e-manual/SKILL.md deleted file mode 100644 index d019b41dfe..0000000000 --- a/.cursor/skills/e2e-manual/SKILL.md +++ /dev/null @@ -1,38 +0,0 @@ ---- -name: e2e-manual -description: E2E Testing Standards Guide. Use this rule when you need to write or execute E2E tests, including test naming conventions, test workflows, and test behavior templates. This rule should be used for end-to-end integration testing scenarios to ensure test standardization and maintainability. Example: When writing new E2E tests, use this rule to ensure proper test format and workflow compliance. ---- -# e2e测试指南 - -## 测试框架 - -- E2E(BDD Godog) - -## 严格执行 - -- 禁止改变被测试方法的任何行为 -- 禁止更改engine中已有的测试行为 -- 若需要记录日志,一定要在生成日志之前启动触发器`When begin trigger` -- 模板在test/engine/steps.go中注册,请务必使用已经注册的测试行为,不要自己生成,且行为类型一定要对应,例如wait {10} seconds,是Then类型行为,而不是When类型行为,因此一定为`Then wait {10} seconds` - -## 测试命名规范 - -- 每个案例必须使用 `@e2e` 注解标记测试方法 -- 必须标注环境,`@host`:表示该测试场景在host环境下运行;`@k8s`:表示该测试场景在k8s环境下运行;`@docker-compose`:表示该测试场景在本地启动docker-compose运行。 -- 格式:`Test${被测试功能名}${案例简单说明}` -- 示例:`TestInputFileWithBlackListDir`、`TestInputFileWithRegexSingle` - -## 测试最佳实践 - -### 测试流程 - -- 创建测试环境,`Given {host} environment`,其中host代表构建主机环境,daemonset表示构建k8s环境,docker-compose表示容器环境(只有这个三个值,不要出现其他) -- 启动iLogtail,`Given subcribe data from {sls} with config`,订阅器(Subscribe)插件是测试引擎中用于接收数据的组件,sls表示生成输出到logstore。`Given {regex_single} local config as below`,添加本地配置,启动input插件、processor插件,其中regex_single是配置名称,可根据功能进行构造,此配置中确保含有`enable: true`,否则配置无法执行。 -- 触发日志生成,启动日志记录`When begin trigger`,利用现有的框架生成一系列事件,例如`When generate {100} regex logs to file {/tmp/loongcollector/regex_single.log}, with interval {100}ms`为生成文件日志。 -- 验证日志内容,框架会获取采集到的日志,通过查询语句获取确定日之后,根据`Then`测试行为进行日志验证。查询语句例如,`When query through {* | select * from e2e where call_name = 'tcp_connect' and arguments like '%http://www.baidu.com%'}`,查询日志call_name为tcp_connect,arguments包含 - -### 测试重点 - -- 根据功能要求设计查询语句,以便能够达到测试功能的目的 -- 专注于查询的返回值,确保功能符合预期 -- 保持测试的简洁性和可维护性 diff --git a/.cursor/skills/e2e/SKILL.md b/.cursor/skills/e2e/SKILL.md new file mode 100644 index 0000000000..92a8c4df6b --- /dev/null +++ b/.cursor/skills/e2e/SKILL.md @@ -0,0 +1,209 @@ +--- +name: e2e +description: LoongCollector E2E 测试全流程指南:设计、编写、运行和调试。当需要编写新 E2E 测试、运行现有测试、或排查 E2E 测试失败时使用此 skill。 +--- +# LoongCollector E2E 测试指南 + +> 详细步骤模板见 [reference.md](reference.md) | 可复用脚本见 
[scripts/](scripts/) + +## 目录 + +1. [概览](#1-概览) +2. [设计测试用例](#2-设计测试用例) +3. [编写测试用例](#3-编写测试用例) +4. [本地运行(docker-compose)](#4-本地运行) +5. [调试](#5-调试) +6. [已知陷阱](#6-已知陷阱) + +--- + +## 1 概览 + +基于 **BDD Godog** 框架,通过 `.feature` 文件描述场景,引擎正则匹配步骤函数并传参。 + +``` +test/e2e/ + test_cases// + case.feature # 场景描述 + docker-compose.yaml # 可选,外部依赖服务 + engine/ + steps.go # 所有可用步骤(权威来源) + setup/ control/ trigger/ verify/ cleanup/ +``` + +**环境 tag**:`@host`、`@k8s`、`@docker-compose`(三选一,加 `@e2e`) + +--- + +## 2 设计测试用例 + +编写 feature 文件前,先确定测试矩阵。按以下维度逐项评估是否需要覆盖: + +### 2.1 场景维度清单 + +| 维度 | 典型场景 | 何时需要 | +|------|----------|----------| +| **基础功能** | 单配置、单数据类型端到端 | 必须 | +| **多数据类型** | logs / metrics / traces 分别验证 | 插件支持多类型时 | +| **多配置共存** | 同时加载多个 pipeline 配置 | 涉及端口/资源竞争时 | +| **配置热加载** | 运行中增/删/改配置 | 持续运行的 input 插件 | +| **配置类型变更** | 从 A 类型切换到 B 类型 | 插件支持多协议/格式时 | +| **反压与恢复** | 下游不可达 → 恢复后数据不丢 | flusher 插件 | +| **外部依赖失效** | 依赖服务重启/不可达 | 有外部依赖时 | +| **大数据量** | 高吞吐压力下不 OOM/不丢数据 | 性能敏感路径 | + +### 2.2 设计产出 + +确定要覆盖的场景后,明确每个 Scenario 的: +- **输入**:什么数据、什么格式、多少条 +- **流经路径**:input → processor → flusher 的具体插件 +- **预期输出**:在哪里验证、验证什么 +- **外部依赖**:需要什么辅助服务(OTel Collector、Kafka 等) + +--- + +## 3 编写测试用例 + +### 3.1 目录结构 + +``` +test/e2e/test_cases/my_feature/ +├── case.feature +├── docker-compose.yaml # 外部依赖 +└── otel-collector-config.yaml # 如果用 OTel Collector +``` + +### 3.2 Feature 文件模板 + +```gherkin +@flusher +Feature: my feature name + Brief description + + @e2e @docker-compose + Scenario: TestMyFeatureLogs + Given {docker-compose} environment + Given {my-config} local config as below + """ + enable: true + inputs: + - Type: input_forward + Protocol: OTLP + Endpoint: "0.0.0.0:4320" + flushers: + - Type: flusher_otlp_native + Endpoint: "otel-collector:4317" + """ + When start docker-compose {my_feature} + Then wait {10} seconds + When generate {1} OTLP {logs} via otelgen to endpoint {loongcollectorC:4320}, protocol {grpc} + Then wait {5} seconds + Then otlp collector received at least {1} logs from file {/tmp/otel-export/logs.json} +``` + +### 3.3 强制规则 + +- 配置中必须含 `enable: true` +- **只使用** `test/engine/steps.go` 中已注册的步骤 +- `wait {N} seconds` 是 **Then** 类型,不是 When +- 命名格式:`Test${功能名}${场景描述}` +- **不要**在持续运行插件的配置中使用 `global.ExcutionTimeout`(见 §6.1) + +### 3.4 扩展步骤 + +如需新步骤,参考 [reference.md §扩展步骤](reference.md) 中的开发和注册流程。 + +--- + +## 4 本地运行 + +### 4.1 前置条件 + +```bash +docker --version && docker compose version +``` + +如修改了 C++ 代码,需重新编译并更新镜像。两种方式: + +**方式一:完整构建**(慢,但保证一致) +```bash +make e2e_image # 从源码构建完整 Docker 镜像 aliyun/loongcollector:0.0.1 +``` + +**方式二:增量更新**(快,适合迭代调试) +```bash +cd build && make -sj$(nproc) && cd .. +# 替换镜像中的二进制 +docker create --name tmp-lc aliyun/loongcollector:0.0.1 +docker cp build/loongcollector tmp-lc:/usr/local/loongcollector/loongcollector +docker commit tmp-lc aliyun/loongcollector:0.0.1 +docker rm tmp-lc +``` + +### 4.2 运行 + +```bash +cd test/e2e + +# 运行整个测试用例(所有 Scenario) +TEST_CASE=flusher_otlp_native go test -v -run "TestE2EOnDockerCompose$" \ + -timeout 600s -count=1 ./... + +# 只运行指定 Scenario +TEST_CASE=flusher_otlp_native go test -v \ + -run "TestE2EOnDockerCompose/TestFlusherOTLPNativeLogs$" \ + -timeout 600s -count=1 ./... +``` + +### 4.3 清理(测试失败后必做) + +可以直接运行脚本 `bash .cursor/skills/e2e/scripts/e2e-cleanup.sh`,或手动执行: + +```bash +docker rm -f $(docker ps -aq) 2>/dev/null +docker network prune -f +rm -rf test/e2e/config test/e2e/onetime_pipeline_config +sudo rm -rf test/e2e/report +rm -f test/e2e/test_cases//testcase-compose.yaml +``` + +--- + +## 5 调试 + +```bash +# 1. 
查看容器日志
+docker ps | grep loongcollectorC
+docker exec <容器名> cat /usr/local/loongcollector/log/loongcollector.LOG
+
+# 2. 检查配置是否加载
+docker exec <容器名> ls /usr/local/loongcollector/conf/continuous_pipeline_config/local/
+
+# 3. 检查端口是否监听
+docker exec <容器名> ss -tlnp | grep <端口>
+
+# 4. 手动复现 compose 环境
+cd test/e2e/test_cases/<用例名>
+docker compose -f testcase-compose.yaml up -d
+docker compose -f testcase-compose.yaml logs -f loongcollectorC
+```
+
+---
+
+## 6 已知陷阱
+
+### 6.1 ExcutionTimeout 使配置变为一次性
+
+**绝对不要**在 `input_forward`、`input_file` 等持续插件的配置中使用 `global.ExcutionTimeout`。
+
+它会使 `IsOnetime()` 返回 true,导致 `IsValidNativeInputPlugin(name, true)` 在 onetime 注册表中查找,而大部分 input 只注册了 continuous,结果报 `unsupported input plugin`。
+
+详见 `.cursor/rules/project-knowledge/config-pitfalls.mdc`。
+
+### 6.2 FlusherFile 必须是文件
+
+e2e 模板将 `report/default_flusher.json` bind-mount 到容器。若宿主机路径不存在,Docker 会创建为**目录**。已在 `BootController.Start()` 中自动处理。
+
+### 6.3 测试间残留
+
+多 Scenario 共享进程,`Clean()` 会删除 config/report。异常退出后手动清理(§4.3)。
diff --git a/.cursor/skills/e2e/reference.md b/.cursor/skills/e2e/reference.md
new file mode 100644
index 0000000000..d1c662ae35
--- /dev/null
+++ b/.cursor/skills/e2e/reference.md
@@ -0,0 +1,134 @@
+# E2E 测试详细参考
+
+## 可用步骤速查
+
+> 权威来源:`test/engine/steps.go`
+
+### Given(环境准备)
+
+| 步骤模板 | 说明 |
+|----------|------|
+| `{docker-compose} environment` | 初始化 docker-compose 环境 |
+| `{host} environment` | 初始化主机环境 |
+| `{daemonset} environment` | 初始化 K8s 环境 |
+| `{name} local config as below` | 写入持续采集配置 |
+| `{name} onetime pipeline local config as below` | 写入一次性采集配置 |
+| `subcribe data from {sls} with config` | 订阅 SLS 数据源 |
+| `loongcollector depends on containers {name}` | 设置容器依赖 |
+| `loongcollector container mount {src} to {dst}` | 挂载卷 |
+| `loongcollector expose port {host} to {container}` | 暴露端口 |
+| `docker-compose boot type {type}` | 设置 boot 类型 |
+| `mkdir {path}` | 创建目录 |
+
+### When(触发动作)
+
+| 步骤模板 | 说明 |
+|----------|------|
+| `start docker-compose {case_name}` | 启动 docker-compose 环境 |
+| `begin trigger` | 标记触发开始时间(生成日志前必须调用) |
+| `generate {N} regex logs to file {path}, with interval {M}ms` | 生成正则日志 |
+| `generate {N} json logs to file {path}, with interval {M}ms` | 生成 JSON 日志 |
+| `generate {N} apsara logs to file {path}, with interval {M}ms` | 生成 Apsara 日志 |
+| `generate {N} OTLP {logs\|metrics\|traces} via otelgen to endpoint {ep}, protocol {grpc\|http}` | 生成 OTLP 数据 |
+| `generate {N} http logs, with interval {M}ms, url: {url}, method: {method}, body:` | 生成 HTTP 日志 |
+| `execute {N} commands {cmd} in sequence` | 顺序执行命令 |
+| `execute {N} commands {cmd} in parallel` | 并行执行命令 |
+| `create the shell script file {name} with the following content` | 创建 shell 脚本 |
+| `execute {N} the shell script file {name} in parallel` | 并行执行 shell 脚本 |
+| `restart agent` | 重启 Agent |
+| `force restart agent` | 强制重启 Agent |
+
+### Then(结果验证)
+
+| 步骤模板 | 说明 |
+|----------|------|
+| `there is {N} logs` | 精确验证日志数(上限 100) |
+| `there is at least {N} logs` | 最少日志数验证 |
+| `there is less than {N} logs` | 最多日志数验证 |
+| `the log fields match kv` | KV 字段匹配(文档内容跟 `"""..."""`) |
+| `the log fields match as below` | 日志字段模式匹配 |
+| `the log tags match kv` | Tag KV 匹配 |
+| `the log is in order` | 日志顺序验证 |
+| `wait {N} seconds` | 等待 N 秒 |
+| `otlp collector received at least {N} (logs\|metrics\|traces) from file {path}` | OTel Collector 数据验证 |
+
+> 注意:日志数量验证上限 100。超过 100 用 `When query through` + `Then the log fields match kv` 方式。
+
+---
+
+## 扩展步骤
+
+### 1.
编写函数 + +在 `test/engine/` 对应子目录下: + +```go +func MyVerification(ctx context.Context, expected int) (context.Context, error) { + // 实现逻辑 + return ctx, nil +} +``` + +签名要求:第一个参数 `context.Context`,返回 `(context.Context, error)`。 + +### 2. 注册 + +在 `test/engine/steps.go` 中: + +```go +ctx.Then(`^my verification expects \{(\d+)\}$`, verify.MyVerification) +``` + +### 3. 使用 + +```gherkin +Then my verification expects {42} +``` + +--- + +## docker-compose.yaml 示例 + +### OTel Collector(OTLP 测试用) + +```yaml +services: + otel-collector: + image: otel/opentelemetry-collector-contrib:latest + hostname: otel-collector + user: "0:0" + ports: + - "4317" + volumes: + - ./otel-collector-config.yaml:/etc/otelcol-contrib/config.yaml + - ./otel-export:/tmp/otel-export + healthcheck: + test: ["CMD", "wget", "--spider", "-q", "http://localhost:13133/"] + interval: 5s + timeout: 3s + retries: 5 + start_period: 10s +``` + +--- + +## eBPF 进程安全测试示例 + +```gherkin +@e2e @host @ebpf_input +Scenario: TestEBPFProcessSecurityByNormalStart + Given {host} environment + Given subcribe data from {sls} with config + """ + """ + Given {ebpf_process_security_default} local config as below + """ + enable: true + inputs: + - Type: input_process_security + """ + When begin trigger + When execute {1} commands {/bin/echo 1} in sequence + When query through {* | select * from e2e where call_name = 'execve' and binary = '/bin/echo' and arguments = '1'} + Then there is {1} logs +``` diff --git a/.cursor/skills/e2e/scripts/e2e-cleanup.sh b/.cursor/skills/e2e/scripts/e2e-cleanup.sh new file mode 100755 index 0000000000..65502faff3 --- /dev/null +++ b/.cursor/skills/e2e/scripts/e2e-cleanup.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash +# E2E 测试环境清理脚本 +# 用法: bash .cursor/skills/e2e/scripts/e2e-cleanup.sh [case_name] +set -euo pipefail + +REPO_ROOT="$(cd "$(dirname "$0")/../../.." && pwd)" +E2E_DIR="$REPO_ROOT/test/e2e" +CASE_NAME="${1:-}" + +echo "==> 停止并删除所有 Docker 容器..." +docker rm -f $(docker ps -aq) 2>/dev/null || true + +echo "==> 清理 Docker 网络..." +docker network prune -f 2>/dev/null || true + +echo "==> 清理运行时目录..." +rm -rf "$E2E_DIR/config" "$E2E_DIR/onetime_pipeline_config" +sudo rm -rf "$E2E_DIR/report" 2>/dev/null || rm -rf "$E2E_DIR/report" 2>/dev/null || true + +if [[ -n "$CASE_NAME" ]]; then + CASE_DIR="$E2E_DIR/test_cases/$CASE_NAME" + if [[ -d "$CASE_DIR" ]]; then + echo "==> 清理测试用例 $CASE_NAME..." + rm -f "$CASE_DIR/testcase-compose.yaml" + rm -f "$CASE_DIR/otel-export/"*.json 2>/dev/null || true + fi +else + echo "==> 清理所有测试用例的 testcase-compose.yaml..." 
+ find "$E2E_DIR/test_cases" -name "testcase-compose.yaml" -delete 2>/dev/null || true +fi + +echo "==> 清理完成" diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index 7e98b005ca..95f461da61 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -18,9 +18,18 @@ ARG USERNAME=admin ARG USER_PASSWORD USER root +RUN sed -i '/mirrors.aliyuncs.com\|mirrors.cloud.aliyuncs.com/d' /etc/yum.repos.d/CentOS-Base.repo RUN yum -y install openssh-server && \ ssh-keygen -A +# Feature Docker-in-Docker +COPY dind-install.sh /tmp/dind-install.sh +RUN chmod +x /tmp/dind-install.sh && \ + MOBY=false \ + DOCKERDASHCOMPOSEVERSION=none \ + INSTALLDOCKERBUILDX=false \ + /tmp/dind-install.sh + # Create the user COPY .env /tmp/.env COPY authorized_keys /tmp/authorized_keys @@ -51,7 +60,9 @@ RUN cp /opt/logtail/deps/lib/libssl.so.1.0.0 /usr/lib64; \ echo "export PATH=/usr/local/go/bin:/opt/logtail/deps/bin:$PATH" >> /home/$USERNAME/.bashrc; \ su - $USERNAME -c "\ go env -w GO111MODULE=on && \ - go env -w GOPROXY=https://goproxy.cn,direct" + go env -w GOPROXY=https://goproxy.cn,direct" && \ + usermod -aG docker $USERNAME USER $USERNAME +# ENTRYPOINT [ "/usr/local/share/docker-init.sh" ] \ No newline at end of file diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 76caba5e57..a8cbf0cebb 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -9,13 +9,16 @@ "privileged": true, "mounts": [ { "source": "/sys", "target": "/sys", "type": "bind" }, - { "source": "/", "target": "/logtail_host", "type": "bind" } + { "source": "/", "target": "/logtail_host", "type": "bind" }, + { "source": "loongcollector-dind-data", "target": "/var/lib/docker", "type": "volume" } ], "runArgs": [ "--cap-add=SYS_PTRACE", - "--security-opt", "seccomp=unconfined" + "--security-opt", "seccomp=unconfined", + "--privileged" ], "onCreateCommand": "sudo chown -R $(id -un):$(id -gn) /root", + "postStartCommand": "sudo bash /workspaces/loongcollector-github/.devcontainer/start-dind.sh", "postCreateCommand": "sudo /usr/sbin/sshd", "customizations": { "vscode": { diff --git a/.devcontainer/dind-install.sh b/.devcontainer/dind-install.sh new file mode 100644 index 0000000000..d364880676 --- /dev/null +++ b/.devcontainer/dind-install.sh @@ -0,0 +1,1022 @@ +#!/usr/bin/env bash +#------------------------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See https://go.microsoft.com/fwlink/?linkid=2090316 for license information. 
+#------------------------------------------------------------------------------------------------------------- +# +# Docs: https://github.com/microsoft/vscode-dev-containers/blob/main/script-library/docs/docker-in-docker.md +# Maintainer: The Dev Container spec maintainers + + +DOCKER_VERSION="${VERSION:-"latest"}" # The Docker/Moby Engine + CLI should match in version +USE_MOBY="${MOBY:-"true"}" +MOBY_BUILDX_VERSION="${MOBYBUILDXVERSION:-"latest"}" +DOCKER_DASH_COMPOSE_VERSION="${DOCKERDASHCOMPOSEVERSION:-"v2"}" #v1, v2 or none +AZURE_DNS_AUTO_DETECTION="${AZUREDNSAUTODETECTION:-"true"}" +DOCKER_DEFAULT_ADDRESS_POOL="${DOCKERDEFAULTADDRESSPOOL:-""}" +USERNAME="${USERNAME:-"${_REMOTE_USER:-"automatic"}"}" +INSTALL_DOCKER_BUILDX="${INSTALLDOCKERBUILDX:-"true"}" +INSTALL_DOCKER_COMPOSE_SWITCH="${INSTALLDOCKERCOMPOSESWITCH:-"false"}" +MICROSOFT_GPG_KEYS_URI="https://packages.microsoft.com/keys/microsoft.asc" +MICROSOFT_GPG_KEYS_ROLLING_URI="https://packages.microsoft.com/keys/microsoft-rolling.asc" +DOCKER_MOBY_ARCHIVE_VERSION_CODENAMES="trixie bookworm buster bullseye bionic focal jammy noble" +DOCKER_LICENSED_ARCHIVE_VERSION_CODENAMES="trixie bookworm buster bullseye bionic focal hirsute impish jammy noble" +DISABLE_IP6_TABLES="${DISABLEIP6TABLES:-false}" + +# Default: Exit on any failure. +set -e + +# Clean up +rm -rf /var/lib/apt/lists/* + +# Setup STDERR. +err() { + echo "(!) $*" >&2 +} + +if [ "$(id -u)" -ne 0 ]; then + err 'Script must be run as root. Use sudo, su, or add "USER root" to your Dockerfile before running this script.' + exit 1 +fi + +################### +# Helper Functions +# See: https://github.com/microsoft/vscode-dev-containers/blob/main/script-library/shared/utils.sh +################### + +# Determine the appropriate non-root user +if [ "${USERNAME}" = "auto" ] || [ "${USERNAME}" = "automatic" ]; then + USERNAME="" + POSSIBLE_USERS=("vscode" "node" "codespace" "$(awk -v val=1000 -F ":" '$3==val{print $1}' /etc/passwd)") + for CURRENT_USER in "${POSSIBLE_USERS[@]}"; do + if id -u ${CURRENT_USER} > /dev/null 2>&1; then + USERNAME=${CURRENT_USER} + break + fi + done + if [ "${USERNAME}" = "" ]; then + USERNAME=root + fi +elif [ "${USERNAME}" = "none" ] || ! id -u ${USERNAME} > /dev/null 2>&1; then + USERNAME=root +fi + +# Package manager update function +pkg_mgr_update() { + case ${ADJUSTED_ID} in + debian) + if [ "$(find /var/lib/apt/lists/* | wc -l)" = "0" ]; then + echo "Running apt-get update..." + apt-get update -y + fi + ;; + rhel) + if [ ${PKG_MGR_CMD} = "microdnf" ]; then + cache_check_dir="/var/cache/yum" + else + cache_check_dir="/var/cache/${PKG_MGR_CMD}" + fi + if [ "$(ls ${cache_check_dir}/* 2>/dev/null | wc -l)" = 0 ]; then + echo "Running ${PKG_MGR_CMD} makecache ..." + ${PKG_MGR_CMD} makecache + fi + ;; + esac +} + +# Checks if packages are installed and installs them if not +check_packages() { + case ${ADJUSTED_ID} in + debian) + if ! dpkg -s "$@" > /dev/null 2>&1; then + pkg_mgr_update + apt-get -y install --no-install-recommends "$@" + fi + ;; + rhel) + if ! 
rpm -q "$@" > /dev/null 2>&1; then + pkg_mgr_update + ${PKG_MGR_CMD} -y install "$@" + fi + ;; + esac +} + +# Figure out correct version of a three part version number is not passed +find_version_from_git_tags() { + local variable_name=$1 + local requested_version=${!variable_name} + if [ "${requested_version}" = "none" ]; then return; fi + local repository=$2 + local prefix=${3:-"tags/v"} + local separator=${4:-"."} + local last_part_optional=${5:-"false"} + if [ "$(echo "${requested_version}" | grep -o "." | wc -l)" != "2" ]; then + local escaped_separator=${separator//./\\.} + local last_part + if [ "${last_part_optional}" = "true" ]; then + last_part="(${escaped_separator}[0-9]+)?" + else + last_part="${escaped_separator}[0-9]+" + fi + local regex="${prefix}\\K[0-9]+${escaped_separator}[0-9]+${last_part}$" + local version_list="$(git ls-remote --tags ${repository} | grep -oP "${regex}" | tr -d ' ' | tr "${separator}" "." | sort -rV)" + if [ "${requested_version}" = "latest" ] || [ "${requested_version}" = "current" ] || [ "${requested_version}" = "lts" ]; then + declare -g ${variable_name}="$(echo "${version_list}" | head -n 1)" + else + set +e + declare -g ${variable_name}="$(echo "${version_list}" | grep -E -m 1 "^${requested_version//./\\.}([\\.\\s]|$)")" + set -e + fi + fi + if [ -z "${!variable_name}" ] || ! echo "${version_list}" | grep "^${!variable_name//./\\.}$" > /dev/null 2>&1; then + err "Invalid ${variable_name} value: ${requested_version}\nValid values:\n${version_list}" >&2 + exit 1 + fi + echo "${variable_name}=${!variable_name}" +} + +# Use semver logic to decrement a version number then look for the closest match +find_prev_version_from_git_tags() { + local variable_name=$1 + local current_version=${!variable_name} + local repository=$2 + # Normally a "v" is used before the version number, but support alternate cases + local prefix=${3:-"tags/v"} + # Some repositories use "_" instead of "." for version number part separation, support that + local separator=${4:-"."} + # Some tools release versions that omit the last digit (e.g. go) + local last_part_optional=${5:-"false"} + # Some repositories may have tags that include a suffix (e.g. actions/node-versions) + local version_suffix_regex=$6 + # Try one break fix version number less if we get a failure. Use "set +e" since "set -e" can cause failures in valid scenarios. 
+ set +e + major="$(echo "${current_version}" | grep -oE '^[0-9]+' || echo '')" + minor="$(echo "${current_version}" | grep -oP '^[0-9]+\.\K[0-9]+' || echo '')" + breakfix="$(echo "${current_version}" | grep -oP '^[0-9]+\.[0-9]+\.\K[0-9]+' 2>/dev/null || echo '')" + + if [ "${minor}" = "0" ] && [ "${breakfix}" = "0" ]; then + ((major=major-1)) + declare -g ${variable_name}="${major}" + # Look for latest version from previous major release + find_version_from_git_tags "${variable_name}" "${repository}" "${prefix}" "${separator}" "${last_part_optional}" + # Handle situations like Go's odd version pattern where "0" releases omit the last part + elif [ "${breakfix}" = "" ] || [ "${breakfix}" = "0" ]; then + ((minor=minor-1)) + declare -g ${variable_name}="${major}.${minor}" + # Look for latest version from previous minor release + find_version_from_git_tags "${variable_name}" "${repository}" "${prefix}" "${separator}" "${last_part_optional}" + else + ((breakfix=breakfix-1)) + if [ "${breakfix}" = "0" ] && [ "${last_part_optional}" = "true" ]; then + declare -g ${variable_name}="${major}.${minor}" + else + declare -g ${variable_name}="${major}.${minor}.${breakfix}" + fi + fi + set -e +} + +# Function to fetch the version released prior to the latest version +get_previous_version() { + local url=$1 + local repo_url=$2 + local variable_name=$3 + prev_version=${!variable_name} + + output=$(curl -s "$repo_url"); + if echo "$output" | jq -e 'type == "object"' > /dev/null; then + message=$(echo "$output" | jq -r '.message') + + if [[ $message == "API rate limit exceeded"* ]]; then + echo -e "\nAn attempt to find latest version using GitHub Api Failed... \nReason: ${message}" + echo -e "\nAttempting to find latest version using GitHub tags." + find_prev_version_from_git_tags prev_version "$url" "tags/v" + declare -g ${variable_name}="${prev_version}" + fi + elif echo "$output" | jq -e 'type == "array"' > /dev/null; then + echo -e "\nAttempting to find latest version using GitHub Api." + version=$(echo "$output" | jq -r '.[1].tag_name') + declare -g ${variable_name}="${version#v}" + fi + echo "${variable_name}=${!variable_name}" +} + +get_github_api_repo_url() { + local url=$1 + echo "${url/https:\/\/github.com/https:\/\/api.github.com\/repos}/releases" +} + +########################################### +# Start docker-in-docker installation +########################################### + +# Ensure apt is in non-interactive to avoid prompts +export DEBIAN_FRONTEND=noninteractive + +# Source /etc/os-release to get OS info +. /etc/os-release + +# Determine adjusted ID and package manager +if [ "${ID}" = "debian" ] || [ "${ID_LIKE}" = "debian" ]; then + ADJUSTED_ID="debian" + PKG_MGR_CMD="apt-get" + # Use dpkg for Debian-based systems + architecture="$(dpkg --print-architecture 2>/dev/null || uname -m)" +elif [[ "${ID}" = "rhel" || "${ID}" = "fedora" || "${ID}" = "azurelinux" || "${ID}" = "mariner" || "${ID_LIKE}" = *"rhel"* || "${ID_LIKE}" = *"fedora"* || "${ID_LIKE}" = *"azurelinux"* || "${ID_LIKE}" = *"mariner"* ]]; then + ADJUSTED_ID="rhel" + # Determine the appropriate package manager for RHEL-based systems + for pkg_mgr in tdnf dnf microdnf yum; do + if command -v "$pkg_mgr" >/dev/null 2>&1; then + PKG_MGR_CMD="$pkg_mgr" + break + fi + done + + if [ -z "${PKG_MGR_CMD}" ]; then + err "Unable to find a supported package manager (tdnf, dnf, microdnf, yum)" + exit 1 + fi + + architecture="$(rpm --eval '%{_arch}' 2>/dev/null || uname -m)" +else + err "Linux distro ${ID} not supported." 
+ exit 1 +fi + +# Azure Linux specific setup +if [ "${ID}" = "azurelinux" ]; then + VERSION_CODENAME="azurelinux${VERSION_ID}" +fi + +# Prevent attempting to install Moby on Debian trixie (packages removed) +if [ "${USE_MOBY}" = "true" ] && [ "${ID}" = "debian" ] && [ "${VERSION_CODENAME}" = "trixie" ]; then + err "The 'moby' option is not supported on Debian 'trixie' because 'moby-cli' and related system packages have been removed from that distribution." + err "To continue, either set the feature option '\"moby\": false' or use a different base image (for example: 'debian:bookworm' or 'ubuntu-24.04')." + exit 1 +fi + +# Check if distro is supported +if [ "${USE_MOBY}" = "true" ]; then + if [ "${ADJUSTED_ID}" = "debian" ]; then + if [[ "${DOCKER_MOBY_ARCHIVE_VERSION_CODENAMES}" != *"${VERSION_CODENAME}"* ]]; then + err "Unsupported distribution version '${VERSION_CODENAME}'. To resolve, either: (1) set feature option '\"moby\": false' , or (2) choose a compatible OS distribution" + err "Supported distributions include: ${DOCKER_MOBY_ARCHIVE_VERSION_CODENAMES}" + exit 1 + fi + echo "(*) ${VERSION_CODENAME} is supported for Moby installation - setting up Microsoft repository" + elif [ "${ADJUSTED_ID}" = "rhel" ]; then + if [ "${ID}" = "azurelinux" ] || [ "${ID}" = "mariner" ]; then + echo " (*) ${ID} ${VERSION_ID} detected - using Microsoft repositories for Moby packages" + else + echo "RHEL-based system (${ID}) detected - Moby packages may require additional configuration" + fi + fi +else + if [ "${ADJUSTED_ID}" = "debian" ]; then + if [[ "${DOCKER_LICENSED_ARCHIVE_VERSION_CODENAMES}" != *"${VERSION_CODENAME}"* ]]; then + err "Unsupported distribution version '${VERSION_CODENAME}'. To resolve, please choose a compatible OS distribution" + err "Supported distributions include: ${DOCKER_LICENSED_ARCHIVE_VERSION_CODENAMES}" + exit 1 + fi + echo "(*) ${VERSION_CODENAME} is supported for Docker CE installation (supported: ${DOCKER_LICENSED_ARCHIVE_VERSION_CODENAMES}) - setting up Docker repository" + elif [ "${ADJUSTED_ID}" = "rhel" ]; then + + echo "RHEL-based system (${ID}) detected - using Docker CE packages" + fi +fi + +# Install base dependencies +base_packages="curl ca-certificates pigz iptables gnupg2 wget jq" +case ${ADJUSTED_ID} in + debian) + check_packages apt-transport-https $base_packages dirmngr + ;; + rhel) + check_packages $base_packages tar gawk shadow-utils policycoreutils procps-ng systemd-libs systemd-devel + + ;; +esac + +# Install git if not already present +if ! 
command -v git >/dev/null 2>&1; then + check_packages git +fi + +# Update CA certificates to ensure HTTPS connections work properly +# This is especially important for Ubuntu 24.04 (Noble) and Debian Trixie +# Only run for Debian-based systems (RHEL uses update-ca-trust instead) +if [ "${ADJUSTED_ID}" = "debian" ] && command -v update-ca-certificates > /dev/null 2>&1; then + update-ca-certificates +fi + +# Swap to legacy iptables for compatibility (Debian only) +if [ "${ADJUSTED_ID}" = "debian" ] && type iptables-legacy > /dev/null 2>&1; then + update-alternatives --set iptables /usr/sbin/iptables-legacy + update-alternatives --set ip6tables /usr/sbin/ip6tables-legacy +fi + +# Set up the necessary repositories +if [ "${USE_MOBY}" = "true" ]; then + # Name of open source engine/cli + engine_package_name="moby-engine" + cli_package_name="moby-cli" + + case ${ADJUSTED_ID} in + debian) + # Import key safely and import Microsoft apt repo + { + curl -sSL ${MICROSOFT_GPG_KEYS_URI} + curl -sSL ${MICROSOFT_GPG_KEYS_ROLLING_URI} + } | gpg --dearmor > /usr/share/keyrings/microsoft-archive-keyring.gpg + echo "deb [arch=${architecture} signed-by=/usr/share/keyrings/microsoft-archive-keyring.gpg] https://packages.microsoft.com/repos/microsoft-${ID}-${VERSION_CODENAME}-prod ${VERSION_CODENAME} main" > /etc/apt/sources.list.d/microsoft.list + ;; + rhel) + echo "(*) ${ID} detected - checking for Moby packages..." + + # Check if moby packages are available in default repos + if ${PKG_MGR_CMD} list available moby-engine >/dev/null 2>&1; then + echo "(*) Using built-in ${ID} Moby packages" + else + case "${ID}" in + azurelinux) + echo "(*) Moby packages not found in Azure Linux repositories" + echo "(*) For Azure Linux, Docker CE ('moby': false) is recommended" + err "Moby packages are not available for Azure Linux ${VERSION_ID}." + err "Recommendation: Use '\"moby\": false' to install Docker CE instead." + exit 1 + ;; + mariner) + echo "(*) Adding Microsoft repository for CBL-Mariner..." + # Add Microsoft repository if packages aren't available locally + curl -sSL ${MICROSOFT_GPG_KEYS_URI} | gpg --dearmor > /etc/pki/rpm-gpg/microsoft.gpg + cat > /etc/yum.repos.d/microsoft.repo << EOF +[microsoft] +name=Microsoft Repository +baseurl=https://packages.microsoft.com/repos/microsoft-cbl-mariner-2.0-prod-base/ +enabled=1 +gpgcheck=1 +gpgkey=file:///etc/pki/rpm-gpg/microsoft.gpg +EOF + # Verify packages are available after adding repo + pkg_mgr_update + if ! ${PKG_MGR_CMD} list available moby-engine >/dev/null 2>&1; then + echo "(*) Moby packages not found in Microsoft repository either" + err "Moby packages are not available for CBL-Mariner ${VERSION_ID}." + err "Recommendation: Use '\"moby\": false' to install Docker CE instead." + exit 1 + fi + ;; + *) + err "Moby packages are not available for ${ID}. Please use 'moby': false option." 
+ exit 1 + ;; + esac + fi + ;; + esac +else + # Name of licensed engine/cli + engine_package_name="docker-ce" + cli_package_name="docker-ce-cli" + case ${ADJUSTED_ID} in + debian) + curl -fsSL https://download.docker.com/linux/${ID}/gpg | gpg --dearmor > /usr/share/keyrings/docker-archive-keyring.gpg + echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/${ID} ${VERSION_CODENAME} stable" > /etc/apt/sources.list.d/docker.list + ;; + rhel) + # Docker CE repository setup for RHEL-based systems + setup_docker_ce_repo() { + curl -fsSL https://download.docker.com/linux/centos/gpg > /etc/pki/rpm-gpg/docker-ce.gpg + cat > /etc/yum.repos.d/docker-ce.repo << EOF +[docker-ce-stable] +name=Docker CE Stable +baseurl=https://download.docker.com/linux/centos/9/\$basearch/stable +enabled=1 +gpgcheck=1 +gpgkey=file:///etc/pki/rpm-gpg/docker-ce.gpg +skip_if_unavailable=1 +module_hotfixes=1 +EOF + } + install_azure_linux_deps() { + echo "(*) Installing device-mapper libraries for Docker CE..." + [ "${ID}" != "mariner" ] && ${PKG_MGR_CMD} -y install device-mapper-libs 2>/dev/null || echo "(*) Device-mapper install failed, proceeding" + echo "(*) Installing additional Docker CE dependencies..." + ${PKG_MGR_CMD} -y install libseccomp libtool-ltdl systemd-libs libcgroup tar xz || { + echo "(*) Some optional dependencies could not be installed, continuing..." + } + } + setup_selinux_context() { + if command -v getenforce >/dev/null 2>&1 && [ "$(getenforce 2>/dev/null)" != "Disabled" ]; then + echo "(*) Creating minimal SELinux context for Docker compatibility..." + mkdir -p /etc/selinux/targeted/contexts/files/ 2>/dev/null || true + echo "/var/lib/docker(/.*)? system_u:object_r:container_file_t:s0" >> /etc/selinux/targeted/contexts/files/file_contexts.local 2>/dev/null || true + fi + } + + # Special handling for RHEL Docker CE installation + case "${ID}" in + azurelinux|mariner) + echo "(*) ${ID} detected" + echo "(*) Note: Moby packages work better on Azure Linux. Consider using 'moby': true" + echo "(*) Setting up Docker CE repository..." + + setup_docker_ce_repo + install_azure_linux_deps + + if [ "${USE_MOBY}" != "true" ]; then + echo "(*) Docker CE installation for Azure Linux - skipping container-selinux" + echo "(*) Note: SELinux policies will be minimal but Docker will function normally" + setup_selinux_context + else + echo "(*) Using Moby - container-selinux not required" + fi + ;; + *) + # Standard RHEL/CentOS/Fedora approach + if command -v dnf >/dev/null 2>&1; then + dnf config-manager --add-repo https://mirrors.aliyun.com/docker-ce/linux/centos/docker-ce.repo + elif command -v yum-config-manager >/dev/null 2>&1; then + yum-config-manager --add-repo https://mirrors.aliyun.com/docker-ce/linux/centos/docker-ce.repo + else + # Manual fallback + setup_docker_ce_repo + fi + ;; + esac + ;; + esac +fi + +# Refresh package database +case ${ADJUSTED_ID} in + debian) + apt-get update + ;; + rhel) + pkg_mgr_update + ;; +esac + +# Soft version matching +if [ "${DOCKER_VERSION}" = "latest" ] || [ "${DOCKER_VERSION}" = "lts" ] || [ "${DOCKER_VERSION}" = "stable" ]; then + # Empty, meaning grab whatever "latest" is in apt repo + engine_version_suffix="" + cli_version_suffix="" +else + case ${ADJUSTED_ID} in + debian) + # Fetch a valid version from the apt-cache (eg: the Microsoft repo appends +azure, breakfix, etc...) 
+ docker_version_dot_escaped="${DOCKER_VERSION//./\\.}" + docker_version_dot_plus_escaped="${docker_version_dot_escaped//+/\\+}" + # Regex needs to handle debian package version number format: https://www.systutorials.com/docs/linux/man/5-deb-version/ + docker_version_regex="^(.+:)?${docker_version_dot_plus_escaped}([\\.\\+ ~:-]|$)" + set +e # Don't exit if finding version fails - will handle gracefully + cli_version_suffix="=$(apt-cache madison ${cli_package_name} | awk -F"|" '{print $2}' | sed -e 's/^[ \t]*//' | grep -E -m 1 "${docker_version_regex}")" + engine_version_suffix="=$(apt-cache madison ${engine_package_name} | awk -F"|" '{print $2}' | sed -e 's/^[ \t]*//' | grep -E -m 1 "${docker_version_regex}")" + set -e + if [ -z "${engine_version_suffix}" ] || [ "${engine_version_suffix}" = "=" ] || [ -z "${cli_version_suffix}" ] || [ "${cli_version_suffix}" = "=" ] ; then + err "No full or partial Docker / Moby version match found for \"${DOCKER_VERSION}\" on OS ${ID} ${VERSION_CODENAME} (${architecture}). Available versions:" + apt-cache madison ${cli_package_name} | awk -F"|" '{print $2}' | grep -oP '^(.+:)?\K.+' + exit 1 + fi + ;; +rhel) + # For RHEL-based systems, use dnf/yum to find versions + docker_version_escaped="${DOCKER_VERSION//./\\.}" + set +e # Don't exit if finding version fails - will handle gracefully + if [ "${USE_MOBY}" = "true" ]; then + available_versions=$(${PKG_MGR_CMD} list --available moby-engine 2>/dev/null | grep -v "Available Packages" | awk '{print $2}' | grep -E "^${docker_version_escaped}" | head -1) + else + available_versions=$(${PKG_MGR_CMD} list --available docker-ce 2>/dev/null | grep -v "Available Packages" | awk '{print $2}' | grep -E "^${docker_version_escaped}" | head -1) + fi + set -e + if [ -n "${available_versions}" ]; then + engine_version_suffix="-${available_versions}" + cli_version_suffix="-${available_versions}" + else + echo "(*) Exact version ${DOCKER_VERSION} not found, using latest available" + engine_version_suffix="" + cli_version_suffix="" + fi + ;; + esac +fi + +# Version matching for moby-buildx +if [ "${USE_MOBY}" = "true" ]; then + if [ "${MOBY_BUILDX_VERSION}" = "latest" ]; then + # Empty, meaning grab whatever "latest" is in apt repo + buildx_version_suffix="" + else + case ${ADJUSTED_ID} in + debian) + buildx_version_dot_escaped="${MOBY_BUILDX_VERSION//./\\.}" + buildx_version_dot_plus_escaped="${buildx_version_dot_escaped//+/\\+}" + buildx_version_regex="^(.+:)?${buildx_version_dot_plus_escaped}([\\.\\+ ~:-]|$)" + set +e + buildx_version_suffix="=$(apt-cache madison moby-buildx | awk -F"|" '{print $2}' | sed -e 's/^[ \t]*//' | grep -E -m 1 "${buildx_version_regex}")" + set -e + if [ -z "${buildx_version_suffix}" ] || [ "${buildx_version_suffix}" = "=" ]; then + err "No full or partial moby-buildx version match found for \"${MOBY_BUILDX_VERSION}\" on OS ${ID} ${VERSION_CODENAME} (${architecture}). 
Available versions:" + apt-cache madison moby-buildx | awk -F"|" '{print $2}' | grep -oP '^(.+:)?\K.+' + exit 1 + fi + ;; + rhel) + # For RHEL-based systems, try to find buildx version or use latest + buildx_version_escaped="${MOBY_BUILDX_VERSION//./\\.}" + set +e + available_buildx=$(${PKG_MGR_CMD} list --available moby-buildx 2>/dev/null | grep -v "Available Packages" | awk '{print $2}' | grep -E "^${buildx_version_escaped}" | head -1) + set -e + if [ -n "${available_buildx}" ]; then + buildx_version_suffix="-${available_buildx}" + else + echo "(*) Exact buildx version ${MOBY_BUILDX_VERSION} not found, using latest available" + buildx_version_suffix="" + fi + ;; + esac + echo "buildx_version_suffix ${buildx_version_suffix}" + fi +fi + +# Install Docker / Moby CLI if not already installed +if type docker > /dev/null 2>&1 && type dockerd > /dev/null 2>&1; then + echo "Docker / Moby CLI and Engine already installed." +else + case ${ADJUSTED_ID} in + debian) + if [ "${USE_MOBY}" = "true" ]; then + # Install engine + set +e # Handle error gracefully + apt-get -y install --no-install-recommends moby-cli${cli_version_suffix} moby-buildx${buildx_version_suffix} moby-engine${engine_version_suffix} + exit_code=$? + set -e + + if [ ${exit_code} -ne 0 ]; then + err "Packages for moby not available in OS ${ID} ${VERSION_CODENAME} (${architecture}). To resolve, either: (1) set feature option '\"moby\": false' , or (2) choose a compatible OS version (eg: 'ubuntu-24.04')." + exit 1 + fi + + # Install compose + apt-get -y install --no-install-recommends moby-compose || err "Package moby-compose (Docker Compose v2) not available for OS ${ID} ${VERSION_CODENAME} (${architecture}). Skipping." + else + apt-get -y install --no-install-recommends docker-ce-cli${cli_version_suffix} docker-ce${engine_version_suffix} + # Install compose + apt-mark hold docker-ce docker-ce-cli + apt-get -y install --no-install-recommends docker-compose-plugin || echo "(*) Package docker-compose-plugin (Docker Compose v2) not available for OS ${ID} ${VERSION_CODENAME} (${architecture}). Skipping." + fi + ;; + rhel) + if [ "${USE_MOBY}" = "true" ]; then + set +e # Handle error gracefully + ${PKG_MGR_CMD} -y install moby-cli${cli_version_suffix} moby-engine${engine_version_suffix} + exit_code=$? + set -e + + if [ ${exit_code} -ne 0 ]; then + err "Packages for moby not available in OS ${ID} ${VERSION_CODENAME} (${architecture}). To resolve, either: (1) set feature option '\"moby\": false' , or (2) choose a compatible OS version." + exit 1 + fi + + # Install compose + if [ "${DOCKER_DASH_COMPOSE_VERSION}" != "none" ]; then + ${PKG_MGR_CMD} -y install moby-compose || echo "(*) Package moby-compose not available for ${ID} ${VERSION_CODENAME} (${architecture}). Skipping." + fi + else + # Special handling for Azure Linux Docker CE installation + if [ "${ID}" = "azurelinux" ] || [ "${ID}" = "mariner" ]; then + echo "(*) Installing Docker CE on Azure Linux (bypassing container-selinux dependency)..." + + # Use rpm with --force and --nodeps for Azure Linux + set +e # Don't exit on error for this section + ${PKG_MGR_CMD} -y install docker-ce${cli_version_suffix} docker-ce-cli${engine_version_suffix} containerd.io + install_result=$? + set -e + + if [ $install_result -ne 0 ]; then + echo "(*) Standard installation failed, trying manual installation..." + + echo "(*) Standard installation failed, trying manual installation..." 
+ + # Create directory for downloading packages + mkdir -p /tmp/docker-ce-install + + # Download packages manually using curl since tdnf doesn't support download + echo "(*) Downloading Docker CE packages manually..." + + # Get the repository baseurl + repo_baseurl="https://download.docker.com/linux/centos/9/x86_64/stable" + + # Download packages directly + cd /tmp/docker-ce-install + + # Get package names with versions + if [ -n "${cli_version_suffix}" ]; then + docker_ce_version="${cli_version_suffix#-}" + docker_cli_version="${engine_version_suffix#-}" + else + # Get latest version from repository + docker_ce_version="latest" + fi + + echo "(*) Attempting to download Docker CE packages from repository..." + + # Try to download latest packages if specific version fails + if ! curl -fsSL "${repo_baseurl}/Packages/docker-ce-${docker_ce_version}.el9.x86_64.rpm" -o docker-ce.rpm 2>/dev/null; then + # Fallback: try to get latest available version + echo "(*) Specific version not found, trying latest..." + latest_docker=$(curl -s "${repo_baseurl}/Packages/" | grep -o 'docker-ce-[0-9][^"]*\.el9\.x86_64\.rpm' | head -1) + latest_cli=$(curl -s "${repo_baseurl}/Packages/" | grep -o 'docker-ce-cli-[0-9][^"]*\.el9\.x86_64\.rpm' | head -1) + latest_containerd=$(curl -s "${repo_baseurl}/Packages/" | grep -o 'containerd\.io-[0-9][^"]*\.el9\.x86_64\.rpm' | head -1) + + if [ -n "${latest_docker}" ]; then + curl -fsSL "${repo_baseurl}/Packages/${latest_docker}" -o docker-ce.rpm + curl -fsSL "${repo_baseurl}/Packages/${latest_cli}" -o docker-ce-cli.rpm + curl -fsSL "${repo_baseurl}/Packages/${latest_containerd}" -o containerd.io.rpm + else + echo "(*) ERROR: Could not find Docker CE packages in repository" + echo "(*) Please check repository configuration or use 'moby': true" + exit 1 + fi + fi + # Install systemd libraries required by Docker CE + echo "(*) Installing systemd libraries required by Docker CE..." + ${PKG_MGR_CMD} -y install systemd-libs || ${PKG_MGR_CMD} -y install systemd-devel || { + echo "(*) WARNING: Could not install systemd libraries" + echo "(*) Docker may fail to start without these" + } + + # Install with rpm --force --nodeps + echo "(*) Installing Docker CE packages with dependency override..." + rpm -Uvh --force --nodeps *.rpm + + # Cleanup + cd / + rm -rf /tmp/docker-ce-install + + echo "(*) Docker CE installation completed with dependency bypass" + echo "(*) Note: Some SELinux functionality may be limited without container-selinux" + fi + else + # Standard installation for other RHEL-based systems + ${PKG_MGR_CMD} -y install docker-ce${cli_version_suffix} docker-ce-cli${engine_version_suffix} containerd.io + fi + # Install compose + if [ "${DOCKER_DASH_COMPOSE_VERSION}" != "none" ]; then + ${PKG_MGR_CMD} -y install docker-compose-plugin || echo "(*) Package docker-compose-plugin not available for ${ID} ${VERSION_CODENAME} (${architecture}). Skipping." + fi + fi + ;; + esac +fi + +echo "Finished installing docker / moby!" + +docker_home="/usr/libexec/docker" +cli_plugins_dir="${docker_home}/cli-plugins" + +# fallback for docker-compose +fallback_compose(){ + local url=$1 + local repo_url=$(get_github_api_repo_url "$url") + echo -e "\n(!) Failed to fetch the latest artifacts for docker-compose v${compose_version}..." 
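+    # Assumption: get_previous_version and get_github_api_repo_url are shared helpers
+    # defined earlier in this script; they query the GitHub releases API for the most
+    # recent prior tag and write it back into compose_version before the retry below.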
+ get_previous_version "${url}" "${repo_url}" compose_version + echo -e "\nAttempting to install v${compose_version}" + curl -fsSL "https://github.com/docker/compose/releases/download/v${compose_version}/docker-compose-linux-${target_compose_arch}" -o ${docker_compose_path} +} + +# If 'docker-compose' command is to be included +if [ "${DOCKER_DASH_COMPOSE_VERSION}" != "none" ]; then + case "${architecture}" in + amd64|x86_64) target_compose_arch=x86_64 ;; + arm64|aarch64) target_compose_arch=aarch64 ;; + *) + echo "(!) Docker in docker does not support machine architecture '$architecture'. Please use an x86-64 or ARM64 machine." + exit 1 + esac + + docker_compose_path="/usr/local/bin/docker-compose" + if [ "${DOCKER_DASH_COMPOSE_VERSION}" = "v1" ]; then + err "The final Compose V1 release, version 1.29.2, was May 10, 2021. These packages haven't received any security updates since then. Use at your own risk." + INSTALL_DOCKER_COMPOSE_SWITCH="false" + + if [ "${target_compose_arch}" = "x86_64" ]; then + echo "(*) Installing docker compose v1..." + curl -fsSL "https://github.com/docker/compose/releases/download/1.29.2/docker-compose-Linux-x86_64" -o ${docker_compose_path} + chmod +x ${docker_compose_path} + + # Download the SHA256 checksum + DOCKER_COMPOSE_SHA256="$(curl -sSL "https://github.com/docker/compose/releases/download/1.29.2/docker-compose-Linux-x86_64.sha256" | awk '{print $1}')" + echo "${DOCKER_COMPOSE_SHA256} ${docker_compose_path}" > docker-compose.sha256sum + sha256sum -c docker-compose.sha256sum --ignore-missing + elif [ "${VERSION_CODENAME}" = "bookworm" ]; then + err "Docker compose v1 is unavailable for 'bookworm' on Arm64. Kindly switch to use v2" + exit 1 + else + # Use pip to get a version that runs on this architecture + check_packages python3-minimal python3-pip libffi-dev python3-venv + echo "(*) Installing docker compose v1 via pip..." + export PYTHONUSERBASE=/usr/local + pip3 install --disable-pip-version-check --no-cache-dir --user "Cython<3.0" pyyaml wheel docker-compose --no-build-isolation + fi + else + compose_version=${DOCKER_DASH_COMPOSE_VERSION#v} + docker_compose_url="https://github.com/docker/compose" + find_version_from_git_tags compose_version "$docker_compose_url" "tags/v" + echo "(*) Installing docker-compose ${compose_version}..." + curl -fsSL "https://github.com/docker/compose/releases/download/v${compose_version}/docker-compose-linux-${target_compose_arch}" -o ${docker_compose_path} || { + echo -e "\n(!) Failed to fetch the latest artifacts for docker-compose v${compose_version}..." + fallback_compose "$docker_compose_url" + } + + chmod +x ${docker_compose_path} + + # Download the SHA256 checksum + DOCKER_COMPOSE_SHA256="$(curl -sSL "https://github.com/docker/compose/releases/download/v${compose_version}/docker-compose-linux-${target_compose_arch}.sha256" | awk '{print $1}')" + echo "${DOCKER_COMPOSE_SHA256} ${docker_compose_path}" > docker-compose.sha256sum + sha256sum -c docker-compose.sha256sum --ignore-missing + + mkdir -p ${cli_plugins_dir} + cp ${docker_compose_path} ${cli_plugins_dir} + fi +fi + +# fallback method for compose-switch +fallback_compose-switch() { + local url=$1 + local repo_url=$(get_github_api_repo_url "$url") + echo -e "\n(!) Failed to fetch the latest artifacts for compose-switch v${compose_switch_version}..." 
+ get_previous_version "$url" "$repo_url" compose_switch_version + echo -e "\nAttempting to install v${compose_switch_version}" + curl -fsSL "https://github.com/docker/compose-switch/releases/download/v${compose_switch_version}/docker-compose-linux-${target_switch_arch}" -o /usr/local/bin/compose-switch +} +# Install docker-compose switch if not already installed - https://github.com/docker/compose-switch#manual-installation +if [ "${INSTALL_DOCKER_COMPOSE_SWITCH}" = "true" ] && ! type compose-switch > /dev/null 2>&1; then + if type docker-compose > /dev/null 2>&1; then + echo "(*) Installing compose-switch..." + current_compose_path="$(command -v docker-compose)" + target_compose_path="$(dirname "${current_compose_path}")/docker-compose-v1" + compose_switch_version="latest" + compose_switch_url="https://github.com/docker/compose-switch" + # Try to get latest version, fallback to known stable version if GitHub API fails + set +e + find_version_from_git_tags compose_switch_version "$compose_switch_url" + if [ $? -ne 0 ] || [ -z "${compose_switch_version}" ] || [ "${compose_switch_version}" = "latest" ]; then + echo "(*) GitHub API rate limited or failed, using fallback method" + fallback_compose-switch "$compose_switch_url" + fi + set -e + + # Map architecture for compose-switch downloads + case "${architecture}" in + amd64|x86_64) target_switch_arch=amd64 ;; + arm64|aarch64) target_switch_arch=arm64 ;; + *) target_switch_arch=${architecture} ;; + esac + curl -fsSL "https://github.com/docker/compose-switch/releases/download/v${compose_switch_version}/docker-compose-linux-${target_switch_arch}" -o /usr/local/bin/compose-switch || fallback_compose-switch "$compose_switch_url" + chmod +x /usr/local/bin/compose-switch + # TODO: Verify checksum once available: https://github.com/docker/compose-switch/issues/11 + # Setup v1 CLI as alternative in addition to compose-switch (which maps to v2) + mv "${current_compose_path}" "${target_compose_path}" + update-alternatives --install ${docker_compose_path} docker-compose /usr/local/bin/compose-switch 99 + update-alternatives --install ${docker_compose_path} docker-compose "${target_compose_path}" 1 + else + err "Skipping installation of compose-switch as docker compose is unavailable..." + fi +fi + +# If init file already exists, exit +if [ -f "/usr/local/share/docker-init.sh" ]; then + echo "/usr/local/share/docker-init.sh already exists, so exiting." + # Clean up + rm -rf /var/lib/apt/lists/* + exit 0 +fi +echo "docker-init doesn't exist, adding..." + +if ! cat /etc/group | grep -e "^docker:" > /dev/null 2>&1; then + groupadd -r docker +fi + +usermod -aG docker ${USERNAME} + +# fallback for docker/buildx +fallback_buildx() { + local url=$1 + local repo_url=$(get_github_api_repo_url "$url") + echo -e "\n(!) Failed to fetch the latest artifacts for docker buildx v${buildx_version}..." + get_previous_version "$url" "$repo_url" buildx_version + buildx_file_name="buildx-v${buildx_version}.linux-${target_buildx_arch}" + echo -e "\nAttempting to install v${buildx_version}" + wget https://github.com/docker/buildx/releases/download/v${buildx_version}/${buildx_file_name} +} + +if [ "${INSTALL_DOCKER_BUILDX}" = "true" ]; then + buildx_version="latest" + docker_buildx_url="https://github.com/docker/buildx" + find_version_from_git_tags buildx_version "$docker_buildx_url" "refs/tags/v" + echo "(*) Installing buildx ${buildx_version}..." 
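+    # Illustrative example (version shown is hypothetical for any given run): if
+    # buildx_version resolves to 0.17.1 on amd64, the download below fetches
+    #   https://github.com/docker/buildx/releases/download/v0.17.1/buildx-v0.17.1.linux-amd64
+    # and installs it as the docker-buildx CLI plugin under /usr/libexec/docker/cli-plugins.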
+ + # Map architecture for buildx downloads + case "${architecture}" in + amd64|x86_64) target_buildx_arch=amd64 ;; + arm64|aarch64) target_buildx_arch=arm64 ;; + *) target_buildx_arch=${architecture} ;; + esac + + buildx_file_name="buildx-v${buildx_version}.linux-${target_buildx_arch}" + + cd /tmp + wget https://github.com/docker/buildx/releases/download/v${buildx_version}/${buildx_file_name} || fallback_buildx "$docker_buildx_url" + + docker_home="/usr/libexec/docker" + cli_plugins_dir="${docker_home}/cli-plugins" + + mkdir -p ${cli_plugins_dir} + mv ${buildx_file_name} ${cli_plugins_dir}/docker-buildx + chmod +x ${cli_plugins_dir}/docker-buildx + + chown -R "${USERNAME}:docker" "${docker_home}" + chmod -R g+r+w "${docker_home}" + find "${docker_home}" -type d -print0 | xargs -n 1 -0 chmod g+s +fi + +DOCKER_DEFAULT_IP6_TABLES="" +if [ "$DISABLE_IP6_TABLES" == true ]; then + requested_version="" + # checking whether the version requested either is in semver format or just a number denoting the major version + # and, extracting the major version number out of the two scenarios + semver_regex="^(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)(-([0-9A-Za-z-]+(\.[0-9A-Za-z-]+)*))?(\+([0-9A-Za-z-]+(\.[0-9A-Za-z-]+)*))?$" + if echo "$DOCKER_VERSION" | grep -Eq $semver_regex; then + requested_version=$(echo $DOCKER_VERSION | cut -d. -f1) + elif echo "$DOCKER_VERSION" | grep -Eq "^[1-9][0-9]*$"; then + requested_version=$DOCKER_VERSION + fi + if [ "$DOCKER_VERSION" = "latest" ] || [[ -n "$requested_version" && "$requested_version" -ge 27 ]] ; then + DOCKER_DEFAULT_IP6_TABLES="--ip6tables=false" + echo "(!) As requested, passing '${DOCKER_DEFAULT_IP6_TABLES}'" + fi +fi + +if [ ! -d /usr/local/share ]; then + mkdir -p /usr/local/share +fi + +tee /usr/local/share/docker-init.sh > /dev/null \ +<< EOF +#!/bin/sh +#------------------------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See https://go.microsoft.com/fwlink/?linkid=2090316 for license information. +#------------------------------------------------------------------------------------------------------------- + +set -e + +AZURE_DNS_AUTO_DETECTION=${AZURE_DNS_AUTO_DETECTION} +DOCKER_DEFAULT_ADDRESS_POOL=${DOCKER_DEFAULT_ADDRESS_POOL} +DOCKER_DEFAULT_IP6_TABLES=${DOCKER_DEFAULT_IP6_TABLES} +EOF + +tee -a /usr/local/share/docker-init.sh > /dev/null \ +<< 'EOF' +dockerd_start="AZURE_DNS_AUTO_DETECTION=${AZURE_DNS_AUTO_DETECTION} DOCKER_DEFAULT_ADDRESS_POOL=${DOCKER_DEFAULT_ADDRESS_POOL} DOCKER_DEFAULT_IP6_TABLES=${DOCKER_DEFAULT_IP6_TABLES} $(cat << 'INNEREOF' + # explicitly remove dockerd and containerd PID file to ensure that it can start properly if it was stopped uncleanly + find /run /var/run -iname 'docker*.pid' -delete || : + find /run /var/run -iname 'container*.pid' -delete || : + + # -- Start: dind wrapper script -- + # Maintained: https://github.com/moby/moby/blob/master/hack/dind + + export container=docker + + if [ -d /sys/kernel/security ] && ! mountpoint -q /sys/kernel/security; then + mount -t securityfs none /sys/kernel/security || { + echo >&2 'Could not mount /sys/kernel/security.' + echo >&2 'AppArmor detection and --privileged mode might break.' + } + fi + + # Mount /tmp (conditionally) + if ! 
mountpoint -q /tmp; then + mount -t tmpfs none /tmp + fi + + set_cgroup_nesting() + { + # cgroup v2: enable nesting + if [ -f /sys/fs/cgroup/cgroup.controllers ]; then + # move the processes from the root group to the /init group, + # otherwise writing subtree_control fails with EBUSY. + # An error during moving non-existent process (i.e., "cat") is ignored. + mkdir -p /sys/fs/cgroup/init + xargs -rn1 < /sys/fs/cgroup/cgroup.procs > /sys/fs/cgroup/init/cgroup.procs || : + # enable controllers + sed -e 's/ / +/g' -e 's/^/+/' < /sys/fs/cgroup/cgroup.controllers \ + > /sys/fs/cgroup/cgroup.subtree_control + fi + } + + # Set cgroup nesting, retrying if necessary + retry_cgroup_nesting=0 + + until [ "${retry_cgroup_nesting}" -eq "5" ]; + do + set +e + set_cgroup_nesting + + if [ $? -ne 0 ]; then + echo "(*) cgroup v2: Failed to enable nesting, retrying..." + else + break + fi + + retry_cgroup_nesting=`expr $retry_cgroup_nesting + 1` + set -e + done + + # -- End: dind wrapper script -- + + # Handle DNS + set +e + cat /etc/resolv.conf | grep -i 'internal.cloudapp.net' > /dev/null 2>&1 + if [ $? -eq 0 ] && [ "${AZURE_DNS_AUTO_DETECTION}" = "true" ] + then + echo "Setting dockerd Azure DNS." + CUSTOMDNS="--dns 168.63.129.16" + else + echo "Not setting dockerd DNS manually." + CUSTOMDNS="" + fi + set -e + + if [ -z "$DOCKER_DEFAULT_ADDRESS_POOL" ] + then + DEFAULT_ADDRESS_POOL="" + else + DEFAULT_ADDRESS_POOL="--default-address-pool $DOCKER_DEFAULT_ADDRESS_POOL" + fi + + # Start docker/moby engine + ( dockerd $CUSTOMDNS $DEFAULT_ADDRESS_POOL $DOCKER_DEFAULT_IP6_TABLES > /tmp/dockerd.log 2>&1 ) & +INNEREOF +)" + +sudo_if() { + COMMAND="$*" + + if [ "$(id -u)" -ne 0 ]; then + sudo $COMMAND + else + $COMMAND + fi +} + +retry_docker_start_count=0 +docker_ok="false" + +until [ "${docker_ok}" = "true" ] || [ "${retry_docker_start_count}" -eq "5" ]; +do + # Start using sudo if not invoked as root + if [ "$(id -u)" -ne 0 ]; then + sudo /bin/sh -c "${dockerd_start}" + else + eval "${dockerd_start}" + fi + + retry_count=0 + until [ "${docker_ok}" = "true" ] || [ "${retry_count}" -eq "5" ]; + do + sleep 1s + set +e + docker info > /dev/null 2>&1 && docker_ok="true" + set -e + + retry_count=`expr $retry_count + 1` + done + + if [ "${docker_ok}" != "true" ] && [ "${retry_docker_start_count}" != "4" ]; then + echo "(*) Failed to start docker, retrying..." + set +e + sudo_if pkill dockerd + sudo_if pkill containerd + set -e + fi + + retry_docker_start_count=`expr $retry_docker_start_count + 1` +done + +# Execute whatever commands were passed in (if any). This allows us +# to set this script to ENTRYPOINT while still executing the default CMD. +exec "$@" +EOF + +chmod +x /usr/local/share/docker-init.sh +chown ${USERNAME}:root /usr/local/share/docker-init.sh + +# Clean up +rm -rf /var/lib/apt/lists/* + +echo 'docker-in-docker-debian script has completed!' diff --git a/.devcontainer/start-dind.sh b/.devcontainer/start-dind.sh new file mode 100755 index 0000000000..586f436a58 --- /dev/null +++ b/.devcontainer/start-dind.sh @@ -0,0 +1,32 @@ +#!/bin/bash +# Start Docker-in-Docker with cgroup v1 nesting fix. +# Usage: sudo bash start-dind.sh +set -e + +# 1. Load iptables kernel modules (required by dockerd networking) +modprobe ip_tables iptable_nat iptable_filter 2>/dev/null || true + +# 2. Fix cgroup v1 nesting for DinD +# In a privileged container on cgroup v1, each subsystem shows the full +# host hierarchy. Inner Docker's runc expects the container's own cgroup +# as root. 
We bind-mount each subsystem to the container's own cgroup dir. +if [ ! -f /sys/fs/cgroup/cgroup.controllers ]; then + SELF_CGROUP_ID=$(grep ':memory:' /proc/1/cgroup | cut -d: -f3 | sed 's|^/docker/||') + if [ -n "$SELF_CGROUP_ID" ]; then + for subsys_dir in /sys/fs/cgroup/*/; do + subsys_name=$(basename "$subsys_dir") + [ -L "/sys/fs/cgroup/$subsys_name" ] && continue + our_dir="$subsys_dir/docker/$SELF_CGROUP_ID" + if [ -d "$our_dir" ]; then + mount --bind "$our_dir" "$subsys_dir" 2>/dev/null || true + fi + done + fi +fi + +# 3. Start Docker daemon via the DinD init script +/usr/local/share/docker-init.sh +sleep 2 + +# 4. Ensure non-root users can access the socket +chmod 666 /var/run/docker.sock 2>/dev/null || true diff --git a/.gitignore b/.gitignore index 1392b48ba4..7df4b01410 100644 --- a/.gitignore +++ b/.gitignore @@ -59,9 +59,7 @@ _deps # Custom /build/ core/build/ -core/protobuf/config_server/*/*.pb.* -core/protobuf/*/*.pb.* -core/log_pb/*.pb.* +*.pb.* core/common/Version.cpp !/Makefile # Enterprise @@ -90,9 +88,11 @@ plugins/all/ *.go.mod.sum # Custom plugin_logger.xml +go_plugin.LOG ### E2E /*-test/ +testcase-compose.yaml ### License find_licenses/ @@ -106,7 +106,10 @@ license_coverage.txt /dist/ /tags/ -### Cursor +### IDE configs /.cursor/ +/.claude/settings.local.json /.claude/ /.gemini/ +.omc/ +/code-review/ diff --git a/core/CMakeLists.txt b/core/CMakeLists.txt index 2f5657adac..1695046a44 100644 --- a/core/CMakeLists.txt +++ b/core/CMakeLists.txt @@ -160,7 +160,7 @@ if (LINUX) endif() set(SUB_DIRECTORIES_LIST ${SUB_DIRECTORIES_LIST} ebpf ebpf/type ebpf/type/table ebpf/util ebpf/util/sampler ebpf/protocol/http ebpf/protocol/mysql ebpf/protocol/redis ebpf/protocol ebpf/plugin/file_security ebpf/plugin/network_observer ebpf/plugin/process_security ebpf/plugin/network_security ebpf/plugin ebpf/observer ebpf/security prometheus prometheus/labels prometheus/schedulers prometheus/async prometheus/component - host_monitor host_monitor/collector host_monitor/common forward forward/loongsuite + host_monitor host_monitor/collector host_monitor/common forward forward/loongsuite forward/otlp runner/sink/grpc protobuf/opentelemetry ) elseif(MSVC) endif () @@ -171,6 +171,8 @@ endif() # Module includes & set files. 
include_directories(${CMAKE_CURRENT_SOURCE_DIR}) include_directories("${DEPS_INCLUDE_ROOT}/coolbpf") +# protobuf/ is needed so that generated OTLP proto headers (which use #include "opentelemetry/proto/...") can be resolved +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/protobuf) foreach (DIR_NAME ${SUB_DIRECTORIES_LIST}) include_directories(${CMAKE_CURRENT_SOURCE_DIR}/${DIR_NAME}) @@ -181,6 +183,12 @@ foreach (DIR_NAME ${SUB_DIRECTORIES_LIST}) list(APPEND FRAMEWORK_SOURCE_FILES ${TEMP_SOURCE_FILES}) endforeach (DIR_NAME) +# OTLP protobuf files are nested under proto/collector/*/v1/ — use GLOB_RECURSE (Linux-only) +if (LINUX) + file(GLOB_RECURSE OTLP_PROTO_FILES protobuf/opentelemetry/proto/*.c protobuf/opentelemetry/proto/*.cc protobuf/opentelemetry/proto/*.cpp) + list(APPEND FRAMEWORK_SOURCE_FILES ${OTLP_PROTO_FILES}) +endif() + if (ENABLE_ENTERPRISE) # remove several files in shennong/sdk list(REMOVE_ITEM FRAMEWORK_SOURCE_FILES ${CMAKE_CURRENT_SOURCE_DIR}/shennong/sdk/sample.cpp) @@ -193,6 +201,9 @@ if(MSVC) # remove linux event listener file(GLOB REMOVE_EVENT_LISTENER_SOURCES file_server/event_listener/*_Linux.cpp file_server/event_listener/*_Linux.h) list(REMOVE_ITEM FRAMEWORK_SOURCE_FILES ${REMOVE_EVENT_LISTENER_SOURCES}) + # remove OTLP serializer (depends on protobuf/grpc not available on Windows) + file(GLOB REMOVE_OTLP_SERIALIZER_SOURCES collection_pipeline/serializer/OTLP*.cpp collection_pipeline/serializer/OTLP*.h) + list(REMOVE_ITEM FRAMEWORK_SOURCE_FILES ${REMOVE_OTLP_SERIALIZER_SOURCES}) elseif(UNIX) # remove windows event listener file(GLOB REMOVE_EVENT_LISTENER_SOURCES file_server/event_listener/*_Windows.cpp file_server/event_listener/*_Windows.h) diff --git a/core/application/Application.cpp b/core/application/Application.cpp index 458b447995..cb030b4869 100644 --- a/core/application/Application.cpp +++ b/core/application/Application.cpp @@ -55,6 +55,9 @@ #include "runner/FlusherRunner.h" #include "runner/ProcessorRunner.h" #include "runner/sink/http/HttpSink.h" +#if defined(__linux__) && !defined(__ANDROID__) +#include "runner/sink/grpc/GrpcSink.h" +#endif #include "task_pipeline/TaskPipelineManager.h" #include "task_pipeline/TaskRegistry.h" #ifdef __ENTERPRISE__ @@ -264,6 +267,9 @@ void Application::Start() { // GCOVR_EXCL_START // runner BoundedSenderQueueInterface::SetFeedback(ProcessQueueManager::GetInstance()); HttpSink::GetInstance()->Init(); +#if defined(__linux__) && !defined(__ANDROID__) + GrpcSink::GetInstance()->Init(); +#endif FlusherRunner::GetInstance()->Init(); ProcessorRunner::GetInstance()->Init(); @@ -441,6 +447,9 @@ void Application::Exit() { ContainerManager::GetInstance()->Stop(); FlusherRunner::GetInstance()->Stop(); HttpSink::GetInstance()->Stop(); +#if defined(__linux__) && !defined(__ANDROID__) + GrpcSink::GetInstance()->Stop(); +#endif // TODO: make it common FlusherSLS::RecycleResourceIfNotUsed(); diff --git a/core/collection_pipeline/plugin/PluginRegistry.cpp b/core/collection_pipeline/plugin/PluginRegistry.cpp index 824044dc14..3dc3d1349b 100644 --- a/core/collection_pipeline/plugin/PluginRegistry.cpp +++ b/core/collection_pipeline/plugin/PluginRegistry.cpp @@ -33,7 +33,6 @@ #include "plugin/flusher/sls/FlusherSLS.h" #include "plugin/input/InputContainerStdio.h" #include "plugin/input/InputFile.h" -#include "plugin/input/InputForward.h" #include "plugin/input/InputInternalAlarms.h" #include "plugin/input/InputInternalMatchedContainerInfo.h" #include "plugin/input/InputInternalMetrics.h" @@ -56,7 +55,10 @@ #include 
"plugin/flusher/kafka/FlusherKafka.h" #endif #if defined(__linux__) && !defined(__ANDROID__) +#include "plugin/flusher/opentelemetry/FlusherOTLPHttpNative.h" +#include "plugin/flusher/opentelemetry/FlusherOTLPNative.h" #include "plugin/input/InputFileSecurity.h" +#include "plugin/input/InputForward.h" #include "plugin/input/InputHostMeta.h" #include "plugin/input/InputHostMonitor.h" #include "plugin/input/InputNetworkObserver.h" @@ -210,6 +212,10 @@ void PluginRegistry::LoadStaticPlugins() { #if defined(__linux__) && !defined(__ENTERPRISE__) RegisterFlusherCreator(new StaticFlusherCreator()); #endif +#if defined(__linux__) && !defined(__ANDROID__) + RegisterFlusherCreator(new StaticFlusherCreator()); + RegisterFlusherCreator(new StaticFlusherCreator()); +#endif #ifdef __ENTERPRISE__ RegisterFlusherCreator(new StaticFlusherCreator()); #endif diff --git a/core/collection_pipeline/serializer/OTLPHttpSerializer.cpp b/core/collection_pipeline/serializer/OTLPHttpSerializer.cpp new file mode 100644 index 0000000000..d8670dbe33 --- /dev/null +++ b/core/collection_pipeline/serializer/OTLPHttpSerializer.cpp @@ -0,0 +1,308 @@ +/* + * Copyright 2025 iLogtail Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "collection_pipeline/serializer/OTLPHttpSerializer.h" + +#include "json/writer.h" + +#include "google/protobuf/json/json.h" +#include "google/protobuf/message.h" +#include "models/LogEvent.h" +#include "models/MetricEvent.h" +#include "models/RawEvent.h" +#include "models/SpanEvent.h" +#include "protobuf/opentelemetry/proto/collector/logs/v1/logs_service.pb.h" +#include "protobuf/opentelemetry/proto/collector/metrics/v1/metrics_service.pb.h" +#include "protobuf/opentelemetry/proto/collector/trace/v1/trace_service.pb.h" +#include "protobuf/opentelemetry/proto/common/v1/common.pb.h" +#include "protobuf/opentelemetry/proto/logs/v1/logs.pb.h" +#include "protobuf/opentelemetry/proto/metrics/v1/metrics.pb.h" +#include "protobuf/opentelemetry/proto/resource/v1/resource.pb.h" +#include "protobuf/opentelemetry/proto/trace/v1/trace.pb.h" + +namespace logtail { + +using namespace opentelemetry::proto::collector::logs::v1; +using namespace opentelemetry::proto::collector::metrics::v1; +using namespace opentelemetry::proto::collector::trace::v1; +using namespace opentelemetry::proto::logs::v1; +using namespace opentelemetry::proto::metrics::v1; +using namespace opentelemetry::proto::trace::v1; +using namespace opentelemetry::proto::common::v1; +using namespace opentelemetry::proto::resource::v1; + +using google::protobuf::json::MessageToJsonString; +using google::protobuf::json::PrintOptions; + +static void SetProtoAttribute(KeyValue& kv, const StringView& key, const StringView& value) { + kv.set_key(std::string(key.data(), key.size())); + kv.mutable_value()->set_string_value(std::string(value.data(), value.size())); +} + +static void SetResourceAttributes(Resource& resource, const SizedMap& tags) { + for (const auto& [key, val] : tags.mInner) { + auto* attr = 
resource.add_attributes(); + SetProtoAttribute(*attr, key, val); + } +} + +static void SerializeRawEvents(const BatchedEvents& batch, ScopeLogs& scopeLogs) { + for (const auto& event : batch.mEvents) { + if (!event.Is()) { + continue; + } + const auto& rawEvent = event.Cast(); + auto* logRecord = scopeLogs.add_log_records(); + + auto ts = rawEvent.GetTimestamp(); + uint64_t timeUnixNano = static_cast(ts) * 1000000000ULL; + auto tsNs = rawEvent.GetTimestampNanosecond(); + if (tsNs.has_value()) { + timeUnixNano += tsNs.value(); + } + logRecord->set_time_unix_nano(timeUnixNano); + + logRecord->mutable_body()->set_string_value( + std::string(rawEvent.GetContent().data(), rawEvent.GetContent().size())); + } +} + +static void SerializeLogEvents(const BatchedEvents& batch, ScopeLogs& scopeLogs) { + for (const auto& event : batch.mEvents) { + if (!event.Is()) { + continue; + } + const auto& logEvent = event.Cast(); + auto* logRecord = scopeLogs.add_log_records(); + + auto ts = logEvent.GetTimestamp(); + uint64_t timeUnixNano = static_cast(ts) * 1000000000ULL; + auto tsNs = logEvent.GetTimestampNanosecond(); + if (tsNs.has_value()) { + timeUnixNano += tsNs.value(); + } + logRecord->set_time_unix_nano(timeUnixNano); + + auto msg = logEvent.GetContent("content"); + if (!msg.empty()) { + logRecord->mutable_body()->set_string_value(std::string(msg.data(), msg.size())); + } + + auto level = logEvent.GetLevel(); + if (!level.empty()) { + logRecord->set_severity_text(std::string(level.data(), level.size())); + } + + for (auto it = logEvent.begin(); it != logEvent.end(); ++it) { + auto* attr = logRecord->add_attributes(); + SetProtoAttribute(*attr, it->first, it->second); + } + } +} + +static void SerializeMetricEvents(const BatchedEvents& batch, ScopeMetrics& scopeMetrics) { + for (const auto& event : batch.mEvents) { + if (!event.Is()) { + continue; + } + const auto& metricEvent = event.Cast(); + auto* metric = scopeMetrics.add_metrics(); + metric->set_name(std::string(metricEvent.GetName().data(), metricEvent.GetName().size())); + + auto* gauge = metric->mutable_gauge(); + auto* dp = gauge->add_data_points(); + + auto ts = metricEvent.GetTimestamp(); + uint64_t timeUnixNano = static_cast(ts) * 1000000000ULL; + auto tsNs = metricEvent.GetTimestampNanosecond(); + if (tsNs.has_value()) { + timeUnixNano += tsNs.value(); + } + dp->set_time_unix_nano(timeUnixNano); + + if (auto* untyped = metricEvent.GetValue()) { + dp->set_as_double(untyped->mValue); + } else { + dp->set_as_double(0.0); + } + + for (auto tagIt = metricEvent.TagsBegin(); tagIt != metricEvent.TagsEnd(); ++tagIt) { + auto* attr = dp->add_attributes(); + SetProtoAttribute(*attr, tagIt->first, tagIt->second); + } + } +} + +static void SerializeSpanEvents(const BatchedEvents& batch, ScopeSpans& scopeSpans) { + for (const auto& event : batch.mEvents) { + if (!event.Is()) { + continue; + } + const auto& spanEvent = event.Cast(); + auto* span = scopeSpans.add_spans(); + + span->set_trace_id(std::string(spanEvent.GetTraceId().data(), spanEvent.GetTraceId().size())); + span->set_span_id(std::string(spanEvent.GetSpanId().data(), spanEvent.GetSpanId().size())); + span->set_parent_span_id(std::string(spanEvent.GetParentSpanId().data(), spanEvent.GetParentSpanId().size())); + span->set_name(std::string(spanEvent.GetName().data(), spanEvent.GetName().size())); + span->set_start_time_unix_nano(spanEvent.GetStartTimeNs()); + span->set_end_time_unix_nano(spanEvent.GetEndTimeNs()); + + switch (spanEvent.GetKind()) { + case SpanEvent::Kind::Internal: + 
span->set_kind(Span::SPAN_KIND_INTERNAL); + break; + case SpanEvent::Kind::Server: + span->set_kind(Span::SPAN_KIND_SERVER); + break; + case SpanEvent::Kind::Client: + span->set_kind(Span::SPAN_KIND_CLIENT); + break; + case SpanEvent::Kind::Producer: + span->set_kind(Span::SPAN_KIND_PRODUCER); + break; + case SpanEvent::Kind::Consumer: + span->set_kind(Span::SPAN_KIND_CONSUMER); + break; + default: + span->set_kind(Span::SPAN_KIND_UNSPECIFIED); + break; + } + + switch (spanEvent.GetStatus()) { + case SpanEvent::StatusCode::Ok: + span->mutable_status()->set_code(Status::STATUS_CODE_OK); + break; + case SpanEvent::StatusCode::Error: + span->mutable_status()->set_code(Status::STATUS_CODE_ERROR); + break; + default: + span->mutable_status()->set_code(Status::STATUS_CODE_UNSET); + break; + } + + for (auto tagIt = spanEvent.TagsBegin(); tagIt != spanEvent.TagsEnd(); ++tagIt) { + auto* attr = span->add_attributes(); + SetProtoAttribute(*attr, tagIt->first, tagIt->second); + } + } +} + +// Scan batch to detect event types and build the corresponding protobuf message. +// Returns event type index (0=logs, 1=metrics, 2=traces) or -1 if empty. +// Caller owns the returned Message* and must delete it. +static int BuildProtobufMessage(const BatchedEvents& batch, google::protobuf::Message*& msg, std::string& errorMsg) { + bool hasLog = false, hasMetric = false, hasSpan = false, hasRaw = false; + for (const auto& event : batch.mEvents) { + if (event.Is()) + hasLog = true; + else if (event.Is()) + hasMetric = true; + else if (event.Is()) + hasSpan = true; + else if (event.Is()) + hasRaw = true; + } + + if (!hasLog && !hasMetric && !hasSpan && !hasRaw) { + return -1; + } + + if (hasRaw || hasLog) { + auto* logReq = new ExportLogsServiceRequest(); + auto* resourceLogs = logReq->add_resource_logs(); + SetResourceAttributes(*resourceLogs->mutable_resource(), batch.mTags); + auto* scopeLogs = resourceLogs->add_scope_logs(); + if (hasRaw) { + SerializeRawEvents(batch, *scopeLogs); + } + if (hasLog) { + SerializeLogEvents(batch, *scopeLogs); + } + msg = logReq; + return 0; + } + + if (hasMetric) { + auto* metricReq = new ExportMetricsServiceRequest(); + auto* resourceMetrics = metricReq->add_resource_metrics(); + SetResourceAttributes(*resourceMetrics->mutable_resource(), batch.mTags); + auto* scopeMetrics = resourceMetrics->add_scope_metrics(); + SerializeMetricEvents(batch, *scopeMetrics); + msg = metricReq; + return 1; + } + + if (hasSpan) { + auto* traceReq = new ExportTraceServiceRequest(); + auto* resourceSpans = traceReq->add_resource_spans(); + SetResourceAttributes(*resourceSpans->mutable_resource(), batch.mTags); + auto* scopeSpans = resourceSpans->add_scope_spans(); + SerializeSpanEvents(batch, *scopeSpans); + msg = traceReq; + return 2; + } + + return -1; +} + +bool OTLPEventGroupSerializer::SerializeToBinaryString(BatchedEvents&& batch, std::string& res, std::string& errorMsg) { + errorMsg.clear(); + + google::protobuf::Message* msg = nullptr; + int type = BuildProtobufMessage(batch, msg, errorMsg); + if (type < 0) { + return true; + } + + if (!msg->SerializeToString(&res)) { + errorMsg = "Failed to serialize OTLP request to binary protobuf"; + delete msg; + return false; + } + delete msg; + return true; +} + +bool OTLPEventGroupSerializer::Serialize(BatchedEvents&& batch, std::string& res, std::string& errorMsg) { + errorMsg.clear(); + + google::protobuf::Message* msg = nullptr; + int type = BuildProtobufMessage(batch, msg, errorMsg); + if (type < 0) { + return true; + } + + try { + PrintOptions 
opts; + opts.always_print_fields_with_no_presence = true; + opts.preserve_proto_field_names = true; + + auto status = MessageToJsonString(*msg, &res, opts); + delete msg; + if (!status.ok()) { + errorMsg = "Failed to serialize to OTLP JSON: " + std::string(status.ToString()); + return false; + } + return true; + } catch (const std::exception& e) { + errorMsg = std::string("Exception during OTLP JSON serialization: ") + e.what(); + delete msg; + return false; + } +} + +} // namespace logtail diff --git a/core/collection_pipeline/serializer/OTLPHttpSerializer.h b/core/collection_pipeline/serializer/OTLPHttpSerializer.h new file mode 100644 index 0000000000..ceb65c1d5b --- /dev/null +++ b/core/collection_pipeline/serializer/OTLPHttpSerializer.h @@ -0,0 +1,36 @@ +/* + * Copyright 2025 iLogtail Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include "collection_pipeline/serializer/Serializer.h" + +namespace logtail { + +class OTLPEventGroupSerializer : public Serializer { +public: + OTLPEventGroupSerializer(Flusher* f) : Serializer(f) {} + + // Serialize to protobuf binary string (for http+protobuf transport). + bool SerializeToBinaryString(BatchedEvents&& p, std::string& res, std::string& errorMsg); + +private: + bool Serialize(BatchedEvents&& p, std::string& res, std::string& errorMsg) override; +}; + +} // namespace logtail diff --git a/core/dependencies.cmake b/core/dependencies.cmake index 07d0a265a8..4a86857a47 100644 --- a/core/dependencies.cmake +++ b/core/dependencies.cmake @@ -135,7 +135,7 @@ logtail_define(protobuf_BIN "Absolute path to protoc" "${DEPS_BINARY_ROOT}/proto function(compile_proto PROTO_PATH OUTPUT_PATH PROTO_FILES) file(MAKE_DIRECTORY ${OUTPUT_PATH}) - execute_process(COMMAND ${protobuf_BIN} + execute_process(COMMAND ${protobuf_BIN} --proto_path=${PROTO_PATH} --cpp_out=${OUTPUT_PATH} ${PROTO_FILES}) @@ -143,7 +143,7 @@ endfunction() function(compile_proto_grpc PROTO_PATH OUTPUT_PATH PROTO_FILES) file(MAKE_DIRECTORY ${OUTPUT_PATH}) - execute_process(COMMAND ${protobuf_BIN} + execute_process(COMMAND ${protobuf_BIN} --plugin=protoc-gen-grpc=${DEPS_BINARY_ROOT}/grpc_cpp_plugin -I=${PROTO_PATH} --cpp_out=${OUTPUT_PATH} @@ -151,6 +151,18 @@ function(compile_proto_grpc PROTO_PATH OUTPUT_PATH PROTO_FILES) ${PROTO_FILES}) endfunction() +compile_proto( + "${CMAKE_CURRENT_SOURCE_DIR}/protobuf" + "${CMAKE_CURRENT_SOURCE_DIR}/protobuf" + "opentelemetry/proto/common/v1/common.proto;opentelemetry/proto/resource/v1/resource.proto;opentelemetry/proto/logs/v1/logs.proto;opentelemetry/proto/metrics/v1/metrics.proto;opentelemetry/proto/trace/v1/trace.proto" +) + +compile_proto_grpc( + "${CMAKE_CURRENT_SOURCE_DIR}/protobuf" + "${CMAKE_CURRENT_SOURCE_DIR}/protobuf" + "opentelemetry/proto/collector/logs/v1/logs_service.proto;opentelemetry/proto/collector/metrics/v1/metrics_service.proto;opentelemetry/proto/collector/trace/v1/trace_service.proto" +) + compile_proto( "${CMAKE_CURRENT_SOURCE_DIR}/protobuf/sls" 
"${CMAKE_CURRENT_SOURCE_DIR}/protobuf/sls" diff --git a/core/forward/BaseService.h b/core/forward/BaseService.h index 4319f87c7b..daaa9df159 100644 --- a/core/forward/BaseService.h +++ b/core/forward/BaseService.h @@ -16,8 +16,15 @@ #pragma once +#include +#include + #include "json/value.h" +namespace grpc { +class Service; +} + namespace logtail { class BaseService { @@ -29,6 +36,10 @@ class BaseService { virtual bool Remove(std::string configName, const Json::Value& config) = 0; [[nodiscard]] virtual const std::string& Name() const = 0; + // Returns all gRPC services for registration with ServerBuilder. + // Override in derived classes to return a list of gRPC Service objects. + virtual std::vector<::grpc::Service*> GetGrpcServices() { return {}; } + protected: std::string mAddress; }; diff --git a/core/forward/GrpcInputManager.cpp b/core/forward/GrpcInputManager.cpp index 149e745b03..479bbe607e 100644 --- a/core/forward/GrpcInputManager.cpp +++ b/core/forward/GrpcInputManager.cpp @@ -33,6 +33,7 @@ #include "common/Flags.h" #include "common/StringTools.h" #include "forward/loongsuite/LoongSuiteForwardService.h" +#include "forward/otlp/OTLPForwardService.h" #include "logger/Logger.h" #ifdef APSARA_UNIT_TEST_MAIN #include "unittest/forward/MockServiceImpl.h" @@ -134,8 +135,14 @@ bool GrpcInputManager::AddListenInput(const std::string& configName, factories.emplace_back(std::make_unique(it->second.mInFlightCnt)); builder.experimental().SetInterceptorCreators(std::move(factories)); builder.AddListeningPort(address, grpc::InsecureServerCredentials()); - // TODO: multi-service server is complex and lacks isolation, only support one service per server for now - builder.RegisterService(service.get()); + // Register all gRPC services (e.g. OTLP has Logs + Metrics + Traces as separate services). 
+ auto grpcServices = service->GetGrpcServices(); + for (auto* grpcSvc : grpcServices) { + builder.RegisterService(grpcSvc); + } + LOG_INFO(sLogger, + ("GrpcInputManager", "registered gRPC services")("address", address)("service", service->Name())( + "serviceCount", grpcServices.size())); auto server = builder.BuildAndStart(); if (!server) { LOG_ERROR(sLogger, @@ -219,6 +226,8 @@ bool GrpcInputManager::ShutdownGrpcServer(grpc::Server* server, std::shared_ptr< template bool GrpcInputManager::AddListenInput(const std::string&, const std::string&, const Json::Value&); +template bool +GrpcInputManager::AddListenInput(const std::string&, const std::string&, const Json::Value&); #ifdef APSARA_UNIT_TEST_MAIN template bool diff --git a/core/forward/loongsuite/LoongSuiteForwardService.h b/core/forward/loongsuite/LoongSuiteForwardService.h index 3f754a1284..be096559f8 100644 --- a/core/forward/loongsuite/LoongSuiteForwardService.h +++ b/core/forward/loongsuite/LoongSuiteForwardService.h @@ -16,11 +16,11 @@ #pragma once -#include #include #include #include #include +#include #include "collection_pipeline/queue/QueueKey.h" #include "forward/BaseService.h" @@ -84,7 +84,8 @@ class LoongSuiteForwardServiceImpl : public BaseService, public LoongSuiteForwar bool Update(std::string configName, const Json::Value& config) override; bool Remove(std::string configName, const Json::Value& config) override; - [[nodiscard]] const std::string& Name() const override { return sName; }; + [[nodiscard]] const std::string& Name() const override { return sName; } + std::vector<::grpc::Service*> GetGrpcServices() override { return {this}; } grpc::ServerUnaryReactor* Forward(grpc::CallbackServerContext* context, const LoongSuiteForwardRequest* request, diff --git a/core/forward/otlp/OTLPForwardService.cpp b/core/forward/otlp/OTLPForwardService.cpp new file mode 100644 index 0000000000..f2e2db96a5 --- /dev/null +++ b/core/forward/otlp/OTLPForwardService.cpp @@ -0,0 +1,490 @@ +/* + * Copyright 2025 iLogtail Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "forward/otlp/OTLPForwardService.h" + +#include + +#include + +#include "common/Flags.h" +#include "common/ParamExtractor.h" +#include "common/TimeKeeper.h" +#include "logger/Logger.h" +#include "models/LogEvent.h" +#include "models/MetricEvent.h" +#include "models/MetricValue.h" +#include "models/PipelineEventGroup.h" +#include "models/SpanEvent.h" +#include "runner/ProcessorRunner.h" + +using namespace opentelemetry::proto::collector::logs::v1; +using namespace opentelemetry::proto::collector::metrics::v1; +using namespace opentelemetry::proto::collector::trace::v1; + +namespace logtail { + +const std::string OTLPForwardServiceImpl::sName = "OTLPForwardService"; + +std::vector<::grpc::Service*> OTLPForwardServiceImpl::GetGrpcServices() { + return {mLogsService.get(), mMetricsService.get(), mTraceService.get()}; +} + +OTLPForwardServiceImpl::OTLPForwardServiceImpl(const std::string& address) : BaseService(address) { + mLogsService = std::make_unique(this); + mMetricsService = std::make_unique(this); + mTraceService = std::make_unique(this); + + WriteMetrics::GetInstance()->CreateMetricsRecordRef( + mMetricsRecordRef, + MetricCategory::METRIC_CATEGORY_COMPONENT, + {{METRIC_LABEL_KEY_COMPONENT_NAME, "otlp_forward"}, {METRIC_LABEL_KEY_SERVICE_ADDRESS, address}}); + mLogInEventsTotal = mMetricsRecordRef.CreateCounter(METRIC_COMPONENT_IN_EVENTS_TOTAL); + mLogInSizeBytes = mMetricsRecordRef.CreateCounter(METRIC_COMPONENT_IN_SIZE_BYTES); + mMetricInEventsTotal = mMetricsRecordRef.CreateCounter(METRIC_COMPONENT_IN_EVENTS_TOTAL); + mTraceInEventsTotal = mMetricsRecordRef.CreateCounter(METRIC_COMPONENT_IN_EVENTS_TOTAL); + mDiscardedEventsTotal = mMetricsRecordRef.CreateCounter(METRIC_COMPONENT_DISCARDED_ITEMS_TOTAL); + mTotalDelayMs = mMetricsRecordRef.CreateTimeCounter(METRIC_COMPONENT_TOTAL_DELAY_MS); + WriteMetrics::GetInstance()->CommitMetricsRecordRef(mMetricsRecordRef); +} + +bool OTLPForwardServiceImpl::Update(std::string configName, const Json::Value& config) { + OTLPForwardConfig forwardConfig; + forwardConfig.configName = configName; + + std::string errorMsg; + int32_t queueKey = -1; + if (!GetMandatoryIntParam(config, "QueueKey", queueKey, errorMsg)) { + return false; + } + forwardConfig.queueKey = static_cast(queueKey); + + int inputIndex = -1; + if (!GetMandatoryIntParam(config, "InputIndex", inputIndex, errorMsg)) { + return false; + } + forwardConfig.inputIndex = static_cast(inputIndex); + if (!AddToIndex(configName, std::move(forwardConfig), errorMsg)) { + LOG_ERROR(sLogger, ("Update OTLP forward match rule failed", configName)("error", errorMsg)); + return false; + } + LOG_INFO(sLogger, + ("OTLPForwardServiceImpl config updated", configName)("queueKey", queueKey)("inputIndex", inputIndex)); + return true; +} + +bool OTLPForwardServiceImpl::Remove(std::string configName, const Json::Value& config) { + std::unique_lock lock(mMatchIndexMutex); + auto it = mMatchIndex.find(configName); + if (it != mMatchIndex.end()) { + mMatchIndex.erase(it); + LOG_INFO(sLogger, ("OTLPForwardServiceImpl config removed", configName)); + } + return true; +} + +// ==================== Logs Export ==================== + +grpc::ServerUnaryReactor* OTLPLogsGrpcService::Export(grpc::CallbackServerContext* context, + const ExportLogsServiceRequest* request, + ExportLogsServiceResponse* response) { + auto* reactor = context->DefaultReactor(); + grpc::Status status(grpc::StatusCode::NOT_FOUND, "No matching config"); + + if (!request) { + ADD_COUNTER(mImpl->mDiscardedEventsTotal, 1); + 
reactor->Finish(grpc::Status(grpc::StatusCode::INVALID_ARGUMENT, "Invalid request")); + return reactor; + } + + auto before = TimeKeeper::GetInstance()->NowMs(); + ADD_COUNTER(mImpl->mLogInEventsTotal, 1); + + std::shared_ptr config; + if (mImpl->FindMatchingConfig(context, config)) { + mImpl->ProcessLogExport(request, config, status); + } else { + ADD_COUNTER(mImpl->mDiscardedEventsTotal, 1); + } + + ADD_COUNTER(mImpl->mTotalDelayMs, std::chrono::milliseconds(TimeKeeper::GetInstance()->NowMs() - before)); + reactor->Finish(status); + return reactor; +} + +void OTLPForwardServiceImpl::ProcessLogExport(const ExportLogsServiceRequest* request, + const std::shared_ptr& config, + grpc::Status& status) { + int totalEventCount = 0; + size_t totalBytes = request->ByteSizeLong(); + ADD_COUNTER(mLogInSizeBytes, totalBytes); + bool allQueued = true; + + // TODO: one OTLP request with N resources produces N PipelineEventGroups. + // Partial enqueue failure returns UNAVAILABLE which may cause the sender to + // retry the entire request, leading to duplicates for already-enqueued groups. + for (const auto& resourceLogs : request->resource_logs()) { + auto eventGroup = PipelineEventGroup(std::make_shared()); + int eventCount = 0; + + // resource attributes as real tags on eventGroup + for (const auto& attr : resourceLogs.resource().attributes()) { + if (attr.value().has_string_value()) { + eventGroup.SetTag(attr.key(), attr.value().string_value()); + } + } + + for (const auto& scopeLogs : resourceLogs.scope_logs()) { + for (const auto& logRecord : scopeLogs.log_records()) { + auto* logEvent = eventGroup.AddLogEvent(true); + + uint64_t timeUnixNano = logRecord.time_unix_nano(); + logEvent->SetTimestamp(timeUnixNano / 1000000000, static_cast(timeUnixNano % 1000000000)); + + if (logRecord.has_body() && logRecord.body().has_string_value()) { + logEvent->SetContent("content", logRecord.body().string_value()); + } + + if (!logRecord.severity_text().empty()) { + logEvent->SetContent("severity", logRecord.severity_text()); + } + + if (!logRecord.trace_id().empty()) { + logEvent->SetContent("trace_id", logRecord.trace_id()); + } + if (!logRecord.span_id().empty()) { + logEvent->SetContent("span_id", logRecord.span_id()); + } + + for (const auto& attr : logRecord.attributes()) { + if (attr.value().has_string_value()) { + logEvent->SetContent(attr.key(), attr.value().string_value()); + } else if (attr.value().has_int_value()) { + logEvent->SetContent(attr.key(), std::to_string(attr.value().int_value())); + } else if (attr.value().has_double_value()) { + logEvent->SetContent(attr.key(), std::to_string(attr.value().double_value())); + } + } + + // scope attributes as pseudo-tags on each event + if (scopeLogs.has_scope()) { + for (const auto& attr : scopeLogs.scope().attributes()) { + if (attr.value().has_string_value()) { + logEvent->SetContent("__tag__:" + attr.key(), attr.value().string_value()); + } + } + } + + eventCount++; + } + } + + if (eventCount > 0) { + totalEventCount += eventCount; + if (!ProcessorRunner::GetInstance()->PushQueue( + config->queueKey, config->inputIndex, std::move(eventGroup), 3)) { + allQueued = false; + } + } + } + + if (totalEventCount == 0) { + status = grpc::Status(grpc::StatusCode::INVALID_ARGUMENT, "No log events"); + } else { + status = allQueued ? 
grpc::Status::OK : grpc::Status(grpc::StatusCode::UNAVAILABLE, "Queue full, retry"); + } +} + +// ==================== Metrics Export ==================== + +grpc::ServerUnaryReactor* OTLPMetricsGrpcService::Export(grpc::CallbackServerContext* context, + const ExportMetricsServiceRequest* request, + ExportMetricsServiceResponse* response) { + auto* reactor = context->DefaultReactor(); + grpc::Status status(grpc::StatusCode::NOT_FOUND, "No matching config"); + + if (!request) { + ADD_COUNTER(mImpl->mDiscardedEventsTotal, 1); + reactor->Finish(grpc::Status(grpc::StatusCode::INVALID_ARGUMENT, "Invalid request")); + return reactor; + } + + auto before = TimeKeeper::GetInstance()->NowMs(); + ADD_COUNTER(mImpl->mMetricInEventsTotal, 1); + + LOG_DEBUG(sLogger, + ("OTLPMetricsGrpcService Export", "processing")("resourceMetrics", request->resource_metrics().size())); + + std::shared_ptr config; + if (mImpl->FindMatchingConfig(context, config)) { + mImpl->ProcessMetricExport(request, config, status); + } else { + ADD_COUNTER(mImpl->mDiscardedEventsTotal, 1); + } + + ADD_COUNTER(mImpl->mTotalDelayMs, std::chrono::milliseconds(TimeKeeper::GetInstance()->NowMs() - before)); + reactor->Finish(status); + return reactor; +} + +void OTLPForwardServiceImpl::ProcessMetricExport(const ExportMetricsServiceRequest* request, + const std::shared_ptr& config, + grpc::Status& status) { + int totalEventCount = 0; + bool allQueued = true; + + // TODO: same partial-enqueue caveat as ProcessLogExport. + for (const auto& resourceMetrics : request->resource_metrics()) { + auto eventGroup = PipelineEventGroup(std::make_shared()); + int eventCount = 0; + + for (const auto& attr : resourceMetrics.resource().attributes()) { + if (attr.value().has_string_value()) { + eventGroup.SetTag(attr.key(), attr.value().string_value()); + } + } + + for (const auto& scopeMetrics : resourceMetrics.scope_metrics()) { + auto addScopeTags = [&](MetricEvent* metricEvent) { + if (scopeMetrics.has_scope()) { + for (const auto& attr : scopeMetrics.scope().attributes()) { + if (attr.value().has_string_value()) { + metricEvent->SetTag("__tag__:" + attr.key(), attr.value().string_value()); + } + } + } + }; + + for (const auto& metric : scopeMetrics.metrics()) { + switch (metric.data_case()) { + case opentelemetry::proto::metrics::v1::Metric::DataCase::kGauge: { + for (const auto& dp : metric.gauge().data_points()) { + auto* metricEvent = eventGroup.AddMetricEvent(true); + metricEvent->SetName(metric.name()); + metricEvent->SetValue(dp.as_double()); + addScopeTags(metricEvent); + eventCount++; + } + break; + } + case opentelemetry::proto::metrics::v1::Metric::DataCase::kSum: { + for (const auto& dp : metric.sum().data_points()) { + auto* metricEvent = eventGroup.AddMetricEvent(true); + metricEvent->SetName(metric.name()); + metricEvent->SetValue(dp.as_double()); + addScopeTags(metricEvent); + eventCount++; + } + break; + } + case opentelemetry::proto::metrics::v1::Metric::DataCase::kHistogram: { + for (const auto& dp : metric.histogram().data_points()) { + auto* metricEvent = eventGroup.AddMetricEvent(true); + metricEvent->SetName(metric.name()); + metricEvent->SetValue(dp.sum()); + addScopeTags(metricEvent); + eventCount++; + } + break; + } + default: + break; + } + } + } + + if (eventCount > 0) { + totalEventCount += eventCount; + if (!ProcessorRunner::GetInstance()->PushQueue( + config->queueKey, config->inputIndex, std::move(eventGroup), 3)) { + allQueued = false; + } + } + } + + if (totalEventCount == 0) { + status = 
grpc::Status(grpc::StatusCode::INVALID_ARGUMENT, "No metric events"); + } else { + status = allQueued ? grpc::Status::OK : grpc::Status(grpc::StatusCode::UNAVAILABLE, "Queue full, retry"); + } +} + +// ==================== Traces Export ==================== + +grpc::ServerUnaryReactor* OTLPTraceGrpcService::Export(grpc::CallbackServerContext* context, + const ExportTraceServiceRequest* request, + ExportTraceServiceResponse* response) { + auto* reactor = context->DefaultReactor(); + grpc::Status status(grpc::StatusCode::NOT_FOUND, "No matching config"); + + if (!request) { + ADD_COUNTER(mImpl->mDiscardedEventsTotal, 1); + reactor->Finish(grpc::Status(grpc::StatusCode::INVALID_ARGUMENT, "Invalid request")); + return reactor; + } + + auto before = TimeKeeper::GetInstance()->NowMs(); + ADD_COUNTER(mImpl->mTraceInEventsTotal, 1); + + std::shared_ptr config; + if (mImpl->FindMatchingConfig(context, config)) { + mImpl->ProcessTraceExport(request, config, status); + } else { + ADD_COUNTER(mImpl->mDiscardedEventsTotal, 1); + } + + ADD_COUNTER(mImpl->mTotalDelayMs, std::chrono::milliseconds(TimeKeeper::GetInstance()->NowMs() - before)); + reactor->Finish(status); + return reactor; +} + +void OTLPForwardServiceImpl::ProcessTraceExport(const ExportTraceServiceRequest* request, + const std::shared_ptr& config, + grpc::Status& status) { + int totalEventCount = 0; + bool allQueued = true; + + // TODO: same partial-enqueue caveat as ProcessLogExport. + for (const auto& resourceSpans : request->resource_spans()) { + auto eventGroup = PipelineEventGroup(std::make_shared()); + int eventCount = 0; + + for (const auto& attr : resourceSpans.resource().attributes()) { + if (attr.value().has_string_value()) { + eventGroup.SetTag(attr.key(), attr.value().string_value()); + } + } + + for (const auto& scopeSpans : resourceSpans.scope_spans()) { + for (const auto& span : scopeSpans.spans()) { + auto* spanEvent = eventGroup.AddSpanEvent(true); + eventCount++; + + spanEvent->SetTraceId(span.trace_id()); + spanEvent->SetSpanId(span.span_id()); + spanEvent->SetParentSpanId(span.parent_span_id()); + spanEvent->SetName(span.name()); + + spanEvent->SetStartTimeNs(span.start_time_unix_nano()); + spanEvent->SetEndTimeNs(span.end_time_unix_nano()); + + if (span.has_status()) { + switch (span.status().code()) { + case opentelemetry::proto::trace::v1::Status::STATUS_CODE_OK: + spanEvent->SetStatus(SpanEvent::StatusCode::Ok); + break; + case opentelemetry::proto::trace::v1::Status::STATUS_CODE_ERROR: + spanEvent->SetStatus(SpanEvent::StatusCode::Error); + break; + default: + spanEvent->SetStatus(SpanEvent::StatusCode::Unset); + break; + } + } + + switch (span.kind()) { + case opentelemetry::proto::trace::v1::Span::SPAN_KIND_CLIENT: + spanEvent->SetKind(SpanEvent::Kind::Client); + break; + case opentelemetry::proto::trace::v1::Span::SPAN_KIND_SERVER: + spanEvent->SetKind(SpanEvent::Kind::Server); + break; + case opentelemetry::proto::trace::v1::Span::SPAN_KIND_PRODUCER: + spanEvent->SetKind(SpanEvent::Kind::Producer); + break; + case opentelemetry::proto::trace::v1::Span::SPAN_KIND_CONSUMER: + spanEvent->SetKind(SpanEvent::Kind::Consumer); + break; + default: + spanEvent->SetKind(SpanEvent::Kind::Internal); + break; + } + + for (const auto& attr : span.attributes()) { + if (attr.value().has_string_value()) { + spanEvent->SetTag(attr.key(), attr.value().string_value()); + } else if (attr.value().has_int_value()) { + spanEvent->SetTag(attr.key(), std::to_string(attr.value().int_value())); + } + } + + if (scopeSpans.has_scope()) { + 
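+                    // Scope (instrumentation library) attributes are flattened onto each
+                    // span as "__tag__:"-prefixed tags, mirroring the log and metric paths above.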
for (const auto& attr : scopeSpans.scope().attributes()) { + if (attr.value().has_string_value()) { + spanEvent->SetTag("__tag__:" + attr.key(), attr.value().string_value()); + } + } + } + } + } + + if (eventCount > 0) { + totalEventCount += eventCount; + if (!ProcessorRunner::GetInstance()->PushQueue( + config->queueKey, config->inputIndex, std::move(eventGroup), 3)) { + allQueued = false; + } + } + } + + if (totalEventCount == 0) { + status = grpc::Status(grpc::StatusCode::INVALID_ARGUMENT, "No span events"); + } else { + status = allQueued ? grpc::Status::OK : grpc::Status(grpc::StatusCode::UNAVAILABLE, "Queue full, retry"); + } +} + +// ==================== Common ==================== + +bool OTLPForwardServiceImpl::AddToIndex(std::string& configName, OTLPForwardConfig&& config, std::string& errorMsg) { + errorMsg.clear(); + std::unique_lock lock(mMatchIndexMutex); + if (!configName.empty()) { + mMatchIndex[configName] = std::make_shared(std::move(config)); + return true; + } + errorMsg = "Empty config name"; + return false; +} + +bool OTLPForwardServiceImpl::FindMatchingConfig(grpc::CallbackServerContext* context, + std::shared_ptr& config) const { + std::shared_lock lock(mMatchIndexMutex); + + // Try to match via x-otlp-apm-configname metadata first + const auto& metadata = context->client_metadata(); + for (const auto& metadataPair : metadata) { + if (metadataPair.first != "x-otlp-apm-configname") { + continue; + } + std::string value(metadataPair.second.data(), metadataPair.second.size()); + auto it = mMatchIndex.find(value); + if (it != mMatchIndex.end()) { + config = it->second; + return true; + } + } + + // Fallback: if only one config is registered, use it (onetime pipeline case) + if (mMatchIndex.size() == 1) { + config = mMatchIndex.begin()->second; + return true; + } + + return false; +} + +} // namespace logtail diff --git a/core/forward/otlp/OTLPForwardService.h b/core/forward/otlp/OTLPForwardService.h new file mode 100644 index 0000000000..6ed3f1c7e0 --- /dev/null +++ b/core/forward/otlp/OTLPForwardService.h @@ -0,0 +1,131 @@ +/* + * Copyright 2025 iLogtail Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include + +#include "collection_pipeline/queue/QueueKey.h" +#include "forward/BaseService.h" +#include "monitor/MetricManager.h" +#include "protobuf/opentelemetry/proto/collector/logs/v1/logs_service.grpc.pb.h" +#include "protobuf/opentelemetry/proto/collector/metrics/v1/metrics_service.grpc.pb.h" +#include "protobuf/opentelemetry/proto/collector/trace/v1/trace_service.grpc.pb.h" + +namespace logtail { + +struct OTLPForwardConfig { + std::string configName; + QueueKey queueKey; + size_t inputIndex; +}; + +class OTLPForwardServiceImpl; + +// gRPC service wrappers for Logs, Metrics, and Traces. +// Each inherits from exactly one CallbackService to avoid vtable method index conflicts +// (all three CallbackService types use method index 0). 
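+// Per the note above, a single class inheriting all three generated CallbackService bases
+// would clash on the shared method index 0; each thin wrapper below therefore owns exactly
+// one service and forwards the actual work to OTLPForwardServiceImpl.
+// Illustrative usage (assumption: the listener address comes from the input config): an
+// OTLP exporter pointed at this address with OTEL_EXPORTER_OTLP_PROTOCOL=grpc can route to
+// a specific pipeline by sending the "x-otlp-apm-configname" gRPC metadata key.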
+class OTLPLogsGrpcService : public opentelemetry::proto::collector::logs::v1::LogsService::CallbackService {
+public:
+    explicit OTLPLogsGrpcService(OTLPForwardServiceImpl* impl) : mImpl(impl) {}
+    grpc::ServerUnaryReactor*
+    Export(grpc::CallbackServerContext* context,
+           const opentelemetry::proto::collector::logs::v1::ExportLogsServiceRequest* request,
+           opentelemetry::proto::collector::logs::v1::ExportLogsServiceResponse* response) override;
+
+private:
+    OTLPForwardServiceImpl* mImpl;
+};
+
+class OTLPMetricsGrpcService : public opentelemetry::proto::collector::metrics::v1::MetricsService::CallbackService {
+public:
+    explicit OTLPMetricsGrpcService(OTLPForwardServiceImpl* impl) : mImpl(impl) {}
+    grpc::ServerUnaryReactor*
+    Export(grpc::CallbackServerContext* context,
+           const opentelemetry::proto::collector::metrics::v1::ExportMetricsServiceRequest* request,
+           opentelemetry::proto::collector::metrics::v1::ExportMetricsServiceResponse* response) override;
+
+private:
+    OTLPForwardServiceImpl* mImpl;
+};
+
+class OTLPTraceGrpcService : public opentelemetry::proto::collector::trace::v1::TraceService::CallbackService {
+public:
+    explicit OTLPTraceGrpcService(OTLPForwardServiceImpl* impl) : mImpl(impl) {}
+    grpc::ServerUnaryReactor*
+    Export(grpc::CallbackServerContext* context,
+           const opentelemetry::proto::collector::trace::v1::ExportTraceServiceRequest* request,
+           opentelemetry::proto::collector::trace::v1::ExportTraceServiceResponse* response) override;
+
+private:
+    OTLPForwardServiceImpl* mImpl;
+};
+
+class OTLPForwardServiceImpl : public BaseService {
+public:
+    OTLPForwardServiceImpl(const std::string& address);
+    ~OTLPForwardServiceImpl() override = default;
+
+    bool Update(std::string configName, const Json::Value& config) override;
+    bool Remove(std::string configName, const Json::Value& config) override;
+    [[nodiscard]] const std::string& Name() const override { return sName; }
+    std::vector<::grpc::Service*> GetGrpcServices() override;
+
+private:
+    static const std::string sName;
+
+    // configName -> OTLPForwardConfig
+    std::unordered_map<std::string, std::shared_ptr<OTLPForwardConfig>> mMatchIndex;
+    mutable std::shared_mutex mMatchIndexMutex;
+
+    MetricsRecordRef mMetricsRecordRef;
+    CounterPtr mLogInEventsTotal;
+    CounterPtr mLogInSizeBytes;
+    CounterPtr mMetricInEventsTotal;
+    CounterPtr mTraceInEventsTotal;
+    CounterPtr mDiscardedEventsTotal;
+    TimeCounterPtr mTotalDelayMs;
+
+    // gRPC service wrappers for Logs, Metrics, and Traces
+    std::unique_ptr<OTLPLogsGrpcService> mLogsService;
+    std::unique_ptr<OTLPMetricsGrpcService> mMetricsService;
+    std::unique_ptr<OTLPTraceGrpcService> mTraceService;
+
+    bool AddToIndex(std::string& configName, OTLPForwardConfig&& config, std::string& errorMsg);
+    bool FindMatchingConfig(grpc::CallbackServerContext* context, std::shared_ptr<OTLPForwardConfig>& config) const;
+    void ProcessLogExport(const opentelemetry::proto::collector::logs::v1::ExportLogsServiceRequest* request,
+                          const std::shared_ptr<OTLPForwardConfig>& config,
+                          grpc::Status& status);
+    void ProcessMetricExport(const opentelemetry::proto::collector::metrics::v1::ExportMetricsServiceRequest* request,
+                             const std::shared_ptr<OTLPForwardConfig>& config,
+                             grpc::Status& status);
+    void ProcessTraceExport(const opentelemetry::proto::collector::trace::v1::ExportTraceServiceRequest* request,
+                            const std::shared_ptr<OTLPForwardConfig>& config,
+                            grpc::Status& status);
+
+#ifdef APSARA_UNIT_TEST_MAIN
+    friend class OTLPForwardServiceUnittest;
+#endif
+    friend class OTLPLogsGrpcService;
+    friend class OTLPMetricsGrpcService;
+    friend class OTLPTraceGrpcService;
+};
+
+} // namespace logtail
diff --git a/core/monitor/metric_constants/MetricConstants.h
b/core/monitor/metric_constants/MetricConstants.h index 747b1e0b78..f219178ea7 100644 --- a/core/monitor/metric_constants/MetricConstants.h +++ b/core/monitor/metric_constants/MetricConstants.h @@ -295,6 +295,7 @@ extern const std::string METRIC_LABEL_KEY_THREAD_NO; extern const std::string METRIC_LABEL_VALUE_RUNNER_NAME_FILE_SERVER; extern const std::string METRIC_LABEL_VALUE_RUNNER_NAME_FLUSHER; extern const std::string METRIC_LABEL_VALUE_RUNNER_NAME_HTTP_SINK; +extern const std::string METRIC_LABEL_VALUE_RUNNER_NAME_GRPC_SINK; extern const std::string METRIC_LABEL_VALUE_RUNNER_NAME_PROCESSOR; extern const std::string METRIC_LABEL_VALUE_RUNNER_NAME_PROMETHEUS; extern const std::string METRIC_LABEL_VALUE_RUNNER_NAME_EBPF_SERVER; diff --git a/core/monitor/metric_constants/RunnerMetrics.cpp b/core/monitor/metric_constants/RunnerMetrics.cpp index 53b3625a47..f57767c1c6 100644 --- a/core/monitor/metric_constants/RunnerMetrics.cpp +++ b/core/monitor/metric_constants/RunnerMetrics.cpp @@ -27,6 +27,7 @@ const string METRIC_LABEL_KEY_THREAD_NO = "thread_no"; const string METRIC_LABEL_VALUE_RUNNER_NAME_FILE_SERVER = "file_server"; const string METRIC_LABEL_VALUE_RUNNER_NAME_FLUSHER = "flusher_runner"; const string METRIC_LABEL_VALUE_RUNNER_NAME_HTTP_SINK = "http_sink"; +const string METRIC_LABEL_VALUE_RUNNER_NAME_GRPC_SINK = "grpc_sink"; const string METRIC_LABEL_VALUE_RUNNER_NAME_PROCESSOR = "processor_runner"; const string METRIC_LABEL_VALUE_RUNNER_NAME_PROMETHEUS = "prometheus_runner"; const string METRIC_LABEL_VALUE_RUNNER_NAME_EBPF_SERVER = "ebpf_runner"; diff --git a/core/plugin/flusher/flusher.cmake b/core/plugin/flusher/flusher.cmake index 09c494478e..eb1d9314de 100644 --- a/core/plugin/flusher/flusher.cmake +++ b/core/plugin/flusher/flusher.cmake @@ -19,10 +19,13 @@ include_directories(flusher) # Add source files if (NOT ENABLE_ENTERPRISE AND UNIX) file(GLOB_RECURSE THIS_SOURCE_FILES_LIST ${CMAKE_SOURCE_DIR}/plugin/flusher/*.c ${CMAKE_SOURCE_DIR}/plugin/flusher/*.cc ${CMAKE_SOURCE_DIR}/plugin/flusher/*.cpp ${CMAKE_SOURCE_DIR}/plugin/flusher/*.h) +elseif(UNIX) + file(GLOB_RECURSE THIS_SOURCE_FILES_LIST ${CMAKE_SOURCE_DIR}/plugin/flusher/*.c ${CMAKE_SOURCE_DIR}/plugin/flusher/*.cc ${CMAKE_SOURCE_DIR}/plugin/flusher/*.cpp ${CMAKE_SOURCE_DIR}/plugin/flusher/*.h) + list(FILTER THIS_SOURCE_FILES_LIST EXCLUDE REGEX ".*kafka.*") else() - # Exclude kafka files on non-UNIX systems file(GLOB_RECURSE THIS_SOURCE_FILES_LIST ${CMAKE_SOURCE_DIR}/plugin/flusher/*.c ${CMAKE_SOURCE_DIR}/plugin/flusher/*.cc ${CMAKE_SOURCE_DIR}/plugin/flusher/*.cpp ${CMAKE_SOURCE_DIR}/plugin/flusher/*.h) list(FILTER THIS_SOURCE_FILES_LIST EXCLUDE REGEX ".*kafka.*") + list(FILTER THIS_SOURCE_FILES_LIST EXCLUDE REGEX ".*opentelemetry.*") endif() # Set source files to parent diff --git a/core/plugin/flusher/opentelemetry/FlusherOTLPHttpNative.cpp b/core/plugin/flusher/opentelemetry/FlusherOTLPHttpNative.cpp new file mode 100644 index 0000000000..c14fe25be9 --- /dev/null +++ b/core/plugin/flusher/opentelemetry/FlusherOTLPHttpNative.cpp @@ -0,0 +1,316 @@ +/* + * Copyright 2025 iLogtail Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "plugin/flusher/opentelemetry/FlusherOTLPHttpNative.h" + +#include "collection_pipeline/queue/QueueKeyManager.h" +#include "collection_pipeline/queue/SenderQueueManager.h" +#include "collection_pipeline/serializer/OTLPHttpSerializer.h" +#include "common/Flags.h" +#include "common/ParamExtractor.h" +#include "common/http/HttpRequest.h" +#include "logger/Logger.h" +#include "runner/sink/http/HttpSinkRequest.h" + +DECLARE_FLAG_INT32(discard_send_fail_interval); + +namespace logtail { + +const std::string FlusherOTLPHttpNative::sName = "flusher_otlp_http_native"; + +bool FlusherOTLPHttpNative::Init(const Json::Value& config, Json::Value& optionalGoPipeline) { + std::string errorMsg; + + if (!GetMandatoryStringParam(config, "Url", mUrl, errorMsg)) { + PARAM_ERROR_RETURN(mContext->GetLogger(), + mContext->GetAlarm(), + errorMsg, + sName, + mContext->GetConfigName(), + mContext->GetProjectName(), + mContext->GetLogstoreName(), + mContext->GetRegion()); + } + + // Validate URL: reject IPv6 and verify port + { + std::string urlForCheck = mUrl; + size_t schemeEnd = urlForCheck.find("://"); + if (schemeEnd != std::string::npos) { + urlForCheck = urlForCheck.substr(schemeEnd + 3); + } + size_t pathStart = urlForCheck.find('/'); + std::string hostPort = (pathStart != std::string::npos) ? urlForCheck.substr(0, pathStart) : urlForCheck; + + if (hostPort.find('[') != std::string::npos) { + errorMsg = "IPv6 addresses are not supported, Url: " + mUrl; + PARAM_ERROR_RETURN(mContext->GetLogger(), + mContext->GetAlarm(), + errorMsg, + sName, + mContext->GetConfigName(), + mContext->GetProjectName(), + mContext->GetLogstoreName(), + mContext->GetRegion()); + } + + size_t portSep = hostPort.find(':'); + if (portSep != std::string::npos) { + std::string portStr = hostPort.substr(portSep + 1); + try { + int port = std::stoi(portStr); + if (port <= 0 || port > 65535) { + errorMsg = "invalid port number in Url: " + mUrl; + PARAM_ERROR_RETURN(mContext->GetLogger(), + mContext->GetAlarm(), + errorMsg, + sName, + mContext->GetConfigName(), + mContext->GetProjectName(), + mContext->GetLogstoreName(), + mContext->GetRegion()); + } + } catch (const std::exception&) { + errorMsg = "invalid port in Url: " + mUrl; + PARAM_ERROR_RETURN(mContext->GetLogger(), + mContext->GetAlarm(), + errorMsg, + sName, + mContext->GetConfigName(), + mContext->GetProjectName(), + mContext->GetLogstoreName(), + mContext->GetRegion()); + } + } + } + + // Optional format: "protobuf" (default) or "json" + std::string formatStr; + if (config.isMember("Format")) { + formatStr = config["Format"].asString(); + if (formatStr == "protobuf") { + mFormat = OTLPHttpFormat::Protobuf; + } else if (formatStr == "json") { + mFormat = OTLPHttpFormat::JSON; + } else { + LOG_WARNING(sLogger, + ("FlusherOTLPHttpNative invalid Format value", + formatStr)("action", "use default json format")("plugin", sName)); + } + } + + if (config.isMember("EnableTLS")) { + mEnableTLS = config["EnableTLS"].asBool(); + } + + // Parse extra headers + if (config.isMember("Headers") && config["Headers"].isObject()) { + const auto& headers = 
config["Headers"]; + auto memberNames = headers.getMemberNames(); + for (const auto& name : memberNames) { + mExtraHeaders[name] = headers[name].asString(); + } + } + + // Create sender queue + GenerateQueueKey(mUrl); + SenderQueueManager::GetInstance()->CreateQueue(mQueueKey, mPluginID, mUrl, *mContext); + + // Create OTLP serializer (used for JSON path) + mSerializer = std::make_unique(this); + + // Metrics + mSendCnt = GetMetricsRecordRef().CreateCounter(METRIC_PLUGIN_OUT_EVENT_GROUPS_TOTAL); + mSendSuccessCnt = GetMetricsRecordRef().CreateCounter(METRIC_PLUGIN_OUT_SUCCESSFUL_EVENTS_TOTAL); + mSendFailCnt = GetMetricsRecordRef().CreateCounter(METRIC_PLUGIN_OUT_FAILED_EVENTS_TOTAL); + mDiscardedEventsTotal = GetMetricsRecordRef().CreateCounter(METRIC_PLUGIN_DISCARDED_EVENTS_TOTAL); + + LOG_INFO(sLogger, + ("FlusherOTLPHttpNative initialized", "success")("url", mUrl)( + "format", mFormat == OTLPHttpFormat::Protobuf ? "protobuf" : "json")("tls", mEnableTLS)); + return true; +} + +bool FlusherOTLPHttpNative::Start() { + LOG_INFO(sLogger, ("FlusherOTLPHttpNative started", "success")("url", mUrl)); + return true; +} + +bool FlusherOTLPHttpNative::Stop(bool isPipelineRemoving) { + LOG_INFO(sLogger, ("FlusherOTLPHttpNative stopped", "success")); + return Flusher::Stop(isPipelineRemoving); +} + +bool FlusherOTLPHttpNative::Send(PipelineEventGroup&& g) { + ADD_COUNTER(mSendCnt, 1); + if (mFormat == OTLPHttpFormat::Protobuf) { + return SerializeAndPushProtobuf(std::move(g)); + } + return SerializeAndPush(std::move(g)); +} + +bool FlusherOTLPHttpNative::SerializeAndPush(PipelineEventGroup&& group) { + if (group.GetEvents().empty()) { + return true; + } + + BatchedEvents batched(std::move(group.MutableEvents()), + std::move(group.GetSizedTags()), + std::move(group.GetSourceBuffer()), + group.GetMetadata(EventGroupMetaKey::SOURCE_ID), + std::move(group.GetExactlyOnceCheckpoint())); + for (const auto& extraSourceBuffer : group.GetExtraSourceBuffers()) { + batched.mSourceBuffers.emplace_back(extraSourceBuffer); + } + + std::string serializedData, errorMsg; + if (!mSerializer->DoSerialize(std::move(batched), serializedData, errorMsg)) { + LOG_WARNING(sLogger, + ("failed to serialize OTLP event group", + errorMsg)("action", "discard data")("plugin", sName)("config", mContext->GetConfigName())); + return false; + } + + const size_t rawSize = serializedData.size(); + auto item = std::make_unique( + std::move(serializedData), rawSize, this, mQueueKey, RawDataType::EVENT_GROUP); + + return PushToQueue(std::move(item)); +} + +bool FlusherOTLPHttpNative::SerializeAndPushProtobuf(PipelineEventGroup&& group) { + if (group.GetEvents().empty()) { + return true; + } + + BatchedEvents batched(std::move(group.MutableEvents()), + std::move(group.GetSizedTags()), + std::move(group.GetSourceBuffer()), + group.GetMetadata(EventGroupMetaKey::SOURCE_ID), + std::move(group.GetExactlyOnceCheckpoint())); + for (const auto& extraSourceBuffer : group.GetExtraSourceBuffers()) { + batched.mSourceBuffers.emplace_back(extraSourceBuffer); + } + + std::string serializedData, errorMsg; + if (!mSerializer->SerializeToBinaryString(std::move(batched), serializedData, errorMsg)) { + LOG_WARNING(sLogger, + ("failed to serialize OTLP event group to protobuf", + errorMsg)("action", "discard data")("plugin", sName)("config", mContext->GetConfigName())); + return false; + } + + const size_t rawSize = serializedData.size(); + auto item = std::make_unique( + std::move(serializedData), rawSize, this, mQueueKey, RawDataType::EVENT_GROUP); + + 
return PushToQueue(std::move(item)); +} + +bool FlusherOTLPHttpNative::Flush(size_t key) { + return true; +} + +bool FlusherOTLPHttpNative::FlushAll() { + return true; +} + +bool FlusherOTLPHttpNative::BuildRequest(SenderQueueItem* item, + std::unique_ptr& req, + bool* keepItem, + std::string* errMsg) { + *keepItem = true; + + if (item->mData.empty()) { + *keepItem = false; + return true; + } + + std::string body = item->mData; + + std::map headers; + headers["Content-Type"] = (mFormat == OTLPHttpFormat::Protobuf) ? "application/x-protobuf" : "application/json"; + for (const auto& [key, val] : mExtraHeaders) { + headers[key] = val; + } + + // Parse URL to extract host, port, and path + std::string url = mUrl; + std::string host; + int32_t port = mEnableTLS ? 443 : 80; + std::string path; + + // Remove scheme + size_t schemeEnd = url.find("://"); + if (schemeEnd != std::string::npos) { + url = url.substr(schemeEnd + 3); + } + + // Extract host and path + size_t pathStart = url.find('/'); + if (pathStart != std::string::npos) { + host = url.substr(0, pathStart); + path = url.substr(pathStart); + } else { + host = url; + path = "/"; + } + + // Extract port if present + size_t portSep = host.find(':'); + if (portSep != std::string::npos) { + port = std::stoi(host.substr(portSep + 1)); + host = host.substr(0, portSep); + } + + req = std::make_unique("POST", mEnableTLS, host, port, path, "", headers, std::move(body), item); + return true; +} + +void FlusherOTLPHttpNative::OnSendDone(const HttpResponse& response, SenderQueueItem* item) { + int32_t statusCode = response.GetStatusCode(); + if (statusCode >= 200 && statusCode < 300) { + ADD_COUNTER(mSendSuccessCnt, 1); + SenderQueueManager::GetInstance()->DecreaseConcurrencyLimiterInSendingCnt(item->mQueueKey); + DealSenderQueueItemAfterSend(item, false); + return; + } + + ADD_COUNTER(mSendFailCnt, 1); + bool shouldDiscard = false; + if (statusCode == 400 || statusCode == 413 || statusCode == 403 || statusCode == 404) { + shouldDiscard = true; + } + auto age + = std::chrono::duration_cast(std::chrono::system_clock::now() - item->mFirstEnqueTime) + .count(); + if (age > INT32_FLAG(discard_send_fail_interval)) { + shouldDiscard = true; + } + + if (shouldDiscard) { + LOG_WARNING(sLogger, + ("FlusherOTLPHttpNative discard item", statusCode)("age_s", age)( + "config-flusher-dst", QueueKeyManager::GetInstance()->GetName(item->mQueueKey))); + ADD_COUNTER(mDiscardedEventsTotal, 1); + } else { + LOG_WARNING(sLogger, ("FlusherOTLPHttpNative response error, will retry", statusCode)); + } + SenderQueueManager::GetInstance()->DecreaseConcurrencyLimiterInSendingCnt(item->mQueueKey); + DealSenderQueueItemAfterSend(item, !shouldDiscard); +} + +} // namespace logtail diff --git a/core/plugin/flusher/opentelemetry/FlusherOTLPHttpNative.h b/core/plugin/flusher/opentelemetry/FlusherOTLPHttpNative.h new file mode 100644 index 0000000000..6e54372a7f --- /dev/null +++ b/core/plugin/flusher/opentelemetry/FlusherOTLPHttpNative.h @@ -0,0 +1,70 @@ +/* + * Copyright 2025 iLogtail Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#include "collection_pipeline/plugin/interface/HttpFlusher.h" +#include "collection_pipeline/serializer/OTLPHttpSerializer.h" + +namespace logtail { + +enum class OTLPHttpFormat : uint8_t { JSON = 0, Protobuf = 1 }; + +class FlusherOTLPHttpNative : public HttpFlusher { +public: + static const std::string sName; + + const std::string& Name() const override { return sName; } + bool Init(const Json::Value& config, Json::Value& optionalGoPipeline) override; + bool Start() override; + bool Stop(bool isPipelineRemoving) override; + bool Send(PipelineEventGroup&& g) override; + bool Flush(size_t key) override; + bool FlushAll() override; + bool BuildRequest(SenderQueueItem* item, + std::unique_ptr& req, + bool* keepItem, + std::string* errMsg) override; + void OnSendDone(const HttpResponse& response, SenderQueueItem* item) override; + + const std::string& GetUrl() const { return mUrl; } + OTLPHttpFormat GetFormat() const { return mFormat; } + +private: + std::string mUrl; + OTLPHttpFormat mFormat = OTLPHttpFormat::Protobuf; + bool mEnableTLS = false; + std::unordered_map mExtraHeaders; + + CounterPtr mSendCnt; + CounterPtr mSendSuccessCnt; + CounterPtr mSendFailCnt; + CounterPtr mDiscardedEventsTotal; + + std::unique_ptr mSerializer; + + bool SerializeAndPush(PipelineEventGroup&& group); + bool SerializeAndPushProtobuf(PipelineEventGroup&& group); + +#ifdef APSARA_UNIT_TEST_MAIN + friend class FlusherOTLPHttpNativeUnittest; +#endif +}; + +} // namespace logtail diff --git a/core/plugin/flusher/opentelemetry/FlusherOTLPNative.cpp b/core/plugin/flusher/opentelemetry/FlusherOTLPNative.cpp new file mode 100644 index 0000000000..7f991b5fa6 --- /dev/null +++ b/core/plugin/flusher/opentelemetry/FlusherOTLPNative.cpp @@ -0,0 +1,519 @@ +/* + * Copyright 2025 iLogtail Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "plugin/flusher/opentelemetry/FlusherOTLPNative.h" + +#include + +#include "collection_pipeline/queue/QueueKeyManager.h" +#include "collection_pipeline/queue/SenderQueueManager.h" +#include "common/Flags.h" +#include "common/ParamExtractor.h" +#include "logger/Logger.h" +#include "models/LogEvent.h" +#include "models/MetricEvent.h" +#include "models/MetricValue.h" +#include "models/RawEvent.h" +#include "models/SpanEvent.h" + +DECLARE_FLAG_INT32(discard_send_fail_interval); + +namespace logtail { + +const std::string FlusherOTLPNative::sName = "flusher_otlp_native"; + +using OTLPGrpcDataType = OTLPGrpcCallContext::DataType; + +bool FlusherOTLPNative::Init(const Json::Value& config, Json::Value& /*optionalGoPipeline*/) { + std::string errorMsg; + + if (!GetMandatoryStringParam(config, "Endpoint", mEndpoint, errorMsg)) { + PARAM_ERROR_RETURN(mContext->GetLogger(), + mContext->GetAlarm(), + errorMsg, + sName, + mContext->GetConfigName(), + mContext->GetProjectName(), + mContext->GetLogstoreName(), + mContext->GetRegion()); + } + + if (config.isMember("TimeoutMs")) { + mTimeoutMs = config["TimeoutMs"].asInt(); + } + if (config.isMember("EnableTLS")) { + mEnableTLS = config["EnableTLS"].asBool(); + } + + // Parse extra headers + if (config.isMember("Headers") && config["Headers"].isObject()) { + const auto& headers = config["Headers"]; + auto memberNames = headers.getMemberNames(); + for (const auto& name : memberNames) { + mHeaders[name] = headers[name].asString(); + } + } + + // Create sender queue + GenerateQueueKey(mEndpoint); + SenderQueueManager::GetInstance()->CreateQueue(mQueueKey, mPluginID, mEndpoint, *mContext); + + // Metrics + mSendCnt = GetMetricsRecordRef().CreateCounter(METRIC_PLUGIN_OUT_EVENT_GROUPS_TOTAL); + mSendSuccessCnt = GetMetricsRecordRef().CreateCounter(METRIC_PLUGIN_OUT_SUCCESSFUL_EVENTS_TOTAL); + mSendFailCnt = GetMetricsRecordRef().CreateCounter(METRIC_PLUGIN_OUT_FAILED_EVENTS_TOTAL); + mDiscardedEventsTotal = GetMetricsRecordRef().CreateCounter(METRIC_PLUGIN_DISCARDED_EVENTS_TOTAL); + + LOG_INFO(sLogger, + ("FlusherOTLPNative initialized", "success")("endpoint", mEndpoint)("timeout_ms", mTimeoutMs)( + "tls", mEnableTLS)("headers", mHeaders.size())); + return true; +} + +bool FlusherOTLPNative::Start() { + if (!CreateGrpcChannel()) { + return false; + } + LOG_INFO(sLogger, ("FlusherOTLPNative started", "success")("endpoint", mEndpoint)); + return true; +} + +bool FlusherOTLPNative::Stop(bool isPipelineRemoving) { + mIsStopping.store(true); + + { + std::unique_lock lock(mStopMutex); + auto deadline = std::chrono::steady_clock::now() + std::chrono::seconds(5); + mStopCV.wait_until(lock, deadline, [this]() { return mInFlightCnt.load() == 0; }); + } + + if (mInFlightCnt.load() > 0) { + LOG_WARNING(sLogger, ("FlusherOTLPNative Stop timeout, cancelling in-flight RPCs", mInFlightCnt.load())); + CancelAllInFlight(); + + std::unique_lock lock(mStopMutex); + auto cancelDeadline = std::chrono::steady_clock::now() + std::chrono::seconds(3); + mStopCV.wait_until(lock, cancelDeadline, [this]() { return mInFlightCnt.load() == 0; }); + if (mInFlightCnt.load() > 0) { + LOG_ERROR(sLogger, ("FlusherOTLPNative cancelled RPCs still not drained", mInFlightCnt.load())); + } + } + + mChannel.reset(); + mLogsStub.reset(); + mMetricsStub.reset(); + mTraceStub.reset(); + + return Flusher::Stop(isPipelineRemoving); +} + +// ==================== Send (SenderQueue pattern) ==================== + +bool FlusherOTLPNative::Send(PipelineEventGroup&& g) { + if (g.GetEvents().empty()) 
{
+        return true;
+    }
+
+    ADD_COUNTER(mSendCnt, 1);
+
+    const auto& events = g.GetEvents();
+    bool hasLog = false, hasMetric = false, hasSpan = false;
+    for (const auto& event : events) {
+        if (event.Is<LogEvent>() || event.Is<RawEvent>())
+            hasLog = true;
+        else if (event.Is<MetricEvent>())
+            hasMetric = true;
+        else if (event.Is<SpanEvent>())
+            hasSpan = true;
+    }
+
+    if (!hasLog && !hasMetric && !hasSpan) {
+        return true;
+    }
+
+    bool allOk = true;
+    std::string errMsg;
+
+    if (hasLog) {
+        std::string data;
+        if (!SerializeLogsToOTLP(g, data, errMsg)) {
+            LOG_WARNING(sLogger, ("failed to serialize OTLP logs", errMsg)("config", mContext->GetConfigName()));
+            allOk = false;
+        } else {
+            const size_t rawSize = data.size();
+            auto item = std::make_unique<OTLPSenderQueueItem>(
+                std::move(data), rawSize, this, mQueueKey, OTLPGrpcDataType::Logs);
+            allOk &= PushToQueue(std::move(item));
+        }
+    }
+
+    if (hasMetric) {
+        std::string data;
+        if (!SerializeMetricsToOTLP(g, data, errMsg)) {
+            LOG_WARNING(sLogger, ("failed to serialize OTLP metrics", errMsg)("config", mContext->GetConfigName()));
+            allOk = false;
+        } else {
+            const size_t rawSize = data.size();
+            auto item = std::make_unique<OTLPSenderQueueItem>(
+                std::move(data), rawSize, this, mQueueKey, OTLPGrpcDataType::Metrics);
+            allOk &= PushToQueue(std::move(item));
+        }
+    }
+
+    if (hasSpan) {
+        std::string data;
+        if (!SerializeTracesToOTLP(g, data, errMsg)) {
+            LOG_WARNING(sLogger, ("failed to serialize OTLP traces", errMsg)("config", mContext->GetConfigName()));
+            allOk = false;
+        } else {
+            const size_t rawSize = data.size();
+            auto item = std::make_unique<OTLPSenderQueueItem>(
+                std::move(data), rawSize, this, mQueueKey, OTLPGrpcDataType::Traces);
+            allOk &= PushToQueue(std::move(item));
+        }
+    }
+
+    return allOk;
+}
+
+// ==================== Serialize helpers ====================
+
+bool FlusherOTLPNative::SerializeLogsToOTLP(const PipelineEventGroup& group,
+                                            std::string& serializedData,
+                                            std::string& errMsg) {
+    opentelemetry::proto::collector::logs::v1::ExportLogsServiceRequest request;
+    auto* resourceLogs = request.add_resource_logs();
+    auto* scopeLogs = resourceLogs->add_scope_logs();
+
+    for (const auto& event : group.GetEvents()) {
+        if (event.Is<LogEvent>()) {
+            const auto& logEvent = event.Cast<LogEvent>();
+            auto* logRecord = scopeLogs->add_log_records();
+            auto ts = logEvent.GetTimestamp();
+            auto tsNs = logEvent.GetTimestampNanosecond();
+            uint64_t timeUnixNano = static_cast<uint64_t>(ts) * 1000000000ULL;
+            if (tsNs.has_value()) {
+                timeUnixNano += tsNs.value();
+            }
+            logRecord->set_time_unix_nano(timeUnixNano);
+            auto* body = logRecord->mutable_body();
+            auto msg = logEvent.GetContent("content");
+            if (!msg.empty()) {
+                body->set_string_value(std::string(msg.data(), msg.size()));
+            }
+            auto level = logEvent.GetLevel();
+            if (!level.empty()) {
+                logRecord->set_severity_text(std::string(level.data(), level.size()));
+            }
+            for (auto it = logEvent.begin(); it != logEvent.end(); ++it) {
+                auto* attr = logRecord->add_attributes();
+                attr->set_key(std::string(it->first.data(), it->first.size()));
+                attr->mutable_value()->set_string_value(std::string(it->second.data(), it->second.size()));
+            }
+        } else if (event.Is<RawEvent>()) {
+            const auto& rawEvent = event.Cast<RawEvent>();
+            auto* logRecord = scopeLogs->add_log_records();
+            auto ts = rawEvent.GetTimestamp();
+            auto tsNs = rawEvent.GetTimestampNanosecond();
+            uint64_t timeUnixNano = static_cast<uint64_t>(ts) * 1000000000ULL;
+            if (tsNs.has_value()) {
+                timeUnixNano += tsNs.value();
+            }
+            logRecord->set_time_unix_nano(timeUnixNano);
+            auto* body = logRecord->mutable_body();
+            body->set_string_value(std::string(rawEvent.GetContent().data(),
rawEvent.GetContent().size())); + } + } + + if (!request.SerializeToString(&serializedData)) { + errMsg = "failed to serialize logs request"; + return false; + } + return true; +} + +bool FlusherOTLPNative::SerializeMetricsToOTLP(const PipelineEventGroup& group, + std::string& serializedData, + std::string& errMsg) { + opentelemetry::proto::collector::metrics::v1::ExportMetricsServiceRequest request; + auto* resourceMetrics = request.add_resource_metrics(); + auto* scopeMetrics = resourceMetrics->add_scope_metrics(); + + for (const auto& event : group.GetEvents()) { + if (!event.Is()) + continue; + const auto& metricEvent = event.Cast(); + auto* metric = scopeMetrics->add_metrics(); + metric->set_name(std::string(metricEvent.GetName().data(), metricEvent.GetName().size())); + + auto* gauge = metric->mutable_gauge(); + auto* dp = gauge->add_data_points(); + + auto ts = metricEvent.GetTimestamp(); + auto tsNs = metricEvent.GetTimestampNanosecond(); + uint64_t timeUnixNano = static_cast(ts) * 1000000000ULL; + if (tsNs.has_value()) { + timeUnixNano += tsNs.value(); + } + dp->set_time_unix_nano(timeUnixNano); + + if (auto* untyped = metricEvent.GetValue()) { + dp->set_as_double(untyped->mValue); + } else { + dp->set_as_double(0.0); + } + + for (auto tagIt = metricEvent.TagsBegin(); tagIt != metricEvent.TagsEnd(); ++tagIt) { + auto* label = dp->add_attributes(); + label->set_key(std::string(tagIt->first.data(), tagIt->first.size())); + label->mutable_value()->set_string_value(std::string(tagIt->second.data(), tagIt->second.size())); + } + } + + if (!request.SerializeToString(&serializedData)) { + errMsg = "failed to serialize metrics request"; + return false; + } + return true; +} + +bool FlusherOTLPNative::SerializeTracesToOTLP(const PipelineEventGroup& group, + std::string& serializedData, + std::string& errMsg) { + opentelemetry::proto::collector::trace::v1::ExportTraceServiceRequest request; + auto* resourceSpans = request.add_resource_spans(); + auto* scopeSpans = resourceSpans->add_scope_spans(); + + for (const auto& event : group.GetEvents()) { + if (!event.Is()) + continue; + const auto& spanEvent = event.Cast(); + auto* span = scopeSpans->add_spans(); + + span->set_trace_id(std::string(spanEvent.GetTraceId().data(), spanEvent.GetTraceId().size())); + span->set_span_id(std::string(spanEvent.GetSpanId().data(), spanEvent.GetSpanId().size())); + span->set_parent_span_id(std::string(spanEvent.GetParentSpanId().data(), spanEvent.GetParentSpanId().size())); + span->set_name(std::string(spanEvent.GetName().data(), spanEvent.GetName().size())); + span->set_start_time_unix_nano(spanEvent.GetStartTimeNs()); + span->set_end_time_unix_nano(spanEvent.GetEndTimeNs()); + + switch (spanEvent.GetKind()) { + case SpanEvent::Kind::Internal: + span->set_kind(opentelemetry::proto::trace::v1::Span::SPAN_KIND_INTERNAL); + break; + case SpanEvent::Kind::Server: + span->set_kind(opentelemetry::proto::trace::v1::Span::SPAN_KIND_SERVER); + break; + case SpanEvent::Kind::Client: + span->set_kind(opentelemetry::proto::trace::v1::Span::SPAN_KIND_CLIENT); + break; + case SpanEvent::Kind::Producer: + span->set_kind(opentelemetry::proto::trace::v1::Span::SPAN_KIND_PRODUCER); + break; + case SpanEvent::Kind::Consumer: + span->set_kind(opentelemetry::proto::trace::v1::Span::SPAN_KIND_CONSUMER); + break; + default: + span->set_kind(opentelemetry::proto::trace::v1::Span::SPAN_KIND_UNSPECIFIED); + break; + } + + switch (spanEvent.GetStatus()) { + case SpanEvent::StatusCode::Ok: + 
span->mutable_status()->set_code(opentelemetry::proto::trace::v1::Status::STATUS_CODE_OK);
+                break;
+            case SpanEvent::StatusCode::Error:
+                span->mutable_status()->set_code(opentelemetry::proto::trace::v1::Status::STATUS_CODE_ERROR);
+                break;
+            default:
+                span->mutable_status()->set_code(opentelemetry::proto::trace::v1::Status::STATUS_CODE_UNSET);
+                break;
+        }
+
+        for (auto tagIt = spanEvent.TagsBegin(); tagIt != spanEvent.TagsEnd(); ++tagIt) {
+            auto* attr = span->add_attributes();
+            attr->set_key(std::string(tagIt->first.data(), tagIt->first.size()));
+            attr->mutable_value()->set_string_value(std::string(tagIt->second.data(), tagIt->second.size()));
+        }
+    }
+
+    if (!request.SerializeToString(&serializedData)) {
+        errMsg = "failed to serialize traces request";
+        return false;
+    }
+    return true;
+}
+
+// ==================== BuildGrpcRequest (reconstruct from binary) ====================
+
+bool FlusherOTLPNative::BuildGrpcRequest(SenderQueueItem* item,
+                                         std::unique_ptr<OTLPGrpcCallContext>& ctx,
+                                         bool* keepItem,
+                                         std::string* errMsg) {
+    auto* otlpItem = static_cast<OTLPSenderQueueItem*>(item);
+    ctx = std::make_unique<OTLPGrpcCallContext>();
+    ctx->item = item;
+    ctx->type = otlpItem->dataType;
+
+    switch (otlpItem->dataType) {
+        case OTLPGrpcDataType::Logs: {
+            ctx->logsReq = std::make_unique<opentelemetry::proto::collector::logs::v1::ExportLogsServiceRequest>();
+            ctx->logsResp = std::make_unique<opentelemetry::proto::collector::logs::v1::ExportLogsServiceResponse>();
+            if (!ctx->logsReq->ParseFromString(item->mData)) {
+                *errMsg = "failed to parse logs request from binary";
+                *keepItem = false;
+                return false;
+            }
+            break;
+        }
+        case OTLPGrpcDataType::Metrics: {
+            ctx->metricsReq
+                = std::make_unique<opentelemetry::proto::collector::metrics::v1::ExportMetricsServiceRequest>();
+            ctx->metricsResp
+                = std::make_unique<opentelemetry::proto::collector::metrics::v1::ExportMetricsServiceResponse>();
+            if (!ctx->metricsReq->ParseFromString(item->mData)) {
+                *errMsg = "failed to parse metrics request from binary";
+                *keepItem = false;
+                return false;
+            }
+            break;
+        }
+        case OTLPGrpcDataType::Traces: {
+            ctx->traceReq = std::make_unique<opentelemetry::proto::collector::trace::v1::ExportTraceServiceRequest>();
+            ctx->traceResp = std::make_unique<opentelemetry::proto::collector::trace::v1::ExportTraceServiceResponse>();
+            if (!ctx->traceReq->ParseFromString(item->mData)) {
+                *errMsg = "failed to parse traces request from binary";
+                *keepItem = false;
+                return false;
+            }
+            break;
+        }
+        default:
+            *errMsg = "unknown OTLP data type in queue item";
+            *keepItem = false;
+            return false;
+    }
+
+    return true;
+}
+
+// ==================== OnSendDone ====================
+
+void FlusherOTLPNative::OnSendDone(const grpc::Status& status, SenderQueueItem* item) {
+    if (status.ok()) {
+        SenderQueueManager::GetInstance()->DecreaseConcurrencyLimiterInSendingCnt(item->mQueueKey);
+        DealSenderQueueItemAfterSend(item, false);
+        return;
+    }
+
+    bool shouldDiscard = false;
+    auto code = status.error_code();
+    if (code == grpc::StatusCode::INVALID_ARGUMENT || code == grpc::StatusCode::UNIMPLEMENTED
+        || code == grpc::StatusCode::PERMISSION_DENIED || code == grpc::StatusCode::UNAUTHENTICATED) {
+        shouldDiscard = true;
+    }
+    auto age
+        = std::chrono::duration_cast<std::chrono::seconds>(std::chrono::system_clock::now() - item->mFirstEnqueTime)
+              .count();
+    if (age > INT32_FLAG(discard_send_fail_interval)) {
+        shouldDiscard = true;
+    }
+
+    if (shouldDiscard) {
+        LOG_WARNING(sLogger,
+                    ("FlusherOTLPNative discard item", status.error_message())("code", code)("age_s", age)(
+                        "config-flusher-dst", QueueKeyManager::GetInstance()->GetName(item->mQueueKey)));
+        ADD_COUNTER(mDiscardedEventsTotal, 1);
+    }
+    SenderQueueManager::GetInstance()->DecreaseConcurrencyLimiterInSendingCnt(item->mQueueKey);
+    DealSenderQueueItemAfterSend(item, !shouldDiscard);
+}
+
+// ==================== HandleGrpcCallback (called by GrpcSink) ====================
+
+void FlusherOTLPNative::HandleGrpcCallback(grpc::Status&& status, OTLPGrpcCallContext* ctx)
{ + UntrackContext(ctx->context.get()); + + if (status.ok()) { + ADD_COUNTER(mSendSuccessCnt, 1); + } else { + ADD_COUNTER(mSendFailCnt, 1); + if (status.error_code() == grpc::StatusCode::CANCELLED && mIsStopping.load()) { + LOG_INFO(sLogger, ("FlusherOTLPNative RPC cancelled during shutdown", "")); + } else { + LOG_WARNING(sLogger, + ("FlusherOTLPNative async Export failed", status.error_message())("code", status.error_code())); + } + } + + OnSendDone(status, ctx->item); + DecInFlight(); + delete ctx; +} + +// ==================== In-flight context tracking ==================== + +void FlusherOTLPNative::TrackContext(grpc::ClientContext* ctx) { + std::lock_guard lock(mContextsMutex); + mInFlightContexts.insert(ctx); +} + +void FlusherOTLPNative::UntrackContext(grpc::ClientContext* ctx) { + std::lock_guard lock(mContextsMutex); + mInFlightContexts.erase(ctx); +} + +void FlusherOTLPNative::CancelAllInFlight() { + std::lock_guard lock(mContextsMutex); + for (auto* ctx : mInFlightContexts) { + ctx->TryCancel(); + } +} + +// ==================== gRPC Channel ==================== + +bool FlusherOTLPNative::CreateGrpcChannel() { + try { + grpc::ChannelArguments args; + args.SetInt(GRPC_ARG_MAX_SEND_MESSAGE_LENGTH, 64 * 1024 * 1024); // 64MB + args.SetInt(GRPC_ARG_MAX_RECEIVE_MESSAGE_LENGTH, 64 * 1024 * 1024); + + std::shared_ptr creds; + if (mEnableTLS) { + creds = grpc::SslCredentials(grpc::SslCredentialsOptions()); + } else { + creds = grpc::InsecureChannelCredentials(); + } + + mChannel = grpc::CreateCustomChannel(mEndpoint, creds, args); + if (!mChannel) { + LOG_ERROR(sLogger, ("FlusherOTLPNative failed to create gRPC channel", mEndpoint)); + return false; + } + + mLogsStub = OTLPLogsService::NewStub(mChannel); + mMetricsStub = OTLPMetricsService::NewStub(mChannel); + mTraceStub = OTLPTraceService::NewStub(mChannel); + + LOG_INFO(sLogger, ("FlusherOTLPNative gRPC channel created", mEndpoint)); + return true; + } catch (const std::exception& e) { + LOG_ERROR(sLogger, ("FlusherOTLPNative exception creating gRPC channel", e.what())); + return false; + } +} + +} // namespace logtail diff --git a/core/plugin/flusher/opentelemetry/FlusherOTLPNative.h b/core/plugin/flusher/opentelemetry/FlusherOTLPNative.h new file mode 100644 index 0000000000..e0823db9f5 --- /dev/null +++ b/core/plugin/flusher/opentelemetry/FlusherOTLPNative.h @@ -0,0 +1,156 @@ +/* + * Copyright 2025 iLogtail Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "collection_pipeline/plugin/interface/Flusher.h" +#include "protobuf/opentelemetry/proto/collector/logs/v1/logs_service.grpc.pb.h" +#include "protobuf/opentelemetry/proto/collector/metrics/v1/metrics_service.grpc.pb.h" +#include "protobuf/opentelemetry/proto/collector/trace/v1/trace_service.grpc.pb.h" +#include "runner/sink/SinkType.h" + +namespace logtail { + +using OTLPLogsService = opentelemetry::proto::collector::logs::v1::LogsService; +using OTLPMetricsService = opentelemetry::proto::collector::metrics::v1::MetricsService; +using OTLPTraceService = opentelemetry::proto::collector::trace::v1::TraceService; + +// Context for a single async gRPC Export call. +// Allocated in BuildGrpcRequest, deleted in HandleGrpcCallback. +struct OTLPGrpcCallContext { + SenderQueueItem* item = nullptr; + std::unique_ptr context; + + enum class DataType : uint8_t { Logs, Metrics, Traces }; + DataType type = DataType::Logs; + + // Only the fields matching `type` are populated. + std::unique_ptr logsReq; + std::unique_ptr logsResp; + std::unique_ptr metricsReq; + std::unique_ptr metricsResp; + std::unique_ptr traceReq; + std::unique_ptr traceResp; +}; + +// Custom SenderQueueItem that carries OTLP data type info. +struct OTLPSenderQueueItem : public SenderQueueItem { + OTLPGrpcCallContext::DataType dataType; + + OTLPSenderQueueItem( + std::string&& data, size_t rawSize, Flusher* flusher, QueueKey key, OTLPGrpcCallContext::DataType type) + : SenderQueueItem(std::move(data), rawSize, flusher, key, RawDataType::EVENT_GROUP), dataType(type) {} + + OTLPSenderQueueItem(const OTLPSenderQueueItem& other) : SenderQueueItem(other), dataType(other.dataType) {} + + ~OTLPSenderQueueItem() override = default; + + OTLPSenderQueueItem* Clone() override { return new OTLPSenderQueueItem(*this); } +}; + +class FlusherOTLPNative : public Flusher { +public: + static const std::string sName; + + const std::string& Name() const override { return sName; } + bool Init(const Json::Value& config, Json::Value& optionalGoPipeline) override; + bool Start() override; + bool Stop(bool isPipelineRemoving) override; + bool Send(PipelineEventGroup&& g) override; + bool Flush(size_t /*key*/) override { return true; } + bool FlushAll() override { return true; } + + SinkType GetSinkType() override { return SinkType::GRPC; } + + // Build gRPC call context from SenderQueueItem binary data. + bool BuildGrpcRequest(SenderQueueItem* item, + std::unique_ptr& ctx, + bool* keepItem, + std::string* errMsg); + + // Called after async Export completes (handles retry/release). + void OnSendDone(const grpc::Status& status, SenderQueueItem* item); + + // Public getters for GrpcSink to access stubs. + OTLPLogsService::Stub* GetLogsStub() const { return mLogsStub.get(); } + OTLPMetricsService::Stub* GetMetricsStub() const { return mMetricsStub.get(); } + OTLPTraceService::Stub* GetTraceStub() const { return mTraceStub.get(); } + int32_t GetTimeoutMs() const { return mTimeoutMs; } + const std::unordered_map& GetHeaders() const { return mHeaders; } + + // Called by GrpcSink callback to handle the async Export result. 
+ void HandleGrpcCallback(grpc::Status&& status, OTLPGrpcCallContext* ctx); + + // In-flight tracking for Stop() + void IncInFlight() { mInFlightCnt.fetch_add(1); } + void DecInFlight() { + mInFlightCnt.fetch_sub(1); + mStopCV.notify_all(); + } + int32_t InFlightCount() const { return mInFlightCnt.load(); } + + void TrackContext(grpc::ClientContext* ctx); + void UntrackContext(grpc::ClientContext* ctx); + void CancelAllInFlight(); + +private: + std::string mEndpoint; + int32_t mTimeoutMs = 30000; + std::unordered_map mHeaders; + bool mEnableTLS = false; + + std::shared_ptr mChannel; + std::unique_ptr mLogsStub; + std::unique_ptr mMetricsStub; + std::unique_ptr mTraceStub; + + std::atomic mInFlightCnt{0}; + std::mutex mStopMutex; + std::condition_variable mStopCV; + std::atomic mIsStopping{false}; + + std::mutex mContextsMutex; + std::set mInFlightContexts; + + CounterPtr mSendCnt; + CounterPtr mSendSuccessCnt; + CounterPtr mSendFailCnt; + CounterPtr mDiscardedEventsTotal; + + bool CreateGrpcChannel(); + + // Serialize PipelineEventGroup to OTLP protobuf binary string (by type). + bool SerializeLogsToOTLP(const PipelineEventGroup& group, std::string& serializedData, std::string& errMsg); + bool SerializeMetricsToOTLP(const PipelineEventGroup& group, std::string& serializedData, std::string& errMsg); + bool SerializeTracesToOTLP(const PipelineEventGroup& group, std::string& serializedData, std::string& errMsg); + +#ifdef APSARA_UNIT_TEST_MAIN + friend class FlusherOTLPNativeUnittest; + friend class GrpcSinkMock; +#endif +}; + +} // namespace logtail diff --git a/core/plugin/input/InputForward.cpp b/core/plugin/input/InputForward.cpp index eba92a1d21..1cc949bc3b 100644 --- a/core/plugin/input/InputForward.cpp +++ b/core/plugin/input/InputForward.cpp @@ -23,6 +23,7 @@ #include "common/ParamExtractor.h" #include "forward/GrpcInputManager.h" #include "forward/loongsuite/LoongSuiteForwardService.h" +#include "forward/otlp/OTLPForwardService.h" #include "logger/Logger.h" #include "plugin/processor/inner/ProcessorParseFromPBNative.h" @@ -32,7 +33,7 @@ const std::string InputForward::sName = "input_forward"; const std::unordered_set InputForward::sSupportedProtocols = { "LoongSuite", - // TODO: add more protocols here + "OTLP", }; bool InputForward::Init(const Json::Value& config, Json::Value& optionalGoPipeline) { @@ -96,6 +97,9 @@ bool InputForward::Start() { return false; } mInnerProcessors.emplace_back(std::move(processor)); + } else if (mProtocol == "OTLP") { + result = GrpcInputManager::GetInstance()->AddListenInput( + mConfigName, mEndpoint, mForwardConfig); } else { LOG_WARNING(sLogger, ("Protocol not fully implemented, should not happen", mProtocol)("config", mConfigName)); } diff --git a/core/protobuf/opentelemetry/proto/collector/logs/v1/logs_service.proto b/core/protobuf/opentelemetry/proto/collector/logs/v1/logs_service.proto new file mode 100644 index 0000000000..8260d8aaeb --- /dev/null +++ b/core/protobuf/opentelemetry/proto/collector/logs/v1/logs_service.proto @@ -0,0 +1,79 @@ +// Copyright 2020, OpenTelemetry Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto3"; + +package opentelemetry.proto.collector.logs.v1; + +import "opentelemetry/proto/logs/v1/logs.proto"; + +option csharp_namespace = "OpenTelemetry.Proto.Collector.Logs.V1"; +option java_multiple_files = true; +option java_package = "io.opentelemetry.proto.collector.logs.v1"; +option java_outer_classname = "LogsServiceProto"; +option go_package = "go.opentelemetry.io/proto/otlp/collector/logs/v1"; + +// Service that can be used to push logs between one Application instrumented with +// OpenTelemetry and an collector, or between an collector and a central collector (in this +// case logs are sent/received to/from multiple Applications). +service LogsService { + // For performance reasons, it is recommended to keep this RPC + // alive for the entire life of the application. + rpc Export(ExportLogsServiceRequest) returns (ExportLogsServiceResponse) {} +} + +message ExportLogsServiceRequest { + // An array of ResourceLogs. + // For data coming from a single resource this array will typically contain one + // element. Intermediary nodes (such as OpenTelemetry Collector) that receive + // data from multiple origins typically batch the data before forwarding further and + // in that case this array will contain multiple elements. + repeated opentelemetry.proto.logs.v1.ResourceLogs resource_logs = 1; +} + +message ExportLogsServiceResponse { + // The details of a partially successful export request. + // + // If the request is only partially accepted + // (i.e. when the server accepts only parts of the data and rejects the rest) + // the server MUST initialize the `partial_success` field and MUST + // set the `rejected_` with the number of items it rejected. + // + // Servers MAY also make use of the `partial_success` field to convey + // warnings/suggestions to senders even when the request was fully accepted. + // In such cases, the `rejected_` MUST have a value of `0` and + // the `error_message` MUST be non-empty. + // + // A `partial_success` message with an empty value (rejected_ = 0 and + // `error_message` = "") is equivalent to it not being set/present. Senders + // SHOULD interpret it the same way as in the full success case. + ExportLogsPartialSuccess partial_success = 1; +} + +message ExportLogsPartialSuccess { + // The number of rejected log records. + // + // A `rejected_` field holding a `0` value indicates that the + // request was fully accepted. + int64 rejected_log_records = 1; + + // A developer-facing human-readable message in English. It should be used + // either to explain why the server rejected parts of the data during a partial + // success or to convey warnings/suggestions during a full success. The message + // should offer guidance on how users can address such issues. + // + // error_message is an optional field. An error_message with an empty value + // is equivalent to it not being set. + string error_message = 2; +} diff --git a/core/protobuf/opentelemetry/proto/collector/metrics/v1/metrics_service.proto b/core/protobuf/opentelemetry/proto/collector/metrics/v1/metrics_service.proto new file mode 100644 index 0000000000..dd48f1ad3a --- /dev/null +++ b/core/protobuf/opentelemetry/proto/collector/metrics/v1/metrics_service.proto @@ -0,0 +1,79 @@ +// Copyright 2019, OpenTelemetry Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto3"; + +package opentelemetry.proto.collector.metrics.v1; + +import "opentelemetry/proto/metrics/v1/metrics.proto"; + +option csharp_namespace = "OpenTelemetry.Proto.Collector.Metrics.V1"; +option java_multiple_files = true; +option java_package = "io.opentelemetry.proto.collector.metrics.v1"; +option java_outer_classname = "MetricsServiceProto"; +option go_package = "go.opentelemetry.io/proto/otlp/collector/metrics/v1"; + +// Service that can be used to push metrics between one Application +// instrumented with OpenTelemetry and a collector, or between a collector and a +// central collector. +service MetricsService { + // For performance reasons, it is recommended to keep this RPC + // alive for the entire life of the application. + rpc Export(ExportMetricsServiceRequest) returns (ExportMetricsServiceResponse) {} +} + +message ExportMetricsServiceRequest { + // An array of ResourceMetrics. + // For data coming from a single resource this array will typically contain one + // element. Intermediary nodes (such as OpenTelemetry Collector) that receive + // data from multiple origins typically batch the data before forwarding further and + // in that case this array will contain multiple elements. + repeated opentelemetry.proto.metrics.v1.ResourceMetrics resource_metrics = 1; +} + +message ExportMetricsServiceResponse { + // The details of a partially successful export request. + // + // If the request is only partially accepted + // (i.e. when the server accepts only parts of the data and rejects the rest) + // the server MUST initialize the `partial_success` field and MUST + // set the `rejected_` with the number of items it rejected. + // + // Servers MAY also make use of the `partial_success` field to convey + // warnings/suggestions to senders even when the request was fully accepted. + // In such cases, the `rejected_` MUST have a value of `0` and + // the `error_message` MUST be non-empty. + // + // A `partial_success` message with an empty value (rejected_ = 0 and + // `error_message` = "") is equivalent to it not being set/present. Senders + // SHOULD interpret it the same way as in the full success case. + ExportMetricsPartialSuccess partial_success = 1; +} + +message ExportMetricsPartialSuccess { + // The number of rejected data points. + // + // A `rejected_` field holding a `0` value indicates that the + // request was fully accepted. + int64 rejected_data_points = 1; + + // A developer-facing human-readable message in English. It should be used + // either to explain why the server rejected parts of the data during a partial + // success or to convey warnings/suggestions during a full success. The message + // should offer guidance on how users can address such issues. + // + // error_message is an optional field. An error_message with an empty value + // is equivalent to it not being set. 
+ string error_message = 2; +} diff --git a/core/protobuf/opentelemetry/proto/collector/trace/v1/trace_service.proto b/core/protobuf/opentelemetry/proto/collector/trace/v1/trace_service.proto new file mode 100644 index 0000000000..d6fe67f9e5 --- /dev/null +++ b/core/protobuf/opentelemetry/proto/collector/trace/v1/trace_service.proto @@ -0,0 +1,79 @@ +// Copyright 2019, OpenTelemetry Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto3"; + +package opentelemetry.proto.collector.trace.v1; + +import "opentelemetry/proto/trace/v1/trace.proto"; + +option csharp_namespace = "OpenTelemetry.Proto.Collector.Trace.V1"; +option java_multiple_files = true; +option java_package = "io.opentelemetry.proto.collector.trace.v1"; +option java_outer_classname = "TraceServiceProto"; +option go_package = "go.opentelemetry.io/proto/otlp/collector/trace/v1"; + +// Service that can be used to push spans between one Application instrumented with +// OpenTelemetry and a collector, or between a collector and a central collector (in this +// case spans are sent/received to/from multiple Applications). +service TraceService { + // For performance reasons, it is recommended to keep this RPC + // alive for the entire life of the application. + rpc Export(ExportTraceServiceRequest) returns (ExportTraceServiceResponse) {} +} + +message ExportTraceServiceRequest { + // An array of ResourceSpans. + // For data coming from a single resource this array will typically contain one + // element. Intermediary nodes (such as OpenTelemetry Collector) that receive + // data from multiple origins typically batch the data before forwarding further and + // in that case this array will contain multiple elements. + repeated opentelemetry.proto.trace.v1.ResourceSpans resource_spans = 1; +} + +message ExportTraceServiceResponse { + // The details of a partially successful export request. + // + // If the request is only partially accepted + // (i.e. when the server accepts only parts of the data and rejects the rest) + // the server MUST initialize the `partial_success` field and MUST + // set the `rejected_` with the number of items it rejected. + // + // Servers MAY also make use of the `partial_success` field to convey + // warnings/suggestions to senders even when the request was fully accepted. + // In such cases, the `rejected_` MUST have a value of `0` and + // the `error_message` MUST be non-empty. + // + // A `partial_success` message with an empty value (rejected_ = 0 and + // `error_message` = "") is equivalent to it not being set/present. Senders + // SHOULD interpret it the same way as in the full success case. + ExportTracePartialSuccess partial_success = 1; +} + +message ExportTracePartialSuccess { + // The number of rejected spans. + // + // A `rejected_` field holding a `0` value indicates that the + // request was fully accepted. + int64 rejected_spans = 1; + + // A developer-facing human-readable message in English. 
It should be used + // either to explain why the server rejected parts of the data during a partial + // success or to convey warnings/suggestions during a full success. The message + // should offer guidance on how users can address such issues. + // + // error_message is an optional field. An error_message with an empty value + // is equivalent to it not being set. + string error_message = 2; +} diff --git a/core/protobuf/opentelemetry/proto/common/v1/common.proto b/core/protobuf/opentelemetry/proto/common/v1/common.proto new file mode 100644 index 0000000000..ff8a21a1fa --- /dev/null +++ b/core/protobuf/opentelemetry/proto/common/v1/common.proto @@ -0,0 +1,81 @@ +// Copyright 2019, OpenTelemetry Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto3"; + +package opentelemetry.proto.common.v1; + +option csharp_namespace = "OpenTelemetry.Proto.Common.V1"; +option java_multiple_files = true; +option java_package = "io.opentelemetry.proto.common.v1"; +option java_outer_classname = "CommonProto"; +option go_package = "go.opentelemetry.io/proto/otlp/common/v1"; + +// AnyValue is used to represent any type of attribute value. AnyValue may contain a +// primitive value such as a string or integer or it may contain an arbitrary nested +// object containing arrays, key-value lists and primitives. +message AnyValue { + // The value is one of the listed fields. It is valid for all values to be unspecified + // in which case this AnyValue is considered to be "empty". + oneof value { + string string_value = 1; + bool bool_value = 2; + int64 int_value = 3; + double double_value = 4; + ArrayValue array_value = 5; + KeyValueList kvlist_value = 6; + bytes bytes_value = 7; + } +} + +// ArrayValue is a list of AnyValue messages. We need ArrayValue as a message +// since oneof in AnyValue does not allow repeated fields. +message ArrayValue { + // Array of values. The array may be empty (contain 0 elements). + repeated AnyValue values = 1; +} + +// KeyValueList is a list of KeyValue messages. We need KeyValueList as a message +// since `oneof` in AnyValue does not allow repeated fields. Everywhere else where we need +// a list of KeyValue messages (e.g. in Span) we use `repeated KeyValue` directly to +// avoid unnecessary extra wrapping (which slows down the protocol). The 2 approaches +// are semantically equivalent. +message KeyValueList { + // A collection of key/value pairs of key-value pairs. The list may be empty (may + // contain 0 elements). + // The keys MUST be unique (it is not allowed to have more than one + // value with the same key). + repeated KeyValue values = 1; +} + +// KeyValue is a key-value pair that is used to store Span attributes, Link +// attributes, etc. +message KeyValue { + string key = 1; + AnyValue value = 2; +} + +// InstrumentationScope is a message representing the instrumentation scope information +// such as the fully qualified name and version. +message InstrumentationScope { + // An empty instrumentation scope name means the name is unknown. 
+ string name = 1; + string version = 2; + + // Additional attributes that describe the scope. [Optional]. + // Attribute keys MUST be unique (it is not allowed to have more than one + // attribute with the same key). + repeated KeyValue attributes = 3; + uint32 dropped_attributes_count = 4; +} diff --git a/core/protobuf/opentelemetry/proto/logs/v1/logs.proto b/core/protobuf/opentelemetry/proto/logs/v1/logs.proto new file mode 100644 index 0000000000..261d22916b --- /dev/null +++ b/core/protobuf/opentelemetry/proto/logs/v1/logs.proto @@ -0,0 +1,227 @@ +// Copyright 2020, OpenTelemetry Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto3"; + +package opentelemetry.proto.logs.v1; + +import "opentelemetry/proto/common/v1/common.proto"; +import "opentelemetry/proto/resource/v1/resource.proto"; + +option csharp_namespace = "OpenTelemetry.Proto.Logs.V1"; +option java_multiple_files = true; +option java_package = "io.opentelemetry.proto.logs.v1"; +option java_outer_classname = "LogsProto"; +option go_package = "go.opentelemetry.io/proto/otlp/logs/v1"; + +// LogsData represents the logs data that can be stored in a persistent storage, +// OR can be embedded by other protocols that transfer OTLP logs data but do not +// implement the OTLP protocol. +// +// The main difference between this message and collector protocol is that +// in this message there will not be any "control" or "metadata" specific to +// OTLP protocol. +// +// When new fields are added into this message, the OTLP request MUST be updated +// as well. +message LogsData { + // An array of ResourceLogs. + // For data coming from a single resource this array will typically contain + // one element. Intermediary nodes that receive data from multiple origins + // typically batch the data before forwarding further and in that case this + // array will contain multiple elements. + repeated ResourceLogs resource_logs = 1; +} + +// A collection of ScopeLogs from a Resource. +message ResourceLogs { + reserved 1000; + + // The resource for the logs in this message. + // If this field is not set then resource info is unknown. + opentelemetry.proto.resource.v1.Resource resource = 1; + + // A list of ScopeLogs that originate from a resource. + repeated ScopeLogs scope_logs = 2; + + // The Schema URL, if known. This is the identifier of the Schema that the resource data + // is recorded in. Notably, the last part of the URL path is the version number of the + // schema: http[s]://server[:port]/path/. To learn more about Schema URL see + // https://opentelemetry.io/docs/specs/otel/schemas/#schema-url + // This schema_url applies to the data in the "resource" field. It does not apply + // to the data in the "scope_logs" field which have their own schema_url field. + string schema_url = 3; +} + +// A collection of Logs produced by a Scope. +message ScopeLogs { + // The instrumentation scope information for the logs in this message. 
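For orientation, a minimal sketch of how one attribute built from the common.proto types above (KeyValue wrapping an AnyValue) would look with the protoc-generated C++ classes. The header path and namespace alias are illustrative and depend on how the build wires up these vendored protos.

```cpp
#include <string>

#include "opentelemetry/proto/common/v1/common.pb.h"  // generated from the vendored common.proto (path illustrative)

namespace otel_common = opentelemetry::proto::common::v1;

// Builds one string-valued attribute, e.g. {"service.name": "loongcollector"}.
// AnyValue is a oneof, so setting string_value selects that branch of the value.
otel_common::KeyValue MakeStringAttribute(const std::string& key, const std::string& value) {
    otel_common::KeyValue kv;
    kv.set_key(key);
    kv.mutable_value()->set_string_value(value);
    return kv;
}
```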
+ // Semantically when InstrumentationScope isn't set, it is equivalent with + // an empty instrumentation scope name (unknown). + opentelemetry.proto.common.v1.InstrumentationScope scope = 1; + + // A list of log records. + repeated LogRecord log_records = 2; + + // The Schema URL, if known. This is the identifier of the Schema that the log data + // is recorded in. Notably, the last part of the URL path is the version number of the + // schema: http[s]://server[:port]/path/. To learn more about Schema URL see + // https://opentelemetry.io/docs/specs/otel/schemas/#schema-url + // This schema_url applies to all logs in the "logs" field. + string schema_url = 3; +} + +// Possible values for LogRecord.SeverityNumber. +enum SeverityNumber { + // UNSPECIFIED is the default SeverityNumber, it MUST NOT be used. + SEVERITY_NUMBER_UNSPECIFIED = 0; + SEVERITY_NUMBER_TRACE = 1; + SEVERITY_NUMBER_TRACE2 = 2; + SEVERITY_NUMBER_TRACE3 = 3; + SEVERITY_NUMBER_TRACE4 = 4; + SEVERITY_NUMBER_DEBUG = 5; + SEVERITY_NUMBER_DEBUG2 = 6; + SEVERITY_NUMBER_DEBUG3 = 7; + SEVERITY_NUMBER_DEBUG4 = 8; + SEVERITY_NUMBER_INFO = 9; + SEVERITY_NUMBER_INFO2 = 10; + SEVERITY_NUMBER_INFO3 = 11; + SEVERITY_NUMBER_INFO4 = 12; + SEVERITY_NUMBER_WARN = 13; + SEVERITY_NUMBER_WARN2 = 14; + SEVERITY_NUMBER_WARN3 = 15; + SEVERITY_NUMBER_WARN4 = 16; + SEVERITY_NUMBER_ERROR = 17; + SEVERITY_NUMBER_ERROR2 = 18; + SEVERITY_NUMBER_ERROR3 = 19; + SEVERITY_NUMBER_ERROR4 = 20; + SEVERITY_NUMBER_FATAL = 21; + SEVERITY_NUMBER_FATAL2 = 22; + SEVERITY_NUMBER_FATAL3 = 23; + SEVERITY_NUMBER_FATAL4 = 24; +} + +// LogRecordFlags represents constants used to interpret the +// LogRecord.flags field, which is protobuf 'fixed32' type and is to +// be used as bit-fields. Each non-zero value defined in this enum is +// a bit-mask. To extract the bit-field, for example, use an +// expression like: +// +// (logRecord.flags & LOG_RECORD_FLAGS_TRACE_FLAGS_MASK) +// +enum LogRecordFlags { + // The zero value for the enum. Should not be used for comparisons. + // Instead use bitwise "and" with the appropriate mask as shown above. + LOG_RECORD_FLAGS_DO_NOT_USE = 0; + + // Bits 0-7 are used for trace flags. + LOG_RECORD_FLAGS_TRACE_FLAGS_MASK = 0x000000FF; + + // Bits 8-31 are reserved for future use. +} + +// A log record according to OpenTelemetry Log Data Model: +// https://github.com/open-telemetry/oteps/blob/main/text/logs/0097-log-data-model.md +message LogRecord { + reserved 4; + + // time_unix_nano is the time when the event occurred. + // Value is UNIX Epoch time in nanoseconds since 00:00:00 UTC on 1 January 1970. + // Value of 0 indicates unknown or missing timestamp. + fixed64 time_unix_nano = 1; + + // Time when the event was observed by the collection system. + // For events that originate in OpenTelemetry (e.g. using OpenTelemetry Logging SDK) + // this timestamp is typically set at the generation time and is equal to Timestamp. + // For events originating externally and collected by OpenTelemetry (e.g. using + // Collector) this is the time when OpenTelemetry's code observed the event measured + // by the clock of the OpenTelemetry code. This field MUST be set once the event is + // observed by OpenTelemetry. + // + // For converting OpenTelemetry log data to formats that support only one timestamp or + // when receiving OpenTelemetry log data by recipients that support only one timestamp + // internally the following logic is recommended: + // - Use time_unix_nano if it is present, otherwise use observed_time_unix_nano. 
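A minimal sketch of the single-timestamp fallback recommended just above, assuming the protoc-generated C++ class for LogRecord (header path illustrative):

```cpp
#include <cstdint>

#include "opentelemetry/proto/logs/v1/logs.pb.h"  // generated from the vendored logs.proto (path illustrative)

// Picks the timestamp to keep when the destination supports only one.
// Both fields use 0 to mean "unknown or missing", so a zero check doubles as a presence check.
uint64_t EffectiveTimeNano(const opentelemetry::proto::logs::v1::LogRecord& record) {
    return record.time_unix_nano() != 0 ? record.time_unix_nano() : record.observed_time_unix_nano();
}
```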
+ // + // Value is UNIX Epoch time in nanoseconds since 00:00:00 UTC on 1 January 1970. + // Value of 0 indicates unknown or missing timestamp. + fixed64 observed_time_unix_nano = 11; + + // Numerical value of the severity, normalized to values described in Log Data Model. + // [Optional]. + SeverityNumber severity_number = 2; + + // The severity text (also known as log level). The original string representation as + // it is known at the source. [Optional]. + string severity_text = 3; + + // A value containing the body of the log record. Can be for example a human-readable + // string message (including multi-line) describing the event in a free form or it can + // be a structured data composed of arrays and maps of other values. [Optional]. + opentelemetry.proto.common.v1.AnyValue body = 5; + + // Additional attributes that describe the specific event occurrence. [Optional]. + // Attribute keys MUST be unique (it is not allowed to have more than one + // attribute with the same key). + repeated opentelemetry.proto.common.v1.KeyValue attributes = 6; + uint32 dropped_attributes_count = 7; + + // Flags, a bit field. 8 least significant bits are the trace flags as + // defined in W3C Trace Context specification. 24 most significant bits are reserved + // and must be set to 0. Readers must not assume that 24 most significant bits + // will be zero and must correctly mask the bits when reading 8-bit trace flag (use + // flags & LOG_RECORD_FLAGS_TRACE_FLAGS_MASK). [Optional]. + fixed32 flags = 8; + + // A unique identifier for a trace. All logs from the same trace share + // the same `trace_id`. The ID is a 16-byte array. An ID with all zeroes OR + // of length other than 16 bytes is considered invalid (empty string in OTLP/JSON + // is zero-length and thus is also invalid). + // + // This field is optional. + // + // The receivers SHOULD assume that the log record is not associated with a + // trace if any of the following is true: + // - the field is not present, + // - the field contains an invalid value. + bytes trace_id = 9; + + // A unique identifier for a span within a trace, assigned when the span + // is created. The ID is an 8-byte array. An ID with all zeroes OR of length + // other than 8 bytes is considered invalid (empty string in OTLP/JSON + // is zero-length and thus is also invalid). + // + // This field is optional. If the sender specifies a valid span_id then it SHOULD also + // specify a valid trace_id. + // + // The receivers SHOULD assume that the log record is not associated with a + // span if any of the following is true: + // - the field is not present, + // - the field contains an invalid value. + bytes span_id = 10; + + // A unique identifier of event category/type. + // All events with the same event_name are expected to conform to the same + // schema for both their attributes and their body. + // + // Recommended to be fully qualified and short (no longer than 256 characters). + // + // Presence of event_name on the log record identifies this record + // as an event. + // + // [Optional]. 
+ // + // Status: [Development] + string event_name = 12; +} diff --git a/core/protobuf/opentelemetry/proto/metrics/v1/metrics.proto b/core/protobuf/opentelemetry/proto/metrics/v1/metrics.proto new file mode 100644 index 0000000000..00c5112ce8 --- /dev/null +++ b/core/protobuf/opentelemetry/proto/metrics/v1/metrics.proto @@ -0,0 +1,714 @@ +// Copyright 2019, OpenTelemetry Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto3"; + +package opentelemetry.proto.metrics.v1; + +import "opentelemetry/proto/common/v1/common.proto"; +import "opentelemetry/proto/resource/v1/resource.proto"; + +option csharp_namespace = "OpenTelemetry.Proto.Metrics.V1"; +option java_multiple_files = true; +option java_package = "io.opentelemetry.proto.metrics.v1"; +option java_outer_classname = "MetricsProto"; +option go_package = "go.opentelemetry.io/proto/otlp/metrics/v1"; + +// MetricsData represents the metrics data that can be stored in a persistent +// storage, OR can be embedded by other protocols that transfer OTLP metrics +// data but do not implement the OTLP protocol. +// +// MetricsData +// └─── ResourceMetrics +// ├── Resource +// ├── SchemaURL +// └── ScopeMetrics +// ├── Scope +// ├── SchemaURL +// └── Metric +// ├── Name +// ├── Description +// ├── Unit +// └── data +// ├── Gauge +// ├── Sum +// ├── Histogram +// ├── ExponentialHistogram +// └── Summary +// +// The main difference between this message and collector protocol is that +// in this message there will not be any "control" or "metadata" specific to +// OTLP protocol. +// +// When new fields are added into this message, the OTLP request MUST be updated +// as well. +message MetricsData { + // An array of ResourceMetrics. + // For data coming from a single resource this array will typically contain + // one element. Intermediary nodes that receive data from multiple origins + // typically batch the data before forwarding further and in that case this + // array will contain multiple elements. + repeated ResourceMetrics resource_metrics = 1; +} + +// A collection of ScopeMetrics from a Resource. +message ResourceMetrics { + reserved 1000; + + // The resource for the metrics in this message. + // If this field is not set then no resource info is known. + opentelemetry.proto.resource.v1.Resource resource = 1; + + // A list of metrics that originate from a resource. + repeated ScopeMetrics scope_metrics = 2; + + // The Schema URL, if known. This is the identifier of the Schema that the resource data + // is recorded in. Notably, the last part of the URL path is the version number of the + // schema: http[s]://server[:port]/path/. To learn more about Schema URL see + // https://opentelemetry.io/docs/specs/otel/schemas/#schema-url + // This schema_url applies to the data in the "resource" field. It does not apply + // to the data in the "scope_metrics" field which have their own schema_url field. + string schema_url = 3; +} + +// A collection of Metrics produced by an Scope. 
+message ScopeMetrics { + // The instrumentation scope information for the metrics in this message. + // Semantically when InstrumentationScope isn't set, it is equivalent with + // an empty instrumentation scope name (unknown). + opentelemetry.proto.common.v1.InstrumentationScope scope = 1; + + // A list of metrics that originate from an instrumentation library. + repeated Metric metrics = 2; + + // The Schema URL, if known. This is the identifier of the Schema that the metric data + // is recorded in. Notably, the last part of the URL path is the version number of the + // schema: http[s]://server[:port]/path/. To learn more about Schema URL see + // https://opentelemetry.io/docs/specs/otel/schemas/#schema-url + // This schema_url applies to all metrics in the "metrics" field. + string schema_url = 3; +} + +// Defines a Metric which has one or more timeseries. The following is a +// brief summary of the Metric data model. For more details, see: +// +// https://github.com/open-telemetry/opentelemetry-specification/blob/main/specification/metrics/data-model.md +// +// The data model and relation between entities is shown in the +// diagram below. Here, "DataPoint" is the term used to refer to any +// one of the specific data point value types, and "points" is the term used +// to refer to any one of the lists of points contained in the Metric. +// +// - Metric is composed of a metadata and data. +// - Metadata part contains a name, description, unit. +// - Data is one of the possible types (Sum, Gauge, Histogram, Summary). +// - DataPoint contains timestamps, attributes, and one of the possible value type +// fields. +// +// Metric +// +------------+ +// |name | +// |description | +// |unit | +------------------------------------+ +// |data |---> |Gauge, Sum, Histogram, Summary, ... | +// +------------+ +------------------------------------+ +// +// Data [One of Gauge, Sum, Histogram, Summary, ...] +// +-----------+ +// |... | // Metadata about the Data. +// |points |--+ +// +-----------+ | +// | +---------------------------+ +// | |DataPoint 1 | +// v |+------+------+ +------+ | +// +-----+ ||label |label |...|label | | +// | 1 |-->||value1|value2|...|valueN| | +// +-----+ |+------+------+ +------+ | +// | . | |+-----+ | +// | . | ||value| | +// | . | |+-----+ | +// | . | +---------------------------+ +// | . | . +// | . | . +// | . | . +// | . | +---------------------------+ +// | . | |DataPoint M | +// +-----+ |+------+------+ +------+ | +// | M |-->||label |label |...|label | | +// +-----+ ||value1|value2|...|valueN| | +// |+------+------+ +------+ | +// |+-----+ | +// ||value| | +// |+-----+ | +// +---------------------------+ +// +// Each distinct type of DataPoint represents the output of a specific +// aggregation function, the result of applying the DataPoint's +// associated function of to one or more measurements. +// +// All DataPoint types have three common fields: +// - Attributes includes key-value pairs associated with the data point +// - TimeUnixNano is required, set to the end time of the aggregation +// - StartTimeUnixNano is optional, but strongly encouraged for DataPoints +// having an AggregationTemporality field, as discussed below. +// +// Both TimeUnixNano and StartTimeUnixNano values are expressed as +// UNIX Epoch time in nanoseconds since 00:00:00 UTC on 1 January 1970. +// +// # TimeUnixNano +// +// This field is required, having consistent interpretation across +// DataPoint types. 
TimeUnixNano is the moment corresponding to when +// the data point's aggregate value was captured. +// +// Data points with the 0 value for TimeUnixNano SHOULD be rejected +// by consumers. +// +// # StartTimeUnixNano +// +// StartTimeUnixNano in general allows detecting when a sequence of +// observations is unbroken. This field indicates to consumers the +// start time for points with cumulative and delta +// AggregationTemporality, and it should be included whenever possible +// to support correct rate calculation. Although it may be omitted +// when the start time is truly unknown, setting StartTimeUnixNano is +// strongly encouraged. +message Metric { + reserved 4, 6, 8; + + // name of the metric. + string name = 1; + + // description of the metric, which can be used in documentation. + string description = 2; + + // unit in which the metric value is reported. Follows the format + // described by http://unitsofmeasure.org/ucum.html. + string unit = 3; + + // Data determines the aggregation type (if any) of the metric, what is the + // reported value type for the data points, as well as the relatationship to + // the time interval over which they are reported. + oneof data { + Gauge gauge = 5; + Sum sum = 7; + Histogram histogram = 9; + ExponentialHistogram exponential_histogram = 10; + Summary summary = 11; + } + + // Additional metadata attributes that describe the metric. [Optional]. + // Attributes are non-identifying. + // Consumers SHOULD NOT need to be aware of these attributes. + // These attributes MAY be used to encode information allowing + // for lossless roundtrip translation to / from another data model. + // Attribute keys MUST be unique (it is not allowed to have more than one + // attribute with the same key). + repeated opentelemetry.proto.common.v1.KeyValue metadata = 12; +} + +// Gauge represents the type of a scalar metric that always exports the +// "current value" for every data point. It should be used for an "unknown" +// aggregation. +// +// A Gauge does not support different aggregation temporalities. Given the +// aggregation is unknown, points cannot be combined using the same +// aggregation, regardless of aggregation temporalities. Therefore, +// AggregationTemporality is not included. Consequently, this also means +// "StartTimeUnixNano" is ignored for all data points. +message Gauge { + repeated NumberDataPoint data_points = 1; +} + +// Sum represents the type of a scalar metric that is calculated as a sum of all +// reported measurements over a time interval. +message Sum { + repeated NumberDataPoint data_points = 1; + + // aggregation_temporality describes if the aggregator reports delta changes + // since last report time, or cumulative changes since a fixed start time. + AggregationTemporality aggregation_temporality = 2; + + // If "true" means that the sum is monotonic. + bool is_monotonic = 3; +} + +// Histogram represents the type of a metric that is calculated by aggregating +// as a Histogram of all reported measurements over a time interval. +message Histogram { + repeated HistogramDataPoint data_points = 1; + + // aggregation_temporality describes if the aggregator reports delta changes + // since last report time, or cumulative changes since a fixed start time. + AggregationTemporality aggregation_temporality = 2; +} + +// ExponentialHistogram represents the type of a metric that is calculated by aggregating +// as a ExponentialHistogram of all reported double measurements over a time interval. 
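A hedged sketch of dispatching on the Metric.data oneof defined above, using the case enum that standard protoc C++ codegen produces for this file (accessor and constant names follow that convention; the header path is illustrative):

```cpp
#include <string>

#include "opentelemetry/proto/metrics/v1/metrics.pb.h"  // generated from the vendored metrics.proto (path illustrative)

namespace otel_metrics = opentelemetry::proto::metrics::v1;

// Maps the Metric.data oneof to a short label; DATA_NOT_SET corresponds to an "empty" metric.
std::string MetricDataType(const otel_metrics::Metric& metric) {
    switch (metric.data_case()) {
        case otel_metrics::Metric::kGauge:
            return "gauge";
        case otel_metrics::Metric::kSum:
            return metric.sum().is_monotonic() ? "monotonic_sum" : "sum";
        case otel_metrics::Metric::kHistogram:
            return "histogram";
        case otel_metrics::Metric::kExponentialHistogram:
            return "exponential_histogram";
        case otel_metrics::Metric::kSummary:
            return "summary";
        default:
            return "empty";
    }
}
```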
+message ExponentialHistogram { + repeated ExponentialHistogramDataPoint data_points = 1; + + // aggregation_temporality describes if the aggregator reports delta changes + // since last report time, or cumulative changes since a fixed start time. + AggregationTemporality aggregation_temporality = 2; +} + +// Summary metric data are used to convey quantile summaries, +// a Prometheus (see: https://prometheus.io/docs/concepts/metric_types/#summary) +// and OpenMetrics (see: https://github.com/OpenObservability/OpenMetrics/blob/4dbf6075567ab43296eed941037c12951faafb92/protos/prometheus.proto#L45) +// data type. These data points cannot always be merged in a meaningful way. +// While they can be useful in some applications, histogram data points are +// recommended for new applications. +// Summary metrics do not have an aggregation temporality field. This is +// because the count and sum fields of a SummaryDataPoint are assumed to be +// cumulative values. +message Summary { + repeated SummaryDataPoint data_points = 1; +} + +// AggregationTemporality defines how a metric aggregator reports aggregated +// values. It describes how those values relate to the time interval over +// which they are aggregated. +enum AggregationTemporality { + // UNSPECIFIED is the default AggregationTemporality, it MUST not be used. + AGGREGATION_TEMPORALITY_UNSPECIFIED = 0; + + // DELTA is an AggregationTemporality for a metric aggregator which reports + // changes since last report time. Successive metrics contain aggregation of + // values from continuous and non-overlapping intervals. + // + // The values for a DELTA metric are based only on the time interval + // associated with one measurement cycle. There is no dependency on + // previous measurements like is the case for CUMULATIVE metrics. + // + // For example, consider a system measuring the number of requests that + // it receives and reports the sum of these requests every second as a + // DELTA metric: + // + // 1. The system starts receiving at time=t_0. + // 2. A request is received, the system measures 1 request. + // 3. A request is received, the system measures 1 request. + // 4. A request is received, the system measures 1 request. + // 5. The 1 second collection cycle ends. A metric is exported for the + // number of requests received over the interval of time t_0 to + // t_0+1 with a value of 3. + // 6. A request is received, the system measures 1 request. + // 7. A request is received, the system measures 1 request. + // 8. The 1 second collection cycle ends. A metric is exported for the + // number of requests received over the interval of time t_0+1 to + // t_0+2 with a value of 2. + AGGREGATION_TEMPORALITY_DELTA = 1; + + // CUMULATIVE is an AggregationTemporality for a metric aggregator which + // reports changes since a fixed start time. This means that current values + // of a CUMULATIVE metric depend on all previous measurements since the + // start time. Because of this, the sender is required to retain this state + // in some form. If this state is lost or invalidated, the CUMULATIVE metric + // values MUST be reset and a new fixed start time following the last + // reported measurement time sent MUST be used. + // + // For example, consider a system measuring the number of requests that + // it receives and reports the sum of these requests every second as a + // CUMULATIVE metric: + // + // 1. The system starts receiving at time=t_0. + // 2. A request is received, the system measures 1 request. + // 3. 
A request is received, the system measures 1 request. + // 4. A request is received, the system measures 1 request. + // 5. The 1 second collection cycle ends. A metric is exported for the + // number of requests received over the interval of time t_0 to + // t_0+1 with a value of 3. + // 6. A request is received, the system measures 1 request. + // 7. A request is received, the system measures 1 request. + // 8. The 1 second collection cycle ends. A metric is exported for the + // number of requests received over the interval of time t_0 to + // t_0+2 with a value of 5. + // 9. The system experiences a fault and loses state. + // 10. The system recovers and resumes receiving at time=t_1. + // 11. A request is received, the system measures 1 request. + // 12. The 1 second collection cycle ends. A metric is exported for the + // number of requests received over the interval of time t_1 to + // t_0+1 with a value of 1. + // + // Note: Even though, when reporting changes since last report time, using + // CUMULATIVE is valid, it is not recommended. This may cause problems for + // systems that do not use start_time to determine when the aggregation + // value was reset (e.g. Prometheus). + AGGREGATION_TEMPORALITY_CUMULATIVE = 2; +} + +// DataPointFlags is defined as a protobuf 'uint32' type and is to be used as a +// bit-field representing 32 distinct boolean flags. Each flag defined in this +// enum is a bit-mask. To test the presence of a single flag in the flags of +// a data point, for example, use an expression like: +// +// (point.flags & DATA_POINT_FLAGS_NO_RECORDED_VALUE_MASK) == DATA_POINT_FLAGS_NO_RECORDED_VALUE_MASK +// +enum DataPointFlags { + // The zero value for the enum. Should not be used for comparisons. + // Instead use bitwise "and" with the appropriate mask as shown above. + DATA_POINT_FLAGS_DO_NOT_USE = 0; + + // This DataPoint is valid but has no recorded value. This value + // SHOULD be used to reflect explicitly missing data in a series, as + // for an equivalent to the Prometheus "staleness marker". + DATA_POINT_FLAGS_NO_RECORDED_VALUE_MASK = 1; + + // Bits 2-31 are reserved for future use. +} + +// NumberDataPoint is a single data point in a timeseries that describes the +// time-varying scalar value of a metric. +message NumberDataPoint { + reserved 1; + + // The set of key/value pairs that uniquely identify the timeseries from + // where this point belongs. The list may be empty (may contain 0 elements). + // Attribute keys MUST be unique (it is not allowed to have more than one + // attribute with the same key). + repeated opentelemetry.proto.common.v1.KeyValue attributes = 7; + + // StartTimeUnixNano is optional but strongly encouraged, see the + // the detailed comments above Metric. + // + // Value is UNIX Epoch time in nanoseconds since 00:00:00 UTC on 1 January + // 1970. + fixed64 start_time_unix_nano = 2; + + // TimeUnixNano is required, see the detailed comments above Metric. + // + // Value is UNIX Epoch time in nanoseconds since 00:00:00 UTC on 1 January + // 1970. + fixed64 time_unix_nano = 3; + + // The value itself. A point is considered invalid when one of the recognized + // value fields is not present inside this oneof. + oneof value { + double as_double = 4; + sfixed64 as_int = 6; + } + + // (Optional) List of exemplars collected from + // measurements that were used to form the data point + repeated Exemplar exemplars = 5; + + // Flags that apply to this specific data point. See DataPointFlags + // for the available flags and their meaning. 
+ uint32 flags = 8; +} + +// HistogramDataPoint is a single data point in a timeseries that describes the +// time-varying values of a Histogram. A Histogram contains summary statistics +// for a population of values, it may optionally contain the distribution of +// those values across a set of buckets. +// +// If the histogram contains the distribution of values, then both +// "explicit_bounds" and "bucket counts" fields must be defined. +// If the histogram does not contain the distribution of values, then both +// "explicit_bounds" and "bucket_counts" must be omitted and only "count" and +// "sum" are known. +message HistogramDataPoint { + reserved 1; + + // The set of key/value pairs that uniquely identify the timeseries from + // where this point belongs. The list may be empty (may contain 0 elements). + // Attribute keys MUST be unique (it is not allowed to have more than one + // attribute with the same key). + repeated opentelemetry.proto.common.v1.KeyValue attributes = 9; + + // StartTimeUnixNano is optional but strongly encouraged, see the + // the detailed comments above Metric. + // + // Value is UNIX Epoch time in nanoseconds since 00:00:00 UTC on 1 January + // 1970. + fixed64 start_time_unix_nano = 2; + + // TimeUnixNano is required, see the detailed comments above Metric. + // + // Value is UNIX Epoch time in nanoseconds since 00:00:00 UTC on 1 January + // 1970. + fixed64 time_unix_nano = 3; + + // count is the number of values in the population. Must be non-negative. This + // value must be equal to the sum of the "count" fields in buckets if a + // histogram is provided. + fixed64 count = 4; + + // sum of the values in the population. If count is zero then this field + // must be zero. + // + // Note: Sum should only be filled out when measuring non-negative discrete + // events, and is assumed to be monotonic over the values of these events. + // Negative events *can* be recorded, but sum should not be filled out when + // doing so. This is specifically to enforce compatibility w/ OpenMetrics, + // see: https://github.com/prometheus/OpenMetrics/blob/v1.0.0/specification/OpenMetrics.md#histogram + optional double sum = 5; + + // bucket_counts is an optional field contains the count values of histogram + // for each bucket. + // + // The sum of the bucket_counts must equal the value in the count field. + // + // The number of elements in bucket_counts array must be by one greater than + // the number of elements in explicit_bounds array. + repeated fixed64 bucket_counts = 6; + + // explicit_bounds specifies buckets with explicitly defined bounds for values. + // + // The boundaries for bucket at index i are: + // + // (-infinity, explicit_bounds[i]] for i == 0 + // (explicit_bounds[i-1], explicit_bounds[i]] for 0 < i < size(explicit_bounds) + // (explicit_bounds[i-1], +infinity) for i == size(explicit_bounds) + // + // The values in the explicit_bounds array must be strictly increasing. + // + // Histogram buckets are inclusive of their upper boundary, except the last + // bucket where the boundary is at infinity. This format is intentionally + // compatible with the OpenMetrics histogram definition. + repeated double explicit_bounds = 7; + + // (Optional) List of exemplars collected from + // measurements that were used to form the data point + repeated Exemplar exemplars = 8; + + // Flags that apply to this specific data point. See DataPointFlags + // for the available flags and their meaning. 
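To make the explicit_bounds boundary rules above concrete, a small sketch that maps a raw value to its bucket index (upper-inclusive buckets, overflow bucket at index size(explicit_bounds)); this is plain arithmetic and does not depend on the generated classes:

```cpp
#include <algorithm>
#include <cstddef>
#include <vector>

// Returns the bucket index for `value` given strictly increasing explicit bounds.
// Bucket i is (bounds[i-1], bounds[i]] and the last bucket (index bounds.size())
// is (bounds.back(), +infinity), matching the HistogramDataPoint comment above.
std::size_t ExplicitBoundsBucketIndex(double value, const std::vector<double>& bounds) {
    // First bound that is >= value; equality stays in the lower bucket because
    // the upper boundary is inclusive.
    return static_cast<std::size_t>(std::lower_bound(bounds.begin(), bounds.end(), value) - bounds.begin());
}
```

For example, with bounds {1, 5, 10}: value 5 lands in bucket 1 ((1, 5]), value 5.1 in bucket 2 ((5, 10]), and value 42 in the overflow bucket 3; bucket_counts must therefore carry bounds.size() + 1 entries.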
+ uint32 flags = 10; + + // min is the minimum value over (start_time, end_time]. + optional double min = 11; + + // max is the maximum value over (start_time, end_time]. + optional double max = 12; +} + +// ExponentialHistogramDataPoint is a single data point in a timeseries that describes the +// time-varying values of a ExponentialHistogram of double values. A ExponentialHistogram contains +// summary statistics for a population of values, it may optionally contain the +// distribution of those values across a set of buckets. +// +message ExponentialHistogramDataPoint { + // The set of key/value pairs that uniquely identify the timeseries from + // where this point belongs. The list may be empty (may contain 0 elements). + // Attribute keys MUST be unique (it is not allowed to have more than one + // attribute with the same key). + repeated opentelemetry.proto.common.v1.KeyValue attributes = 1; + + // StartTimeUnixNano is optional but strongly encouraged, see the + // the detailed comments above Metric. + // + // Value is UNIX Epoch time in nanoseconds since 00:00:00 UTC on 1 January + // 1970. + fixed64 start_time_unix_nano = 2; + + // TimeUnixNano is required, see the detailed comments above Metric. + // + // Value is UNIX Epoch time in nanoseconds since 00:00:00 UTC on 1 January + // 1970. + fixed64 time_unix_nano = 3; + + // count is the number of values in the population. Must be + // non-negative. This value must be equal to the sum of the "bucket_counts" + // values in the positive and negative Buckets plus the "zero_count" field. + fixed64 count = 4; + + // sum of the values in the population. If count is zero then this field + // must be zero. + // + // Note: Sum should only be filled out when measuring non-negative discrete + // events, and is assumed to be monotonic over the values of these events. + // Negative events *can* be recorded, but sum should not be filled out when + // doing so. This is specifically to enforce compatibility w/ OpenMetrics, + // see: https://github.com/prometheus/OpenMetrics/blob/v1.0.0/specification/OpenMetrics.md#histogram + optional double sum = 5; + + // scale describes the resolution of the histogram. Boundaries are + // located at powers of the base, where: + // + // base = (2^(2^-scale)) + // + // The histogram bucket identified by `index`, a signed integer, + // contains values that are greater than (base^index) and + // less than or equal to (base^(index+1)). + // + // The positive and negative ranges of the histogram are expressed + // separately. Negative values are mapped by their absolute value + // into the negative range using the same scale as the positive range. + // + // scale is not restricted by the protocol, as the permissible + // values depend on the range of the data. + sint32 scale = 6; + + // zero_count is the count of values that are either exactly zero or + // within the region considered zero by the instrumentation at the + // tolerated degree of precision. This bucket stores values that + // cannot be expressed using the standard exponential formula as + // well as values that have been rounded to zero. + // + // Implementations MAY consider the zero bucket to have probability + // mass equal to (zero_count / count). + fixed64 zero_count = 7; + + // positive carries the positive range of exponential bucket counts. + Buckets positive = 8; + + // negative carries the negative range of exponential bucket counts. + Buckets negative = 9; + + // Buckets are a set of bucket counts, encoded in a contiguous array + // of counts. 
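To make the scale/base arithmetic above concrete, a hedged sketch that computes one bucket's boundaries from scale, Buckets.offset, and the position within bucket_counts (pure math, no generated code required):

```cpp
#include <cmath>
#include <cstdint>
#include <utility>

// Returns the (lower, upper] boundaries of the bucket stored at bucket_counts[i],
// following the formulas in the comments above:
//   base  = 2^(2^-scale)
//   lower = base^(offset + i), upper = base^(offset + i + 1)
std::pair<double, double> ExponentialBucketBounds(int32_t scale, int32_t offset, int32_t i) {
    const double base = std::exp2(std::exp2(-static_cast<double>(scale)));
    const double lower = std::pow(base, static_cast<double>(offset + i));
    const double upper = std::pow(base, static_cast<double>(offset + i + 1));
    return {lower, upper};
}
```

With scale = 0 the base is 2, so offset = 0 and i = 3 gives the bucket (8, 16]; larger scales shrink the buckets, smaller (negative) scales widen them.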
+ message Buckets { + // Offset is the bucket index of the first entry in the bucket_counts array. + // + // Note: This uses a varint encoding as a simple form of compression. + sint32 offset = 1; + + // bucket_counts is an array of count values, where bucket_counts[i] carries + // the count of the bucket at index (offset+i). bucket_counts[i] is the count + // of values greater than base^(offset+i) and less than or equal to + // base^(offset+i+1). + // + // Note: By contrast, the explicit HistogramDataPoint uses + // fixed64. This field is expected to have many buckets, + // especially zeros, so uint64 has been selected to ensure + // varint encoding. + repeated uint64 bucket_counts = 2; + } + + // Flags that apply to this specific data point. See DataPointFlags + // for the available flags and their meaning. + uint32 flags = 10; + + // (Optional) List of exemplars collected from + // measurements that were used to form the data point + repeated Exemplar exemplars = 11; + + // min is the minimum value over (start_time, end_time]. + optional double min = 12; + + // max is the maximum value over (start_time, end_time]. + optional double max = 13; + + // ZeroThreshold may be optionally set to convey the width of the zero + // region. Where the zero region is defined as the closed interval + // [-ZeroThreshold, ZeroThreshold]. + // When ZeroThreshold is 0, zero count bucket stores values that cannot be + // expressed using the standard exponential formula as well as values that + // have been rounded to zero. + double zero_threshold = 14; +} + +// SummaryDataPoint is a single data point in a timeseries that describes the +// time-varying values of a Summary metric. The count and sum fields represent +// cumulative values. +message SummaryDataPoint { + reserved 1; + + // The set of key/value pairs that uniquely identify the timeseries from + // where this point belongs. The list may be empty (may contain 0 elements). + // Attribute keys MUST be unique (it is not allowed to have more than one + // attribute with the same key). + repeated opentelemetry.proto.common.v1.KeyValue attributes = 7; + + // StartTimeUnixNano is optional but strongly encouraged, see the + // the detailed comments above Metric. + // + // Value is UNIX Epoch time in nanoseconds since 00:00:00 UTC on 1 January + // 1970. + fixed64 start_time_unix_nano = 2; + + // TimeUnixNano is required, see the detailed comments above Metric. + // + // Value is UNIX Epoch time in nanoseconds since 00:00:00 UTC on 1 January + // 1970. + fixed64 time_unix_nano = 3; + + // count is the number of values in the population. Must be non-negative. + fixed64 count = 4; + + // sum of the values in the population. If count is zero then this field + // must be zero. + // + // Note: Sum should only be filled out when measuring non-negative discrete + // events, and is assumed to be monotonic over the values of these events. + // Negative events *can* be recorded, but sum should not be filled out when + // doing so. This is specifically to enforce compatibility w/ OpenMetrics, + // see: https://github.com/prometheus/OpenMetrics/blob/v1.0.0/specification/OpenMetrics.md#summary + double sum = 5; + + // Represents the value at a given quantile of a distribution. + // + // To record Min and Max values following conventions are used: + // - The 1.0 quantile is equivalent to the maximum value observed. + // - The 0.0 quantile is equivalent to the minimum value observed. 
+ // + // See the following issue for more context: + // https://github.com/open-telemetry/opentelemetry-proto/issues/125 + message ValueAtQuantile { + // The quantile of a distribution. Must be in the interval + // [0.0, 1.0]. + double quantile = 1; + + // The value at the given quantile of a distribution. + // + // Quantile values must NOT be negative. + double value = 2; + } + + // (Optional) list of values at different quantiles of the distribution calculated + // from the current snapshot. The quantiles must be strictly increasing. + repeated ValueAtQuantile quantile_values = 6; + + // Flags that apply to this specific data point. See DataPointFlags + // for the available flags and their meaning. + uint32 flags = 8; +} + +// A representation of an exemplar, which is a sample input measurement. +// Exemplars also hold information about the environment when the measurement +// was recorded, for example the span and trace ID of the active span when the +// exemplar was recorded. +message Exemplar { + reserved 1; + + // The set of key/value pairs that were filtered out by the aggregator, but + // recorded alongside the original measurement. Only key/value pairs that were + // filtered out by the aggregator should be included + repeated opentelemetry.proto.common.v1.KeyValue filtered_attributes = 7; + + // time_unix_nano is the exact time when this exemplar was recorded + // + // Value is UNIX Epoch time in nanoseconds since 00:00:00 UTC on 1 January + // 1970. + fixed64 time_unix_nano = 2; + + // The value of the measurement that was recorded. An exemplar is + // considered invalid when one of the recognized value fields is not present + // inside this oneof. + oneof value { + double as_double = 3; + sfixed64 as_int = 6; + } + + // (Optional) Span ID of the exemplar trace. + // span_id may be missing if the measurement is not recorded inside a trace + // or if the trace is not sampled. + bytes span_id = 4; + + // (Optional) Trace ID of the exemplar trace. + // trace_id may be missing if the measurement is not recorded inside a trace + // or if the trace is not sampled. + bytes trace_id = 5; +} diff --git a/core/protobuf/opentelemetry/proto/resource/v1/resource.proto b/core/protobuf/opentelemetry/proto/resource/v1/resource.proto new file mode 100644 index 0000000000..6637560bc3 --- /dev/null +++ b/core/protobuf/opentelemetry/proto/resource/v1/resource.proto @@ -0,0 +1,37 @@ +// Copyright 2019, OpenTelemetry Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto3"; + +package opentelemetry.proto.resource.v1; + +import "opentelemetry/proto/common/v1/common.proto"; + +option csharp_namespace = "OpenTelemetry.Proto.Resource.V1"; +option java_multiple_files = true; +option java_package = "io.opentelemetry.proto.resource.v1"; +option java_outer_classname = "ResourceProto"; +option go_package = "go.opentelemetry.io/proto/otlp/resource/v1"; + +// Resource information. +message Resource { + // Set of attributes that describe the resource. 
+ // Attribute keys MUST be unique (it is not allowed to have more than one + // attribute with the same key). + repeated opentelemetry.proto.common.v1.KeyValue attributes = 1; + + // dropped_attributes_count is the number of dropped attributes. If the value is 0, then + // no attributes were dropped. + uint32 dropped_attributes_count = 2; +} diff --git a/core/protobuf/opentelemetry/proto/trace/v1/trace.proto b/core/protobuf/opentelemetry/proto/trace/v1/trace.proto new file mode 100644 index 0000000000..24442853ed --- /dev/null +++ b/core/protobuf/opentelemetry/proto/trace/v1/trace.proto @@ -0,0 +1,357 @@ +// Copyright 2019, OpenTelemetry Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto3"; + +package opentelemetry.proto.trace.v1; + +import "opentelemetry/proto/common/v1/common.proto"; +import "opentelemetry/proto/resource/v1/resource.proto"; + +option csharp_namespace = "OpenTelemetry.Proto.Trace.V1"; +option java_multiple_files = true; +option java_package = "io.opentelemetry.proto.trace.v1"; +option java_outer_classname = "TraceProto"; +option go_package = "go.opentelemetry.io/proto/otlp/trace/v1"; + +// TracesData represents the traces data that can be stored in a persistent storage, +// OR can be embedded by other protocols that transfer OTLP traces data but do +// not implement the OTLP protocol. +// +// The main difference between this message and collector protocol is that +// in this message there will not be any "control" or "metadata" specific to +// OTLP protocol. +// +// When new fields are added into this message, the OTLP request MUST be updated +// as well. +message TracesData { + // An array of ResourceSpans. + // For data coming from a single resource this array will typically contain + // one element. Intermediary nodes that receive data from multiple origins + // typically batch the data before forwarding further and in that case this + // array will contain multiple elements. + repeated ResourceSpans resource_spans = 1; +} + +// A collection of ScopeSpans from a Resource. +message ResourceSpans { + reserved 1000; + + // The resource for the spans in this message. + // If this field is not set then no resource info is known. + opentelemetry.proto.resource.v1.Resource resource = 1; + + // A list of ScopeSpans that originate from a resource. + repeated ScopeSpans scope_spans = 2; + + // The Schema URL, if known. This is the identifier of the Schema that the resource data + // is recorded in. Notably, the last part of the URL path is the version number of the + // schema: http[s]://server[:port]/path/. To learn more about Schema URL see + // https://opentelemetry.io/docs/specs/otel/schemas/#schema-url + // This schema_url applies to the data in the "resource" field. It does not apply + // to the data in the "scope_spans" field which have their own schema_url field. + string schema_url = 3; +} + +// A collection of Spans produced by an InstrumentationScope. 
+message ScopeSpans { + // The instrumentation scope information for the spans in this message. + // Semantically when InstrumentationScope isn't set, it is equivalent with + // an empty instrumentation scope name (unknown). + opentelemetry.proto.common.v1.InstrumentationScope scope = 1; + + // A list of Spans that originate from an instrumentation scope. + repeated Span spans = 2; + + // The Schema URL, if known. This is the identifier of the Schema that the span data + // is recorded in. Notably, the last part of the URL path is the version number of the + // schema: http[s]://server[:port]/path/. To learn more about Schema URL see + // https://opentelemetry.io/docs/specs/otel/schemas/#schema-url + // This schema_url applies to all spans and span events in the "spans" field. + string schema_url = 3; +} + +// A Span represents a single operation performed by a single component of the system. +// +// The next available field id is 17. +message Span { + // A unique identifier for a trace. All spans from the same trace share + // the same `trace_id`. The ID is a 16-byte array. An ID with all zeroes OR + // of length other than 16 bytes is considered invalid (empty string in OTLP/JSON + // is zero-length and thus is also invalid). + // + // This field is required. + bytes trace_id = 1; + + // A unique identifier for a span within a trace, assigned when the span + // is created. The ID is an 8-byte array. An ID with all zeroes OR of length + // other than 8 bytes is considered invalid (empty string in OTLP/JSON + // is zero-length and thus is also invalid). + // + // This field is required. + bytes span_id = 2; + + // trace_state conveys information about request position in multiple distributed tracing graphs. + // It is a trace_state in w3c-trace-context format: https://www.w3.org/TR/trace-context/#tracestate-header + // See also https://github.com/w3c/distributed-tracing for more details about this field. + string trace_state = 3; + + // The `span_id` of this span's parent span. If this is a root span, then this + // field must be empty. The ID is an 8-byte array. + bytes parent_span_id = 4; + + // Flags, a bit field. + // + // Bits 0-7 (8 least significant bits) are the trace flags as defined in W3C Trace + // Context specification. To read the 8-bit W3C trace flag, use + // `flags & SPAN_FLAGS_TRACE_FLAGS_MASK`. + // + // See https://www.w3.org/TR/trace-context-2/#trace-flags for the flag definitions. + // + // Bits 8 and 9 represent the 3 states of whether a span's parent + // is remote. The states are (unknown, is not remote, is remote). + // To read whether the value is known, use `(flags & SPAN_FLAGS_CONTEXT_HAS_IS_REMOTE_MASK) != 0`. + // To read whether the span is remote, use `(flags & SPAN_FLAGS_CONTEXT_IS_REMOTE_MASK) != 0`. + // + // When creating span messages, if the message is logically forwarded from another source + // with an equivalent flags fields (i.e., usually another OTLP span message), the field SHOULD + // be copied as-is. If creating from a source that does not have an equivalent flags field + // (such as a runtime representation of an OpenTelemetry span), the high 22 bits MUST + // be set to zero. + // Readers MUST NOT assume that bits 10-31 (22 most significant bits) will be zero. + // + // [Optional]. + fixed32 flags = 16; + + // A description of the span's operation. + // + // For example, the name can be a qualified method name or a file name + // and a line number where the operation is called. 
A best practice is to use + // the same display name at the same call point in an application. + // This makes it easier to correlate spans in different traces. + // + // This field is semantically required to be set to non-empty string. + // Empty value is equivalent to an unknown span name. + // + // This field is required. + string name = 5; + + // SpanKind is the type of span. Can be used to specify additional relationships between spans + // in addition to a parent/child relationship. + enum SpanKind { + // Unspecified. Do NOT use as default. + // Implementations MAY assume SpanKind to be INTERNAL when receiving UNSPECIFIED. + SPAN_KIND_UNSPECIFIED = 0; + + // Indicates that the span represents an internal operation within an application, + // as opposed to an operation happening at the boundaries. Default value. + SPAN_KIND_INTERNAL = 1; + + // Indicates that the span covers server-side handling of an RPC or other + // remote network request. + SPAN_KIND_SERVER = 2; + + // Indicates that the span describes a request to some remote service. + SPAN_KIND_CLIENT = 3; + + // Indicates that the span describes a producer sending a message to a broker. + // Unlike CLIENT and SERVER, there is often no direct critical path latency relationship + // between producer and consumer spans. A PRODUCER span ends when the message was accepted + // by the broker while the logical processing of the message might span a much longer time. + SPAN_KIND_PRODUCER = 4; + + // Indicates that the span describes consumer receiving a message from a broker. + // Like the PRODUCER kind, there is often no direct critical path latency relationship + // between producer and consumer spans. + SPAN_KIND_CONSUMER = 5; + } + + // Distinguishes between spans generated in a particular context. For example, + // two spans with the same name may be distinguished using `CLIENT` (caller) + // and `SERVER` (callee) to identify queueing latency associated with the span. + SpanKind kind = 6; + + // start_time_unix_nano is the start time of the span. On the client side, this is the time + // kept by the local machine where the span execution starts. On the server side, this + // is the time when the server's application handler starts running. + // Value is UNIX Epoch time in nanoseconds since 00:00:00 UTC on 1 January 1970. + // + // This field is semantically required and it is expected that end_time >= start_time. + fixed64 start_time_unix_nano = 7; + + // end_time_unix_nano is the end time of the span. On the client side, this is the time + // kept by the local machine where the span execution ends. On the server side, this + // is the time when the server application handler stops running. + // Value is UNIX Epoch time in nanoseconds since 00:00:00 UTC on 1 January 1970. + // + // This field is semantically required and it is expected that end_time >= start_time. + fixed64 end_time_unix_nano = 8; + + // attributes is a collection of key/value pairs. Note, global attributes + // like server name can be set using the resource API. 
Examples of attributes: + // + // "/http/user_agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36" + // "/http/server_latency": 300 + // "example.com/myattribute": true + // "example.com/score": 10.239 + // + // The OpenTelemetry API specification further restricts the allowed value types: + // https://github.com/open-telemetry/opentelemetry-specification/blob/main/specification/common/README.md#attribute + // Attribute keys MUST be unique (it is not allowed to have more than one + // attribute with the same key). + repeated opentelemetry.proto.common.v1.KeyValue attributes = 9; + + // dropped_attributes_count is the number of attributes that were discarded. Attributes + // can be discarded because their keys are too long or because there are too many + // attributes. If this value is 0, then no attributes were dropped. + uint32 dropped_attributes_count = 10; + + // Event is a time-stamped annotation of the span, consisting of user-supplied + // text description and key-value pairs. + message Event { + // time_unix_nano is the time the event occurred. + fixed64 time_unix_nano = 1; + + // name of the event. + // This field is semantically required to be set to non-empty string. + string name = 2; + + // attributes is a collection of attribute key/value pairs on the event. + // Attribute keys MUST be unique (it is not allowed to have more than one + // attribute with the same key). + repeated opentelemetry.proto.common.v1.KeyValue attributes = 3; + + // dropped_attributes_count is the number of dropped attributes. If the value is 0, + // then no attributes were dropped. + uint32 dropped_attributes_count = 4; + } + + // events is a collection of Event items. + repeated Event events = 11; + + // dropped_events_count is the number of dropped events. If the value is 0, then no + // events were dropped. + uint32 dropped_events_count = 12; + + // A pointer from the current span to another span in the same trace or in a + // different trace. For example, this can be used in batching operations, + // where a single batch handler processes multiple requests from different + // traces or when the handler receives a request from a different project. + message Link { + // A unique identifier of a trace that this linked span is part of. The ID is a + // 16-byte array. + bytes trace_id = 1; + + // A unique identifier for the linked span. The ID is an 8-byte array. + bytes span_id = 2; + + // The trace_state associated with the link. + string trace_state = 3; + + // attributes is a collection of attribute key/value pairs on the link. + // Attribute keys MUST be unique (it is not allowed to have more than one + // attribute with the same key). + repeated opentelemetry.proto.common.v1.KeyValue attributes = 4; + + // dropped_attributes_count is the number of dropped attributes. If the value is 0, + // then no attributes were dropped. + uint32 dropped_attributes_count = 5; + + // Flags, a bit field. + // + // Bits 0-7 (8 least significant bits) are the trace flags as defined in W3C Trace + // Context specification. To read the 8-bit W3C trace flag, use + // `flags & SPAN_FLAGS_TRACE_FLAGS_MASK`. + // + // See https://www.w3.org/TR/trace-context-2/#trace-flags for the flag definitions. + // + // Bits 8 and 9 represent the 3 states of whether the link is remote. + // The states are (unknown, is not remote, is remote). + // To read whether the value is known, use `(flags & SPAN_FLAGS_CONTEXT_HAS_IS_REMOTE_MASK) != 0`. 
+ // To read whether the link is remote, use `(flags & SPAN_FLAGS_CONTEXT_IS_REMOTE_MASK) != 0`. + // + // Readers MUST NOT assume that bits 10-31 (22 most significant bits) will be zero. + // When creating new spans, bits 10-31 (most-significant 22-bits) MUST be zero. + // + // [Optional]. + fixed32 flags = 6; + } + + // links is a collection of Links, which are references from this span to a span + // in the same or different trace. + repeated Link links = 13; + + // dropped_links_count is the number of dropped links after the maximum size was + // enforced. If this value is 0, then no links were dropped. + uint32 dropped_links_count = 14; + + // An optional final status for this span. Semantically when Status isn't set, it means + // span's status code is unset, i.e. assume STATUS_CODE_UNSET (code = 0). + Status status = 15; +} + +// The Status type defines a logical error model that is suitable for different +// programming environments, including REST APIs and RPC APIs. +message Status { + reserved 1; + + // A developer-facing human readable error message. + string message = 2; + + // For the semantics of status codes see + // https://github.com/open-telemetry/opentelemetry-specification/blob/main/specification/trace/api.md#set-status + enum StatusCode { + // The default status. + STATUS_CODE_UNSET = 0; + // The Span has been validated by an Application developer or Operator to + // have completed successfully. + STATUS_CODE_OK = 1; + // The Span contains an error. + STATUS_CODE_ERROR = 2; + }; + + // The status code. + StatusCode code = 3; +} + +// SpanFlags represents constants used to interpret the +// Span.flags field, which is protobuf 'fixed32' type and is to +// be used as bit-fields. Each non-zero value defined in this enum is +// a bit-mask. To extract the bit-field, for example, use an +// expression like: +// +// (span.flags & SPAN_FLAGS_TRACE_FLAGS_MASK) +// +// See https://www.w3.org/TR/trace-context-2/#trace-flags for the flag definitions. +// +// Note that Span flags were introduced in version 1.1 of the +// OpenTelemetry protocol. Older Span producers do not set this +// field, consequently consumers should not rely on the absence of a +// particular flag bit to indicate the presence of a particular feature. +enum SpanFlags { + // The zero value for the enum. Should not be used for comparisons. + // Instead use bitwise "and" with the appropriate mask as shown above. + SPAN_FLAGS_DO_NOT_USE = 0; + + // Bits 0-7 are used for trace flags. + SPAN_FLAGS_TRACE_FLAGS_MASK = 0x000000FF; + + // Bits 8 and 9 are used to indicate that the parent span or link span is remote. + // Bit 8 (`HAS_IS_REMOTE`) indicates whether the value is known. + // Bit 9 (`IS_REMOTE`) indicates whether the span or link is remote. + SPAN_FLAGS_CONTEXT_HAS_IS_REMOTE_MASK = 0x00000100; + SPAN_FLAGS_CONTEXT_IS_REMOTE_MASK = 0x00000200; + + // Bits 10-31 are reserved for future use. 
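A small sketch of reading these bit-fields as the comments above prescribe: the low-byte W3C trace flags plus the tri-state "is remote" signal. The mask values are restated literally so the sketch stands alone without the generated enum.

```cpp
#include <cstdint>
#include <optional>

constexpr uint32_t kTraceFlagsMask = 0x000000FF;   // SPAN_FLAGS_TRACE_FLAGS_MASK
constexpr uint32_t kHasIsRemoteMask = 0x00000100;  // SPAN_FLAGS_CONTEXT_HAS_IS_REMOTE_MASK
constexpr uint32_t kIsRemoteMask = 0x00000200;     // SPAN_FLAGS_CONTEXT_IS_REMOTE_MASK

// 8-bit W3C trace flags carried in the low byte of Span.flags / Link.flags.
inline uint8_t W3CTraceFlags(uint32_t flags) {
    return static_cast<uint8_t>(flags & kTraceFlagsMask);
}

// Tri-state "parent/link is remote": nullopt when the producer did not record it.
inline std::optional<bool> IsRemote(uint32_t flags) {
    if ((flags & kHasIsRemoteMask) == 0) {
        return std::nullopt;
    }
    return (flags & kIsRemoteMask) != 0;
}
```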
+}

diff --git a/core/runner/FlusherRunner.cpp b/core/runner/FlusherRunner.cpp
index f1f04063fe..07ef40bf63 100644
--- a/core/runner/FlusherRunner.cpp
+++ b/core/runner/FlusherRunner.cpp
@@ -29,6 +29,10 @@
 #include "monitor/AlarmManager.h"
 #include "plugin/flusher/sls/DiskBufferWriter.h"
 #include "runner/sink/http/HttpSink.h"
+#if defined(__linux__) && !defined(__ANDROID__)
+#include "plugin/flusher/opentelemetry/FlusherOTLPNative.h"
+#include "runner/sink/grpc/GrpcSink.h"
+#endif
 
 DEFINE_FLAG_INT32(flusher_runner_exit_timeout_sec, "", 60);
 
@@ -114,6 +118,53 @@ void FlusherRunner::DecreaseHttpSendingCnt() {
     SenderQueueManager::GetInstance()->Trigger();
 }
 
+#if defined(__linux__) && !defined(__ANDROID__)
+bool FlusherRunner::PushToGrpcSink(SenderQueueItem* item, bool withLimit) {
+    while (withLimit && !Application::GetInstance()->IsExiting()
+           && GrpcSink::GetInstance()->GetInFlightCount()
+               >= AppConfig::GetInstance()->GetSendRequestGlobalConcurrency()) {
+        this_thread::sleep_for(chrono::milliseconds(10));
+    }
+
+    unique_ptr<OTLPGrpcCallContext> ctx;
+    bool keepItem = false;
+    string errMsg;
+    if (!static_cast<FlusherOTLPNative*>(item->mFlusher)->BuildGrpcRequest(item, ctx, &keepItem, &errMsg)) {
+        if (keepItem
+            && chrono::duration_cast<chrono::seconds>(chrono::system_clock::now() - item->mFirstEnqueTime).count()
+                < INT32_FLAG(discard_send_fail_interval)) {
+            item->mStatus = SendingStatus::IDLE;
+            ++item->mTryCnt;
+            // Exponential backoff: start at 100ms, double per additional retry, cap at 10s.
+            const int64_t kInitialBackoffMs = 100;
+            const int64_t kMaxBackoffMs = 10000;
+            int64_t shift = std::max(0U, std::min(item->mTryCnt - 2, 7U));
+            int64_t backoffMs = std::min(kInitialBackoffMs * (1 << shift), kMaxBackoffMs);
+            auto now = chrono::system_clock::now();
+            item->mQuickFailNextRetryTime = now + chrono::milliseconds(backoffMs);
+            LOG_DEBUG(sLogger,
+                      ("failed to build grpc request", "retry later")("item address", item)(
+                          "config-flusher-dst", QueueKeyManager::GetInstance()->GetName(item->mQueueKey))(
+                          "errMsg", errMsg)("tryCnt", item->mTryCnt)("backoffMs", backoffMs));
+            SenderQueueManager::GetInstance()->DecreaseConcurrencyLimiterInSendingCnt(item->mQueueKey);
+        } else {
+            LOG_WARNING(
+                sLogger,
+                ("failed to build grpc request", "discard item")("item address", item)(
+                    "config-flusher-dst", QueueKeyManager::GetInstance()->GetName(item->mQueueKey))("errMsg", errMsg));
+            SenderQueueManager::GetInstance()->DecreaseConcurrencyLimiterInSendingCnt(item->mQueueKey);
+            SenderQueueManager::GetInstance()->RemoveItem(item->mQueueKey, item);
+        }
+        return false;
+    }
+
+    LOG_TRACE(sLogger,
+              ("send item to grpc sink, item address", item)(
+                  "config-flusher-dst", QueueKeyManager::GetInstance()->GetName(item->mQueueKey)));
+    GrpcSink::GetInstance()->AddRequest(std::move(ctx));
+    return true;
+}
+#endif
+
 bool FlusherRunner::PushToHttpSink(SenderQueueItem* item, bool withLimit) {
     // TODO: use semaphore instead
     while (withLimit && !Application::GetInstance()->IsExiting()
@@ -228,6 +279,13 @@ bool FlusherRunner::Dispatch(SenderQueueItem* item) {
         } else {
             return PushToHttpSink(item);
         }
+#if defined(__linux__) && !defined(__ANDROID__)
+        case SinkType::GRPC:
+            // TODO(TomYu): add a shutdown bypass like flusher_sls (DiskBufferWriter) so that
+            // GRPC flusher doesn't block FlusherRunner::Stop() (60s timeout) when
+            // SenderQueue has a large backlog during process exit.
+ return PushToGrpcSink(item); +#endif default: SenderQueueManager::GetInstance()->RemoveItem(item->mQueueKey, item); return false; diff --git a/core/runner/FlusherRunner.h b/core/runner/FlusherRunner.h index 81e2657fc8..648f391a55 100644 --- a/core/runner/FlusherRunner.h +++ b/core/runner/FlusherRunner.h @@ -45,6 +45,9 @@ class FlusherRunner { // TODO: should be private bool PushToHttpSink(SenderQueueItem* item, bool withLimit = true); +#if defined(__linux__) && !defined(__ANDROID__) + bool PushToGrpcSink(SenderQueueItem* item, bool withLimit = true); +#endif int32_t GetSendingBufferCount() { return mHttpSendingCnt.load(); } diff --git a/core/runner/ProcessorRunner.cpp b/core/runner/ProcessorRunner.cpp index 7c5baae77a..3855d598a6 100644 --- a/core/runner/ProcessorRunner.cpp +++ b/core/runner/ProcessorRunner.cpp @@ -142,6 +142,11 @@ void ProcessorRunner::Run(uint32_t threadNo) { // there are multiple inputs pipeline->Process(eventGroupList, item->mInputIndex); + LOG_INFO(sLogger, + ("ProcessorRunner after Process", "debug")("config", configName)("eventCount", + eventGroupList[0].GetEvents().size())( + "isGoPipeline", pipeline->IsFlushingThroughGoPipeline())("isLog", isLog)); + if (pipeline->IsFlushingThroughGoPipeline()) { // TODO: // 1. allow all event types to be sent to Go pipelines diff --git a/core/runner/sink/SinkType.h b/core/runner/sink/SinkType.h index b6ce4f74ff..475507f284 100644 --- a/core/runner/sink/SinkType.h +++ b/core/runner/sink/SinkType.h @@ -18,6 +18,6 @@ namespace logtail { -enum class SinkType { HTTP, NONE }; +enum class SinkType { HTTP, GRPC, NONE }; } // namespace logtail diff --git a/core/runner/sink/grpc/GrpcSink.cpp b/core/runner/sink/grpc/GrpcSink.cpp new file mode 100644 index 0000000000..43c5869234 --- /dev/null +++ b/core/runner/sink/grpc/GrpcSink.cpp @@ -0,0 +1,155 @@ +// Copyright 2025 iLogtail Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#include "runner/sink/grpc/GrpcSink.h"
+
+#include "collection_pipeline/queue/SenderQueueItem.h"
+#include "common/Flags.h"
+#include "logger/Logger.h"
+#include "monitor/metric_constants/MetricConstants.h"
+#include "plugin/flusher/opentelemetry/FlusherOTLPNative.h"
+#ifdef APSARA_UNIT_TEST_MAIN
+#include "unittest/pipeline/GrpcSinkMock.h"
+#endif
+
+DEFINE_FLAG_INT32(grpc_sink_exit_timeout_sec, "grpc sink exit timeout, seconds", 5);
+
+using namespace std;
+
+namespace logtail {
+
+GrpcSink* GrpcSink::GetInstance() {
+#ifndef APSARA_UNIT_TEST_MAIN
+    static GrpcSink instance;
+    return &instance;
+#else
+    return GrpcSinkMock::GetInstance();
+#endif
+}
+
+bool GrpcSink::Init() {
+    WriteMetrics::GetInstance()->CreateMetricsRecordRef(
+        mMetricsRecordRef,
+        MetricCategory::METRIC_CATEGORY_RUNNER,
+        {{METRIC_LABEL_KEY_RUNNER_NAME, METRIC_LABEL_VALUE_RUNNER_NAME_GRPC_SINK}});
+    mInItemsTotal = mMetricsRecordRef.CreateCounter(METRIC_RUNNER_IN_ITEMS_TOTAL);
+    mLastRunTime = mMetricsRecordRef.CreateIntGauge(METRIC_RUNNER_LAST_RUN_TIME);
+    mOutSuccessfulItemsTotal = mMetricsRecordRef.CreateCounter(METRIC_RUNNER_SINK_OUT_SUCCESSFUL_ITEMS_TOTAL);
+    mOutFailedItemsTotal = mMetricsRecordRef.CreateCounter(METRIC_RUNNER_SINK_OUT_FAILED_ITEMS_TOTAL);
+    mSendingItemsTotal = mMetricsRecordRef.CreateIntGauge(METRIC_RUNNER_SINK_SENDING_ITEMS_TOTAL);
+    WriteMetrics::GetInstance()->CommitMetricsRecordRef(mMetricsRecordRef);
+
+    mThreadRes = async(launch::async, &GrpcSink::Run, this);
+    return true;
+}
+
+void GrpcSink::Stop() {
+    mIsFlush.store(true);
+    mCV.notify_all();
+    if (!mThreadRes.valid()) {
+        return;
+    }
+    future_status s = mThreadRes.wait_for(chrono::seconds(INT32_FLAG(grpc_sink_exit_timeout_sec)));
+    if (s == future_status::ready) {
+        LOG_INFO(sLogger, ("grpc sink", "stopped successfully"));
+    } else {
+        LOG_WARNING(sLogger, ("grpc sink", "forced to stop"));
+    }
+}
+
+void GrpcSink::AddRequest(unique_ptr<OTLPGrpcCallContext>&& ctx) {
+    {
+        lock_guard lock(mPendingMutex);
+        mPendingRequests.push_back(std::move(ctx));
+    }
+    mCV.notify_one();
+}
+
+void GrpcSink::Run() {
+    LOG_INFO(sLogger, ("grpc sink", "started"));
+    while (true) {
+        SET_GAUGE(mLastRunTime,
+                  chrono::duration_cast<chrono::seconds>(chrono::system_clock::now().time_since_epoch()).count());
+
+        DispatchRequests();
+
+        if (mIsFlush.load()) {
+            unique_lock lock(mPendingMutex);
+            if (mPendingRequests.empty() && mSendingCnt.load() == 0) {
+                break;
+            }
+            mCV.wait_for(lock, chrono::milliseconds(100));
+        } else {
+            unique_lock lock(mPendingMutex);
+            if (mPendingRequests.empty()) {
+                mCV.wait_for(lock, chrono::milliseconds(500));
+            }
+        }
+    }
+}
+
+void GrpcSink::DispatchRequests() {
+    vector<unique_ptr<OTLPGrpcCallContext>> items;
+    {
+        lock_guard lock(mPendingMutex);
+        if (mPendingRequests.empty()) {
+            return;
+        }
+        items = std::move(mPendingRequests);
+    }
+
+    for (auto& ctx : items) {
+        ADD_COUNTER(mInItemsTotal, 1);
+        ADD_GAUGE(mSendingItemsTotal, 1);
+        mSendingCnt.fetch_add(1);
+
+        auto* rawCtx = ctx.release();
+        auto* flusher = static_cast<FlusherOTLPNative*>(rawCtx->item->mFlusher);
+
+        auto callback = [rawCtx, flusher, this](grpc::Status status) {
+            auto pipelinePlaceholder = rawCtx->item->mPipeline;
+            // Decrement GrpcSink's in-flight counter
+            mSendingCnt.fetch_sub(1);
+            ADD_GAUGE(mSendingItemsTotal, -1);
+            // Let the flusher handle metrics + OnSendDone + ctx cleanup
+            flusher->HandleGrpcCallback(std::move(status), rawCtx);
+        };
+
+        rawCtx->context = make_unique<grpc::ClientContext>();
+        rawCtx->context->set_deadline(chrono::system_clock::now() + chrono::milliseconds(flusher->GetTimeoutMs()));
+        for (const auto& [k, v] : flusher->GetHeaders()) {
+            rawCtx->context->AddMetadata(k, v);
+        }
+
+        flusher->IncInFlight();
+        flusher->TrackContext(rawCtx->context.get());
+
+        switch (rawCtx->type) {
+            case OTLPGrpcCallContext::DataType::Logs:
+                flusher->GetLogsStub()->async()->Export(
+                    rawCtx->context.get(), rawCtx->logsReq.get(), rawCtx->logsResp.get(), std::move(callback));
+                break;
+            case OTLPGrpcCallContext::DataType::Metrics:
+                flusher->GetMetricsStub()->async()->Export(
+                    rawCtx->context.get(), rawCtx->metricsReq.get(), rawCtx->metricsResp.get(), std::move(callback));
+                break;
+            case OTLPGrpcCallContext::DataType::Traces:
+                flusher->GetTraceStub()->async()->Export(
+                    rawCtx->context.get(), rawCtx->traceReq.get(), rawCtx->traceResp.get(), std::move(callback));
+                break;
+        }
+    }
+}
+
+} // namespace logtail

diff --git a/core/runner/sink/grpc/GrpcSink.h b/core/runner/sink/grpc/GrpcSink.h
new file mode 100644
index 0000000000..3d7ea4f41f
--- /dev/null
+++ b/core/runner/sink/grpc/GrpcSink.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright 2025 iLogtail Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <atomic>
+#include <condition_variable>
+#include <future>
+#include <memory>
+#include <mutex>
+#include <vector>
+
+#include "monitor/MetricManager.h"
+
+namespace logtail {
+
+struct OTLPGrpcCallContext;
+
+class GrpcSink {
+public:
+    GrpcSink(const GrpcSink&) = delete;
+    GrpcSink& operator=(const GrpcSink&) = delete;
+
+    static GrpcSink* GetInstance();
+
+    virtual bool Init();
+    virtual void Stop();
+    virtual void AddRequest(std::unique_ptr<OTLPGrpcCallContext>&& ctx);
+
+    int32_t GetInFlightCount() const { return mSendingCnt.load(); }
+
+private:
+    GrpcSink() = default;
+    ~GrpcSink() = default;
+
+    void Run();
+    void DispatchRequests();
+
+    std::future<void> mThreadRes;
+    std::atomic_bool mIsFlush{false};
+
+    std::mutex mPendingMutex;
+    std::condition_variable mCV;
+    std::vector<std::unique_ptr<OTLPGrpcCallContext>> mPendingRequests;
+
+    std::atomic<int32_t> mSendingCnt{0};
+
+    mutable MetricsRecordRef mMetricsRecordRef;
+    CounterPtr mInItemsTotal;
+    CounterPtr mOutSuccessfulItemsTotal;
+    CounterPtr mOutFailedItemsTotal;
+    IntGaugePtr mSendingItemsTotal;
+    IntGaugePtr mLastRunTime;
+
+#ifdef APSARA_UNIT_TEST_MAIN
+    friend class GrpcSinkMock;
+#endif
+};
+
+} // namespace logtail

diff --git a/core/unittest/CMakeLists.txt b/core/unittest/CMakeLists.txt
index bce30c1fb0..cfc14b1c03 100644
--- a/core/unittest/CMakeLists.txt
+++ b/core/unittest/CMakeLists.txt
@@ -98,6 +98,14 @@ macro(ut_link ut_link_withspl)
 endmacro()
 
 set(SOURCE_FILES_CORE ${FRAMEWORK_SOURCE_FILES} ${PLUGIN_SOURCE_FILES_CORE})
+
+# OTLP protobuf source files (must list explicitly since subdirectory globs don't find them)
+list(APPEND SOURCE_FILES_CORE ${CMAKE_SOURCE_DIR}/collection_pipeline/plugin/interface/Flusher.cpp)
+if (LINUX)
+    file(GLOB_RECURSE OTLP_PROTO_SRCS ${CMAKE_SOURCE_DIR}/protobuf/opentelemetry/*.cc)
+    list(APPEND SOURCE_FILES_CORE ${OTLP_PROTO_SRCS} ${CMAKE_SOURCE_DIR}/forward/otlp/OTLPForwardService.cpp)
+endif()
+
 set(SOURCE_FILES_CORE_WITHSPL ${SOURCE_FILES_CORE} ${PLUGIN_SOURCE_FILES_SPL})
 
 # add provider
diff --git
a/core/unittest/flusher/CMakeLists.txt b/core/unittest/flusher/CMakeLists.txt index f4ab4af838..97fa2c04a9 100644 --- a/core/unittest/flusher/CMakeLists.txt +++ b/core/unittest/flusher/CMakeLists.txt @@ -21,6 +21,14 @@ if (ENABLE_ENTERPRISE) endif () target_link_libraries(flusher_sls_unittest ${UT_BASE_TARGET}) +if(LINUX) + add_executable(flusher_otlp_native_unittest FlusherOTLPNativeUnittest.cpp) + target_link_libraries(flusher_otlp_native_unittest ${UT_BASE_TARGET}) + + add_executable(flusher_otlp_http_native_unittest FlusherOTLPHttpNativeUnittest.cpp) + target_link_libraries(flusher_otlp_http_native_unittest ${UT_BASE_TARGET}) +endif() + if(UNIX AND NOT ENABLE_ENTERPRISE) add_executable(flusher_kafka_unittest FlusherKafkaUnittest.cpp) target_link_libraries(flusher_kafka_unittest ${UT_BASE_TARGET}) @@ -51,6 +59,10 @@ endif () include(GoogleTest) gtest_discover_tests(flusher_sls_unittest) +if(LINUX) + gtest_discover_tests(flusher_otlp_native_unittest) + gtest_discover_tests(flusher_otlp_http_native_unittest) +endif() if(UNIX AND NOT ENABLE_ENTERPRISE) gtest_discover_tests(flusher_kafka_unittest) gtest_discover_tests(kafka_util_unittest) diff --git a/core/unittest/flusher/FlusherOTLPHttpNativeUnittest.cpp b/core/unittest/flusher/FlusherOTLPHttpNativeUnittest.cpp new file mode 100644 index 0000000000..0e354a8a54 --- /dev/null +++ b/core/unittest/flusher/FlusherOTLPHttpNativeUnittest.cpp @@ -0,0 +1,359 @@ +/* + * Copyright 2025 iLogtail Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include +#include + +#include "collection_pipeline/CollectionPipeline.h" +#include "collection_pipeline/CollectionPipelineContext.h" +#include "common/http/HttpResponse.h" +#include "common/memory/SourceBuffer.h" +#include "models/LogEvent.h" +#include "models/MetricEvent.h" +#include "models/MetricValue.h" +#include "models/PipelineEventGroup.h" +#include "models/SpanEvent.h" +#include "plugin/flusher/opentelemetry/FlusherOTLPHttpNative.h" +#include "runner/sink/http/HttpSinkRequest.h" +#include "unittest/Unittest.h" + +using namespace std; + +namespace logtail { + +class FlusherOTLPHttpNativeUnittest : public testing::Test { +public: + void SetUp() override { + ctx.SetConfigName("test_config"); + ctx.SetPipeline(pipeline); + } + + void TearDown() override {} + + void TestInit_ValidConfig(); + void TestInit_MissingUrl(); + void TestInit_WithTLSAndHeaders(); + void TestBuildRequestLogs(); + void TestBuildRequestMetrics(); + void TestBuildRequestTraces(); + void TestBuildRequest_EmptyData(); + void TestOnSendDone_Success(); + void TestOnSendDone_Failure(); + void TestFlusherName(); + void TestInit_ProtobufFormat(); + void TestBuildRequest_ProtobufContentType(); + void TestSerialize_ProtobufBody(); + +protected: + CollectionPipeline pipeline; + CollectionPipelineContext ctx; + + std::unique_ptr CreateAndInitFlusher(const Json::Value& config) { + auto flusher = std::make_unique(); + flusher->SetContext(ctx); + flusher->CreateMetricsRecordRef(FlusherOTLPHttpNative::sName, "1"); + + Json::Value optionalGoPipeline; + if (!flusher->Init(config, optionalGoPipeline)) { + return nullptr; + } + flusher->CommitMetricsRecordRef(); + return flusher; + } + + PipelineEventGroup MakeLogGroup() { + auto group = PipelineEventGroup(std::make_shared()); + auto* logEvent = group.AddLogEvent(true); + logEvent->SetTimestamp(1748313840, 259486017); + logEvent->SetContent(std::string("content"), std::string("Test log message")); + group.SetTag(std::string("service.name"), std::string("unittest")); + return group; + } + + PipelineEventGroup MakeMetricGroup() { + auto group = PipelineEventGroup(std::make_shared()); + auto* metricEvent = group.AddMetricEvent(true); + metricEvent->SetName("test.metric.value"); + auto* pt = metricEvent->MutableValue(); + pt->mValue = 42.5; + group.SetTag(std::string("service.name"), std::string("unittest")); + return group; + } + + PipelineEventGroup MakeSpanGroup() { + auto group = PipelineEventGroup(std::make_shared()); + auto* spanEvent = group.AddSpanEvent(true); + spanEvent->SetTraceId("unittest1234567890abcdef12345678"); + spanEvent->SetSpanId("unittest12345678"); + spanEvent->SetParentSpanId("parent12345678901234"); + spanEvent->SetName("/test/api"); + spanEvent->SetKind(SpanEvent::Kind::Server); + spanEvent->SetStatus(SpanEvent::StatusCode::Ok); + spanEvent->SetStartTimeNs(1748313840259486017ULL); + spanEvent->SetEndTimeNs(1748313840259765375ULL); + spanEvent->SetTag(std::string("service.name"), std::string("unittest")); + return group; + } +}; + +void FlusherOTLPHttpNativeUnittest::TestInit_ValidConfig() { + Json::Value config; + config["Url"] = "http://localhost:4318/v1/logs"; + + auto flusher = CreateAndInitFlusher(config); + APSARA_TEST_TRUE(flusher != nullptr); + APSARA_TEST_STREQ(flusher->GetUrl().c_str(), "http://localhost:4318/v1/logs"); +} + +void FlusherOTLPHttpNativeUnittest::TestInit_MissingUrl() { + auto flusher = std::make_unique(); + flusher->SetContext(ctx); + flusher->CreateMetricsRecordRef(FlusherOTLPHttpNative::sName, "1"); + + 
Json::Value config; + Json::Value optionalGoPipeline; + + APSARA_TEST_FALSE(flusher->Init(config, optionalGoPipeline)); +} + +void FlusherOTLPHttpNativeUnittest::TestInit_WithTLSAndHeaders() { + Json::Value config; + config["Url"] = "https://otel.example.com/v1/traces"; + config["EnableTLS"] = true; + Json::Value headers; + headers["Authorization"] = "Bearer test-token"; + headers["X-Custom-Header"] = "custom-value"; + config["Headers"] = headers; + + auto flusher = CreateAndInitFlusher(config); + APSARA_TEST_TRUE(flusher != nullptr); +} + +void FlusherOTLPHttpNativeUnittest::TestBuildRequestLogs() { + Json::Value config; + config["Url"] = "http://localhost:4318/v1/logs"; + config["Format"] = "json"; + Json::Value headers; + headers["X-Test"] = "value"; + config["Headers"] = headers; + + auto flusher = CreateAndInitFlusher(config); + APSARA_TEST_TRUE(flusher != nullptr); + + // Build a test SenderQueueItem with serialized OTLP JSON data + std::string testData + = R"({"resource_logs":[{"resource":{"attributes":[{"key":"service.name","value":{"string_value":"unittest"}}]},"scope_logs":[{"log_records":[{"time_unix_nano":"1748313840259486017","body":{"string_value":"Test log message"}}]}]}]})"; + SenderQueueItem testItem(std::string(testData), testData.size(), flusher.get(), 0, RawDataType::EVENT_GROUP); + + std::unique_ptr req; + bool keepItem = false; + std::string errMsg; + APSARA_TEST_TRUE(flusher->BuildRequest(&testItem, req, &keepItem, &errMsg)); + APSARA_TEST_TRUE(req != nullptr); + APSARA_TEST_TRUE(keepItem); + + // Verify request properties + APSARA_TEST_STREQ(req->mMethod.c_str(), "POST"); + APSARA_TEST_STREQ(req->mHost.c_str(), "localhost"); + APSARA_TEST_EQUAL(req->mPort, 4318); + APSARA_TEST_STREQ(req->mUrl.c_str(), "/v1/logs"); + + // Verify headers + auto it = req->mHeader.find("Content-Type"); + APSARA_TEST_TRUE(it != req->mHeader.end()); + APSARA_TEST_STREQ(it->second.c_str(), "application/json"); + + it = req->mHeader.find("X-Test"); + APSARA_TEST_TRUE(it != req->mHeader.end()); + APSARA_TEST_STREQ(it->second.c_str(), "value"); + + // Verify body contains OTLP JSON structure + APSARA_TEST_TRUE(req->mBody.find("resource_logs") != std::string::npos); + APSARA_TEST_TRUE(req->mBody.find("Test log message") != std::string::npos); +} + +void FlusherOTLPHttpNativeUnittest::TestBuildRequestMetrics() { + Json::Value config; + config["Url"] = "http://localhost:4318/v1/metrics"; + + auto flusher = CreateAndInitFlusher(config); + APSARA_TEST_TRUE(flusher != nullptr); + + std::string testData + = R"({"resource_metrics":[{"resource":{"attributes":[{"key":"service.name","value":{"string_value":"unittest"}}]},"scope_metrics":[{"metrics":[{"name":"test.metric.value","gauge":{"data_points":[{"as_double":42.5}]}}]}]}]})"; + SenderQueueItem testItem(std::string(testData), testData.size(), flusher.get(), 0, RawDataType::EVENT_GROUP); + + std::unique_ptr req; + bool keepItem = false; + std::string errMsg; + APSARA_TEST_TRUE(flusher->BuildRequest(&testItem, req, &keepItem, &errMsg)); + APSARA_TEST_TRUE(req != nullptr); + + APSARA_TEST_STREQ(req->mHost.c_str(), "localhost"); + APSARA_TEST_EQUAL(req->mPort, 4318); + APSARA_TEST_STREQ(req->mUrl.c_str(), "/v1/metrics"); + APSARA_TEST_TRUE(req->mBody.find("resource_metrics") != std::string::npos); + APSARA_TEST_TRUE(req->mBody.find("test.metric.value") != std::string::npos); + APSARA_TEST_TRUE(req->mBody.find("42.5") != std::string::npos); +} + +void FlusherOTLPHttpNativeUnittest::TestBuildRequestTraces() { + Json::Value config; + config["Url"] = 
"http://localhost:4318/v1/traces"; + + auto flusher = CreateAndInitFlusher(config); + APSARA_TEST_TRUE(flusher != nullptr); + + std::string testData + = R"({"resource_spans":[{"resource":{"attributes":[{"key":"service.name","value":{"string_value":"unittest"}}]},"scope_spans":[{"spans":[{"trace_id":"unittest1234567890abcdef12345678","span_id":"unittest12345678","name":"/test/api","kind":2,"status":{"code":1}}]}]}]})"; + SenderQueueItem testItem(std::string(testData), testData.size(), flusher.get(), 0, RawDataType::EVENT_GROUP); + + std::unique_ptr req; + bool keepItem = false; + std::string errMsg; + APSARA_TEST_TRUE(flusher->BuildRequest(&testItem, req, &keepItem, &errMsg)); + APSARA_TEST_TRUE(req != nullptr); + + APSARA_TEST_STREQ(req->mHost.c_str(), "localhost"); + APSARA_TEST_EQUAL(req->mPort, 4318); + APSARA_TEST_STREQ(req->mUrl.c_str(), "/v1/traces"); + APSARA_TEST_TRUE(req->mBody.find("resource_spans") != std::string::npos); + APSARA_TEST_TRUE(req->mBody.find("/test/api") != std::string::npos); +} + +void FlusherOTLPHttpNativeUnittest::TestBuildRequest_EmptyData() { + Json::Value config; + config["Url"] = "http://localhost:4318/v1/logs"; + + auto flusher = CreateAndInitFlusher(config); + APSARA_TEST_TRUE(flusher != nullptr); + + std::string emptyData; + SenderQueueItem testItem(std::string(emptyData), 0, flusher.get(), 0, RawDataType::EVENT_GROUP); + + std::unique_ptr req; + bool keepItem = true; + std::string errMsg; + APSARA_TEST_TRUE(flusher->BuildRequest(&testItem, req, &keepItem, &errMsg)); + APSARA_TEST_FALSE(keepItem); // empty data should be discarded +} + +void FlusherOTLPHttpNativeUnittest::TestOnSendDone_Success() { + Json::Value config; + config["Url"] = "http://localhost:4318/v1/logs"; + + auto flusher = CreateAndInitFlusher(config); + APSARA_TEST_TRUE(flusher != nullptr); + + std::string testData = R"({"resource_logs":[]})"; + SenderQueueItem testItem(std::move(testData), testData.size(), flusher.get(), 0, RawDataType::EVENT_GROUP); + + // Simulate successful HTTP response (200) + HttpResponse response; + response.SetStatusCode(200); + + flusher->OnSendDone(response, &testItem); + APSARA_TEST_EQUAL(1U, flusher->mSendSuccessCnt->GetValue()); +} + +void FlusherOTLPHttpNativeUnittest::TestOnSendDone_Failure() { + Json::Value config; + config["Url"] = "http://localhost:4318/v1/logs"; + + auto flusher = CreateAndInitFlusher(config); + APSARA_TEST_TRUE(flusher != nullptr); + + std::string testData = R"({"resource_logs":[]})"; + SenderQueueItem testItem(std::move(testData), testData.size(), flusher.get(), 0, RawDataType::EVENT_GROUP); + + // Simulate server error (503) + HttpResponse response; + response.SetStatusCode(503); + + flusher->OnSendDone(response, &testItem); + APSARA_TEST_EQUAL(1U, flusher->mSendFailCnt->GetValue()); +} + +void FlusherOTLPHttpNativeUnittest::TestFlusherName() { + APSARA_TEST_STREQ(FlusherOTLPHttpNative::sName.c_str(), "flusher_otlp_http_native"); +} + +void FlusherOTLPHttpNativeUnittest::TestInit_ProtobufFormat() { + Json::Value config; + config["Url"] = "http://localhost:4318/v1/logs"; + config["Format"] = "protobuf"; + + auto flusher = CreateAndInitFlusher(config); + APSARA_TEST_TRUE(flusher != nullptr); + APSARA_TEST_TRUE(flusher->GetFormat() == OTLPHttpFormat::Protobuf); +} + +void FlusherOTLPHttpNativeUnittest::TestBuildRequest_ProtobufContentType() { + Json::Value config; + config["Url"] = "http://localhost:4318/v1/logs"; + config["Format"] = "protobuf"; + + auto flusher = CreateAndInitFlusher(config); + APSARA_TEST_TRUE(flusher != nullptr); 
+ + std::string testData = "binary-protobuf-data"; + SenderQueueItem testItem(std::string(testData), testData.size(), flusher.get(), 0, RawDataType::EVENT_GROUP); + + std::unique_ptr req; + bool keepItem = false; + std::string errMsg; + APSARA_TEST_TRUE(flusher->BuildRequest(&testItem, req, &keepItem, &errMsg)); + APSARA_TEST_TRUE(req != nullptr); + + auto it = req->mHeader.find("Content-Type"); + APSARA_TEST_TRUE(it != req->mHeader.end()); + APSARA_TEST_STREQ(it->second.c_str(), "application/x-protobuf"); +} + +void FlusherOTLPHttpNativeUnittest::TestSerialize_ProtobufBody() { + Json::Value config; + config["Url"] = "http://localhost:4318/v1/logs"; + config["Format"] = "protobuf"; + + auto flusher = CreateAndInitFlusher(config); + APSARA_TEST_TRUE(flusher != nullptr); + + auto group = MakeLogGroup(); + // Call Send which should trigger protobuf serialization + APSARA_TEST_TRUE(flusher->Send(std::move(group))); + APSARA_TEST_EQUAL(1U, flusher->mSendCnt->GetValue()); +} + +UNIT_TEST_CASE(FlusherOTLPHttpNativeUnittest, TestInit_ValidConfig) +UNIT_TEST_CASE(FlusherOTLPHttpNativeUnittest, TestInit_MissingUrl) +UNIT_TEST_CASE(FlusherOTLPHttpNativeUnittest, TestInit_WithTLSAndHeaders) +UNIT_TEST_CASE(FlusherOTLPHttpNativeUnittest, TestBuildRequestLogs) +UNIT_TEST_CASE(FlusherOTLPHttpNativeUnittest, TestBuildRequestMetrics) +UNIT_TEST_CASE(FlusherOTLPHttpNativeUnittest, TestBuildRequestTraces) +UNIT_TEST_CASE(FlusherOTLPHttpNativeUnittest, TestBuildRequest_EmptyData) +UNIT_TEST_CASE(FlusherOTLPHttpNativeUnittest, TestOnSendDone_Success) +UNIT_TEST_CASE(FlusherOTLPHttpNativeUnittest, TestOnSendDone_Failure) +UNIT_TEST_CASE(FlusherOTLPHttpNativeUnittest, TestFlusherName) +UNIT_TEST_CASE(FlusherOTLPHttpNativeUnittest, TestInit_ProtobufFormat) +UNIT_TEST_CASE(FlusherOTLPHttpNativeUnittest, TestBuildRequest_ProtobufContentType) +UNIT_TEST_CASE(FlusherOTLPHttpNativeUnittest, TestSerialize_ProtobufBody) + +} // namespace logtail + +UNIT_TEST_MAIN diff --git a/core/unittest/flusher/FlusherOTLPNativeUnittest.cpp b/core/unittest/flusher/FlusherOTLPNativeUnittest.cpp new file mode 100644 index 0000000000..03182f8507 --- /dev/null +++ b/core/unittest/flusher/FlusherOTLPNativeUnittest.cpp @@ -0,0 +1,567 @@ +/* + * Copyright 2025 iLogtail Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include +#include +#include +#include + +#include "collection_pipeline/CollectionPipeline.h" +#include "collection_pipeline/CollectionPipelineContext.h" +#include "common/memory/SourceBuffer.h" +#include "models/LogEvent.h" +#include "models/MetricEvent.h" +#include "models/MetricValue.h" +#include "models/PipelineEventGroup.h" +#include "models/RawEvent.h" +#include "models/SpanEvent.h" +#include "plugin/flusher/opentelemetry/FlusherOTLPNative.h" +#include "protobuf/opentelemetry/proto/collector/logs/v1/logs_service.grpc.pb.h" +#include "protobuf/opentelemetry/proto/collector/metrics/v1/metrics_service.grpc.pb.h" +#include "protobuf/opentelemetry/proto/collector/trace/v1/trace_service.grpc.pb.h" +#include "unittest/Unittest.h" + +using namespace std; +using namespace opentelemetry::proto::collector::logs::v1; +using namespace opentelemetry::proto::collector::metrics::v1; +using namespace opentelemetry::proto::collector::trace::v1; + +namespace logtail { + +// ==================== Mock gRPC Collector Service ==================== + +class MockOTLPCollectorService : public LogsService::CallbackService, + public MetricsService::CallbackService, + public TraceService::CallbackService { +public: + grpc::ServerUnaryReactor* Export(grpc::CallbackServerContext* context, + const ExportLogsServiceRequest* request, + ExportLogsServiceResponse* response) override { + auto* reactor = context->DefaultReactor(); + if (request && !request->resource_logs().empty()) { + lock_guard lock(mMutex); + mLastLogRequest = make_unique(*request); + mLogCallCount++; + } + reactor->Finish(mLogStatus); + return reactor; + } + + grpc::ServerUnaryReactor* Export(grpc::CallbackServerContext* context, + const ExportMetricsServiceRequest* request, + ExportMetricsServiceResponse* response) override { + auto* reactor = context->DefaultReactor(); + if (request && !request->resource_metrics().empty()) { + lock_guard lock(mMutex); + mLastMetricRequest = make_unique(*request); + mMetricCallCount++; + } + reactor->Finish(mMetricStatus); + return reactor; + } + + grpc::ServerUnaryReactor* Export(grpc::CallbackServerContext* context, + const ExportTraceServiceRequest* request, + ExportTraceServiceResponse* response) override { + auto* reactor = context->DefaultReactor(); + if (request && !request->resource_spans().empty()) { + lock_guard lock(mMutex); + mLastTraceRequest = make_unique(*request); + mTraceCallCount++; + } + reactor->Finish(mTraceStatus); + return reactor; + } + + void SetLogStatus(const grpc::Status& s) { mLogStatus = s; } + void SetMetricStatus(const grpc::Status& s) { mMetricStatus = s; } + void SetTraceStatus(const grpc::Status& s) { mTraceStatus = s; } + void Reset() { + lock_guard lock(mMutex); + mLastLogRequest.reset(); + mLastMetricRequest.reset(); + mLastTraceRequest.reset(); + mLogCallCount = 0; + mMetricCallCount = 0; + mTraceCallCount = 0; + mLogStatus = grpc::Status::OK; + mMetricStatus = grpc::Status::OK; + mTraceStatus = grpc::Status::OK; + } + + int GetLogCallCount() const { return mLogCallCount; } + int GetMetricCallCount() const { return mMetricCallCount; } + int GetTraceCallCount() const { return mTraceCallCount; } + const ExportLogsServiceRequest* GetLastLogRequest() const { return mLastLogRequest.get(); } + const ExportMetricsServiceRequest* GetLastMetricRequest() const { return mLastMetricRequest.get(); } + const ExportTraceServiceRequest* GetLastTraceRequest() const { return mLastTraceRequest.get(); } + +private: + mutable mutex mMutex; + unique_ptr mLastLogRequest; 
+ unique_ptr mLastMetricRequest; + unique_ptr mLastTraceRequest; + int mLogCallCount = 0; + int mMetricCallCount = 0; + int mTraceCallCount = 0; + grpc::Status mLogStatus = grpc::Status::OK; + grpc::Status mMetricStatus = grpc::Status::OK; + grpc::Status mTraceStatus = grpc::Status::OK; +}; + +// ==================== Test Fixture ==================== + +class FlusherOTLPNativeUnittest : public testing::Test { +public: + void SetUp() override { + mServerAddress = "0.0.0.0:15900"; + mMockService = make_unique(); + StartServer(); + ctx.SetConfigName("test_config"); + ctx.SetPipeline(pipeline); + } + + void TearDown() override { + if (mServer) { + mServer->Shutdown(chrono::system_clock::now() + chrono::seconds(1)); + mServer->Wait(); + mServerThread.join(); + } + } + + void TestInit_ValidConfig(); + void TestInit_MissingEndpoint(); + void TestInit_CustomHeaders(); + void TestSerializeLogs(); + void TestSerializeMetrics(); + void TestSerializeTraces(); + void TestSerializeRawEvents(); + void TestBuildGrpcRequest_Logs(); + void TestBuildGrpcRequest_Metrics(); + void TestBuildGrpcRequest_Traces(); + void TestHandleGrpcCallback_Success(); + void TestHandleGrpcCallback_Failure(); + void TestSend_ReturnsTrueForEmpty(); + void TestFlusherName(); + void TestSinkType(); + +protected: + string mServerAddress; + unique_ptr mServer; + thread mServerThread; + unique_ptr mMockService; + + CollectionPipeline pipeline; + CollectionPipelineContext ctx; + + void StartServer() { + grpc::ServerBuilder builder; + builder.AddListeningPort(mServerAddress, grpc::InsecureServerCredentials()); + builder.RegisterService( + static_cast(static_cast(mMockService.get()))); + builder.RegisterService( + static_cast(static_cast(mMockService.get()))); + builder.RegisterService( + static_cast(static_cast(mMockService.get()))); + mServer = builder.BuildAndStart(); + ASSERT_NE(mServer, nullptr); + mServerThread = thread([this]() { mServer->Wait(); }); + this_thread::sleep_for(chrono::milliseconds(500)); + } + + unique_ptr CreateAndInitFlusher() { + auto flusher = make_unique(); + flusher->SetContext(ctx); + flusher->CreateMetricsRecordRef(FlusherOTLPNative::sName, "1"); + + Json::Value config; + config["Endpoint"] = mServerAddress; + config["TimeoutMs"] = 5000; + config["EnableTLS"] = false; + Json::Value::Members headers; + config["Headers"]["X-Custom-Header"] = "test-value"; + Json::Value optionalGoPipeline; + + if (!flusher->Init(config, optionalGoPipeline)) { + return nullptr; + } + flusher->CommitMetricsRecordRef(); + + if (!flusher->Start()) { + return nullptr; + } + return flusher; + } + + PipelineEventGroup MakeLogGroup() { + auto group = PipelineEventGroup(make_shared()); + auto* logEvent = group.AddLogEvent(true); + logEvent->SetTimestamp(1748313840, 259486017); + logEvent->SetContent(string("content"), string("Test log message")); + logEvent->SetContent(string("key1"), string("value1")); + group.SetTag(string("service.name"), string("unittest")); + return group; + } + + PipelineEventGroup MakeMetricGroup() { + auto group = PipelineEventGroup(make_shared()); + auto* metricEvent = group.AddMetricEvent(true); + metricEvent->SetName("test.metric.value"); + metricEvent->SetValue(42.5); + metricEvent->SetTag(string("host"), string("test-host")); + group.SetTag(string("service.name"), string("unittest")); + return group; + } + + PipelineEventGroup MakeSpanGroup() { + auto group = PipelineEventGroup(make_shared()); + auto* spanEvent = group.AddSpanEvent(true); + spanEvent->SetTraceId("unittest1234567890abcdef12345678"); + 
spanEvent->SetSpanId("unittest12345678"); + spanEvent->SetParentSpanId("parent12345678901234"); + spanEvent->SetName("/test/api"); + spanEvent->SetKind(SpanEvent::Kind::Server); + spanEvent->SetStatus(SpanEvent::StatusCode::Ok); + spanEvent->SetStartTimeNs(1748313840259486017ULL); + spanEvent->SetEndTimeNs(1748313840259765375ULL); + spanEvent->SetTag(string("service.name"), string("unittest")); + return group; + } + + PipelineEventGroup MakeRawEventGroup() { + auto group = PipelineEventGroup(make_shared()); + auto* rawEvent = group.AddRawEvent(true); + rawEvent->SetTimestamp(1748313840, 259486017); + rawEvent->SetContent(string("raw log content")); + group.SetTag(string("service.name"), string("unittest")); + return group; + } +}; + +// ==================== Init Tests ==================== + +void FlusherOTLPNativeUnittest::TestInit_ValidConfig() { + auto flusher = CreateAndInitFlusher(); + APSARA_TEST_TRUE(flusher != nullptr); + APSARA_TEST_STREQ(FlusherOTLPNative::sName.c_str(), "flusher_otlp_native"); +} + +void FlusherOTLPNativeUnittest::TestInit_MissingEndpoint() { + auto flusher = make_unique(); + flusher->SetContext(ctx); + flusher->CreateMetricsRecordRef(FlusherOTLPNative::sName, "1"); + + Json::Value config; + config["TimeoutMs"] = 5000; + Json::Value optionalGoPipeline; + + APSARA_TEST_FALSE(flusher->Init(config, optionalGoPipeline)); +} + +void FlusherOTLPNativeUnittest::TestInit_CustomHeaders() { + auto flusher = make_unique(); + flusher->SetContext(ctx); + flusher->CreateMetricsRecordRef(FlusherOTLPNative::sName, "1"); + + Json::Value config; + config["Endpoint"] = mServerAddress; + config["Headers"]["Authorization"] = "Bearer token123"; + config["Headers"]["X-Request-Id"] = "req-001"; + Json::Value optionalGoPipeline; + + APSARA_TEST_TRUE(flusher->Init(config, optionalGoPipeline)); + APSARA_TEST_EQUAL(flusher->GetHeaders().size(), 2); + APSARA_TEST_STREQ(flusher->GetHeaders().at("Authorization").c_str(), "Bearer token123"); + APSARA_TEST_TRUE(flusher->GetTimeoutMs() == 30000); // default +} + +// ==================== Serialization Tests ==================== + +void FlusherOTLPNativeUnittest::TestSerializeLogs() { + auto flusher = make_unique(); + flusher->SetContext(ctx); + + auto group = MakeLogGroup(); + string data; + string errMsg; + APSARA_TEST_TRUE(flusher->SerializeLogsToOTLP(group, data, errMsg)); + APSARA_TEST_TRUE(data.size() > 0); + + // Verify by deserializing + ExportLogsServiceRequest req; + APSARA_TEST_TRUE(req.ParseFromString(data)); + APSARA_TEST_EQUAL(req.resource_logs_size(), 1); + + const auto& scopeLogs = req.resource_logs(0).scope_logs(0); + APSARA_TEST_EQUAL(scopeLogs.log_records_size(), 1); + const auto& record = scopeLogs.log_records(0); + APSARA_TEST_STREQ(record.body().string_value().c_str(), "Test log message"); + APSARA_TEST_TRUE(record.time_unix_nano() > 0); + // Check attributes (key1, message) + bool foundKey1 = false; + for (int i = 0; i < record.attributes_size(); i++) { + if (record.attributes(i).key() == "key1") { + foundKey1 = true; + APSARA_TEST_STREQ(record.attributes(i).value().string_value().c_str(), "value1"); + } + } + APSARA_TEST_TRUE(foundKey1); +} + +void FlusherOTLPNativeUnittest::TestSerializeMetrics() { + auto flusher = make_unique(); + flusher->SetContext(ctx); + + auto group = MakeMetricGroup(); + string data; + string errMsg; + APSARA_TEST_TRUE(flusher->SerializeMetricsToOTLP(group, data, errMsg)); + APSARA_TEST_TRUE(data.size() > 0); + + ExportMetricsServiceRequest req; + APSARA_TEST_TRUE(req.ParseFromString(data)); + 
APSARA_TEST_EQUAL(req.resource_metrics_size(), 1); + + const auto& metric = req.resource_metrics(0).scope_metrics(0).metrics(0); + APSARA_TEST_STREQ(metric.name().c_str(), "test.metric.value"); + APSARA_TEST_TRUE(metric.has_gauge()); + APSARA_TEST_EQUAL(metric.gauge().data_points_size(), 1); + APSARA_TEST_EQUAL(metric.gauge().data_points(0).as_double(), 42.5); +} + +void FlusherOTLPNativeUnittest::TestSerializeTraces() { + auto flusher = make_unique(); + flusher->SetContext(ctx); + + auto group = MakeSpanGroup(); + string data; + string errMsg; + APSARA_TEST_TRUE(flusher->SerializeTracesToOTLP(group, data, errMsg)); + APSARA_TEST_TRUE(data.size() > 0); + + ExportTraceServiceRequest req; + APSARA_TEST_TRUE(req.ParseFromString(data)); + APSARA_TEST_EQUAL(req.resource_spans_size(), 1); + + const auto& span = req.resource_spans(0).scope_spans(0).spans(0); + APSARA_TEST_STREQ(span.name().c_str(), "/test/api"); + APSARA_TEST_EQUAL(span.kind(), opentelemetry::proto::trace::v1::Span_SpanKind_SPAN_KIND_SERVER); + APSARA_TEST_EQUAL(span.status().code(), opentelemetry::proto::trace::v1::Status_StatusCode_STATUS_CODE_OK); + APSARA_TEST_TRUE(span.start_time_unix_nano() > 0); + APSARA_TEST_TRUE(span.end_time_unix_nano() > 0); +} + +void FlusherOTLPNativeUnittest::TestSerializeRawEvents() { + auto flusher = make_unique(); + flusher->SetContext(ctx); + + auto group = MakeRawEventGroup(); + string data; + string errMsg; + APSARA_TEST_TRUE(flusher->SerializeLogsToOTLP(group, data, errMsg)); + APSARA_TEST_TRUE(data.size() > 0); + + ExportLogsServiceRequest req; + APSARA_TEST_TRUE(req.ParseFromString(data)); + APSARA_TEST_EQUAL(req.resource_logs_size(), 1); + + const auto& record = req.resource_logs(0).scope_logs(0).log_records(0); + APSARA_TEST_STREQ(record.body().string_value().c_str(), "raw log content"); + APSARA_TEST_TRUE(record.time_unix_nano() > 0); +} + +// ==================== BuildGrpcRequest Tests ==================== + +void FlusherOTLPNativeUnittest::TestBuildGrpcRequest_Logs() { + auto flusher = make_unique(); + flusher->SetContext(ctx); + + auto group = MakeLogGroup(); + string data; + string errMsg; + APSARA_TEST_TRUE(flusher->SerializeLogsToOTLP(group, data, errMsg)); + + auto item = make_unique( + move(data), data.size(), flusher.get(), 1, OTLPGrpcCallContext::DataType::Logs); + + unique_ptr ctx2; + bool keepItem = false; + string buildErr; + APSARA_TEST_TRUE(flusher->BuildGrpcRequest(item.get(), ctx2, &keepItem, &buildErr)); + APSARA_TEST_TRUE(ctx2 != nullptr); + APSARA_TEST_TRUE(ctx2->logsReq != nullptr); + APSARA_TEST_EQUAL(ctx2->logsReq->resource_logs_size(), 1); + APSARA_TEST_FALSE(keepItem); +} + +void FlusherOTLPNativeUnittest::TestBuildGrpcRequest_Metrics() { + auto flusher = make_unique(); + flusher->SetContext(ctx); + + auto group = MakeMetricGroup(); + string data; + string errMsg; + APSARA_TEST_TRUE(flusher->SerializeMetricsToOTLP(group, data, errMsg)); + + auto item = make_unique( + move(data), data.size(), flusher.get(), 1, OTLPGrpcCallContext::DataType::Metrics); + + unique_ptr ctx2; + bool keepItem = false; + string buildErr; + APSARA_TEST_TRUE(flusher->BuildGrpcRequest(item.get(), ctx2, &keepItem, &buildErr)); + APSARA_TEST_TRUE(ctx2 != nullptr); + APSARA_TEST_TRUE(ctx2->metricsReq != nullptr); + APSARA_TEST_EQUAL(ctx2->metricsReq->resource_metrics_size(), 1); +} + +void FlusherOTLPNativeUnittest::TestBuildGrpcRequest_Traces() { + auto flusher = make_unique(); + flusher->SetContext(ctx); + + auto group = MakeSpanGroup(); + string data; + string errMsg; + 
APSARA_TEST_TRUE(flusher->SerializeTracesToOTLP(group, data, errMsg)); + + auto item = make_unique( + move(data), data.size(), flusher.get(), 1, OTLPGrpcCallContext::DataType::Traces); + + unique_ptr ctx2; + bool keepItem = false; + string buildErr; + APSARA_TEST_TRUE(flusher->BuildGrpcRequest(item.get(), ctx2, &keepItem, &buildErr)); + APSARA_TEST_TRUE(ctx2 != nullptr); + APSARA_TEST_TRUE(ctx2->traceReq != nullptr); + APSARA_TEST_EQUAL(ctx2->traceReq->resource_spans_size(), 1); +} + +// ==================== HandleGrpcCallback Tests ==================== + +void FlusherOTLPNativeUnittest::TestHandleGrpcCallback_Success() { + auto flusher = make_unique(); + flusher->SetContext(ctx); + flusher->CreateMetricsRecordRef(FlusherOTLPNative::sName, "1"); + flusher->mSendSuccessCnt = flusher->GetMetricsRecordRef().CreateCounter("mSendSuccessCnt"); + flusher->CommitMetricsRecordRef(); + + string data = "test-data"; + auto item = make_unique( + move(data), data.size(), flusher.get(), 1, OTLPGrpcCallContext::DataType::Logs); + SenderQueueItem* rawItem = item.get(); + + auto* ctx2 = new OTLPGrpcCallContext(); + ctx2->item = item.release(); + ctx2->type = OTLPGrpcCallContext::DataType::Logs; + ctx2->logsReq = make_unique(); + ctx2->logsResp = make_unique(); + + flusher->IncInFlight(); + APSARA_TEST_EQUAL(flusher->InFlightCount(), 1); + + flusher->HandleGrpcCallback(grpc::Status(grpc::StatusCode::OK, ""), ctx2); + + this_thread::sleep_for(chrono::milliseconds(50)); + APSARA_TEST_EQUAL(flusher->InFlightCount(), 0); + APSARA_TEST_EQUAL(1U, flusher->mSendSuccessCnt->GetValue()); + // Clean up: DealSenderQueueItemAfterSend doesn't delete the item, relies on queue manager + delete rawItem; +} + +void FlusherOTLPNativeUnittest::TestHandleGrpcCallback_Failure() { + auto flusher = make_unique(); + flusher->SetContext(ctx); + flusher->CreateMetricsRecordRef(FlusherOTLPNative::sName, "1"); + flusher->mSendFailCnt = flusher->GetMetricsRecordRef().CreateCounter("mSendFailCnt"); + flusher->CommitMetricsRecordRef(); + + string data = "test-data"; + auto item = make_unique( + move(data), data.size(), flusher.get(), 1, OTLPGrpcCallContext::DataType::Logs); + SenderQueueItem* rawItem = item.get(); + + auto* ctx2 = new OTLPGrpcCallContext(); + ctx2->item = item.release(); + ctx2->type = OTLPGrpcCallContext::DataType::Logs; + ctx2->logsReq = make_unique(); + ctx2->logsResp = make_unique(); + + flusher->IncInFlight(); + auto status = grpc::Status(grpc::StatusCode::UNAVAILABLE, "Server unavailable"); + flusher->HandleGrpcCallback(move(status), ctx2); + + this_thread::sleep_for(chrono::milliseconds(50)); + APSARA_TEST_EQUAL(flusher->InFlightCount(), 0); + APSARA_TEST_EQUAL(1U, flusher->mSendFailCnt->GetValue()); + delete rawItem; +} + +// ==================== Misc Tests ==================== + +void FlusherOTLPNativeUnittest::TestSend_ReturnsTrueForEmpty() { + auto flusher = make_unique(); + flusher->SetContext(ctx); + flusher->CreateMetricsRecordRef(FlusherOTLPNative::sName, "1"); + Json::Value config; + config["Endpoint"] = "localhost:4317"; + Json::Value optionalGoPipeline; + APSARA_TEST_TRUE(flusher->Init(config, optionalGoPipeline)); + flusher->CommitMetricsRecordRef(); + + // Empty group → returns true, counter NOT incremented + PipelineEventGroup emptyGroup(make_shared()); + APSARA_TEST_TRUE(flusher->Send(move(emptyGroup))); + APSARA_TEST_EQUAL(0U, flusher->mSendCnt->GetValue()); + + // Non-empty group → returns true, counter incremented + auto group = MakeLogGroup(); + APSARA_TEST_TRUE(flusher->Send(move(group))); + 
APSARA_TEST_EQUAL(1U, flusher->mSendCnt->GetValue()); +} + +void FlusherOTLPNativeUnittest::TestFlusherName() { + APSARA_TEST_STREQ(FlusherOTLPNative::sName.c_str(), "flusher_otlp_native"); +} + +void FlusherOTLPNativeUnittest::TestSinkType() { + auto flusher = make_unique(); + APSARA_TEST_TRUE(flusher->GetSinkType() == SinkType::GRPC); +} + +// ==================== Test Registration ==================== + +UNIT_TEST_CASE(FlusherOTLPNativeUnittest, TestInit_ValidConfig) +UNIT_TEST_CASE(FlusherOTLPNativeUnittest, TestInit_MissingEndpoint) +UNIT_TEST_CASE(FlusherOTLPNativeUnittest, TestInit_CustomHeaders) +UNIT_TEST_CASE(FlusherOTLPNativeUnittest, TestSerializeLogs) +UNIT_TEST_CASE(FlusherOTLPNativeUnittest, TestSerializeMetrics) +UNIT_TEST_CASE(FlusherOTLPNativeUnittest, TestSerializeTraces) +UNIT_TEST_CASE(FlusherOTLPNativeUnittest, TestSerializeRawEvents) +UNIT_TEST_CASE(FlusherOTLPNativeUnittest, TestBuildGrpcRequest_Logs) +UNIT_TEST_CASE(FlusherOTLPNativeUnittest, TestBuildGrpcRequest_Metrics) +UNIT_TEST_CASE(FlusherOTLPNativeUnittest, TestBuildGrpcRequest_Traces) +UNIT_TEST_CASE(FlusherOTLPNativeUnittest, TestHandleGrpcCallback_Success) +UNIT_TEST_CASE(FlusherOTLPNativeUnittest, TestHandleGrpcCallback_Failure) +UNIT_TEST_CASE(FlusherOTLPNativeUnittest, TestSend_ReturnsTrueForEmpty) +UNIT_TEST_CASE(FlusherOTLPNativeUnittest, TestFlusherName) +UNIT_TEST_CASE(FlusherOTLPNativeUnittest, TestSinkType) + +} // namespace logtail + +UNIT_TEST_MAIN diff --git a/core/unittest/forward/CMakeLists.txt b/core/unittest/forward/CMakeLists.txt index 150b5199eb..55604bd6f5 100644 --- a/core/unittest/forward/CMakeLists.txt +++ b/core/unittest/forward/CMakeLists.txt @@ -24,7 +24,11 @@ target_link_libraries(loongsuite_forward_service_unittest ${UT_BASE_TARGET}) # add_executable(loongsuite_grpc_client_unittest LoongSuiteGrpcClientUnittest.cpp) # target_link_libraries(loongsuite_grpc_client_unittest ${UT_BASE_TARGET}) +add_executable(otlp_forward_service_unittest OTLPForwardServiceUnittest.cpp) +target_link_libraries(otlp_forward_service_unittest ${UT_BASE_TARGET}) + include(GoogleTest) gtest_discover_tests(grpc_input_manager_unittest) gtest_discover_tests(loongsuite_forward_service_unittest) -# gtest_discover_tests(loongsuite_grpc_client_unittest) \ No newline at end of file +# gtest_discover_tests(loongsuite_grpc_client_unittest) +gtest_discover_tests(otlp_forward_service_unittest) \ No newline at end of file diff --git a/core/unittest/forward/MockServiceImpl.h b/core/unittest/forward/MockServiceImpl.h index c6ac83ecab..c18007d705 100644 --- a/core/unittest/forward/MockServiceImpl.h +++ b/core/unittest/forward/MockServiceImpl.h @@ -27,6 +27,7 @@ class MockServiceImpl : public BaseService, public LoongSuiteForwardService::Cal bool Update(std::string configName, const Json::Value& config) override { return true; } bool Remove(std::string configName, const Json::Value& config) override { return true; } [[nodiscard]] const std::string& Name() const override { return sName; } + std::vector<::grpc::Service*> GetGrpcServices() override { return {this}; } grpc::ServerUnaryReactor* Forward(grpc::CallbackServerContext* context, const LoongSuiteForwardRequest* request, diff --git a/core/unittest/forward/OTLPForwardServiceUnittest.cpp b/core/unittest/forward/OTLPForwardServiceUnittest.cpp new file mode 100644 index 0000000000..9e1b2b7902 --- /dev/null +++ b/core/unittest/forward/OTLPForwardServiceUnittest.cpp @@ -0,0 +1,383 @@ +/* + * Copyright 2025 iLogtail Authors + * + * Licensed under the Apache License, Version 
2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "json/reader.h" + +#include "collection_pipeline/queue/ProcessQueueManager.h" +#include "collection_pipeline/queue/QueueKeyManager.h" +#include "forward/otlp/OTLPForwardService.h" +#include "logger/Logger.h" +#include "protobuf/opentelemetry/proto/collector/logs/v1/logs_service.grpc.pb.h" +#include "protobuf/opentelemetry/proto/collector/metrics/v1/metrics_service.grpc.pb.h" +#include "protobuf/opentelemetry/proto/collector/trace/v1/trace_service.grpc.pb.h" +#include "unittest/Unittest.h" + +using namespace std; +using namespace opentelemetry::proto::collector::logs::v1; +using namespace opentelemetry::proto::collector::metrics::v1; +using namespace opentelemetry::proto::collector::trace::v1; + +namespace logtail { + +class OTLPForwardServiceUnittest : public testing::Test { +public: + void SetUp() override { + mServerAddress = "0.0.0.0:15899"; + // Create a dummy queue for PushQueue to work + mQueueKey = QueueKeyManager::GetInstance()->GetKey("test-queue-key"); + CollectionPipelineContext dummyCtx; + ProcessQueueManager::GetInstance()->CreateOrUpdateCountBoundedQueue(mQueueKey, 0, dummyCtx); + } + + void TearDown() override { + ProcessQueueManager::GetInstance()->Clear(); + QueueKeyManager::GetInstance()->Clear(); + if (mServer) { + mServer->Shutdown(std::chrono::system_clock::now() + std::chrono::seconds(1)); + mServer->Wait(); + mServerThread.join(); + } + } + + bool StartMockServer() { + mOtlpService = std::make_unique(mServerAddress); + + grpc::ServerBuilder builder; + builder.AddListeningPort(mServerAddress, grpc::InsecureServerCredentials()); + for (auto* svc : mOtlpService->GetGrpcServices()) { + builder.RegisterService(svc); + } + mServer = builder.BuildAndStart(); + if (!mServer) { + return false; + } + + mServerThread = std::thread([this]() { mServer->Wait(); }); + std::this_thread::sleep_for(std::chrono::milliseconds(500)); + return true; + } + + bool CreateGrpcClient() { + auto channel = grpc::CreateChannel(mServerAddress, grpc::InsecureChannelCredentials()); + mLogsStub = LogsService::NewStub(channel); + mMetricsStub = MetricsService::NewStub(channel); + mTraceStub = TraceService::NewStub(channel); + return mLogsStub != nullptr; + } + + void TestOTLPLogsForward(); + void TestOTLPLogsForwardWithConfig(); + void TestOTLPLogsForwardEmptyRequest(); + void TestOTLPMetricsForward(); + void TestOTLPMetricsForwardWithConfig(); + void TestOTLPTracesForward(); + void TestOTLPTracesForwardWithConfig(); + void TestOTLPTracesForwardEmptyRequest(); + void TestOTLPConfigUpdate(); + +protected: + std::string mServerAddress; + std::unique_ptr mServer; + std::thread mServerThread; + std::unique_ptr mOtlpService; + + std::unique_ptr mLogsStub; + std::unique_ptr mMetricsStub; + std::unique_ptr mTraceStub; + QueueKey mQueueKey; +}; + +void OTLPForwardServiceUnittest::TestOTLPLogsForward() { + ASSERT_TRUE(StartMockServer()); + ASSERT_TRUE(CreateGrpcClient()); + + 
ExportLogsServiceRequest request; + auto* resourceLogs = request.add_resource_logs(); + auto* scopeLogs = resourceLogs->add_scope_logs(); + + // Add resource attributes + auto* attr = resourceLogs->mutable_resource()->add_attributes(); + attr->set_key("service.name"); + attr->mutable_value()->set_string_value("unittest"); + + // Add a log record + auto* logRecord = scopeLogs->add_log_records(); + logRecord->set_time_unix_nano(1748313840259486017ULL); + logRecord->mutable_body()->set_string_value("Test log message"); + logRecord->set_severity_text("INFO"); + + grpc::ClientContext context; + context.AddMetadata("x-otlp-apm-configname", "test-config"); + ExportLogsServiceResponse response; + + // No matching config -> should return NOT_FOUND, discarded counter incremented + grpc::Status status = mLogsStub->Export(&context, request, &response); + EXPECT_FALSE(status.ok()); + EXPECT_EQ(status.error_code(), grpc::StatusCode::NOT_FOUND); + APSARA_TEST_EQUAL(1U, mOtlpService->mDiscardedEventsTotal->GetValue()); +} + +void OTLPForwardServiceUnittest::TestOTLPLogsForwardWithConfig() { + ASSERT_TRUE(StartMockServer()); + ASSERT_TRUE(CreateGrpcClient()); + + // Configure the service + Json::Value config; + config["QueueKey"] = static_cast(mQueueKey); + config["InputIndex"] = 0; + ASSERT_TRUE(mOtlpService->Update("test-config", config)); + + ExportLogsServiceRequest request; + auto* resourceLogs = request.add_resource_logs(); + auto* scopeLogs = resourceLogs->add_scope_logs(); + + auto* attr = resourceLogs->mutable_resource()->add_attributes(); + attr->set_key("service.name"); + attr->mutable_value()->set_string_value("unittest"); + + auto* logRecord = scopeLogs->add_log_records(); + logRecord->set_time_unix_nano(1748313840259486017ULL); + logRecord->mutable_body()->set_string_value("Test log message"); + logRecord->set_severity_text("INFO"); + auto* attr2 = logRecord->add_attributes(); + attr2->set_key("key1"); + attr2->mutable_value()->set_string_value("value1"); + + grpc::ClientContext context; + context.AddMetadata("x-otlp-apm-configname", "test-config"); + ExportLogsServiceResponse response; + + grpc::Status status = mLogsStub->Export(&context, request, &response); + EXPECT_TRUE(status.ok()); + APSARA_TEST_EQUAL(1U, mOtlpService->mLogInEventsTotal->GetValue()); + APSARA_TEST_TRUE(mOtlpService->mLogInSizeBytes->GetValue() > 0); +} + +void OTLPForwardServiceUnittest::TestOTLPLogsForwardEmptyRequest() { + ASSERT_TRUE(StartMockServer()); + ASSERT_TRUE(CreateGrpcClient()); + + Json::Value config; + config["QueueKey"] = static_cast(mQueueKey); + config["InputIndex"] = 0; + ASSERT_TRUE(mOtlpService->Update("test-config", config)); + + // Empty request (no log records) + ExportLogsServiceRequest request; + auto* resourceLogs = request.add_resource_logs(); + resourceLogs->add_scope_logs(); + + grpc::ClientContext context; + context.AddMetadata("x-otlp-apm-configname", "test-config"); + ExportLogsServiceResponse response; + + grpc::Status status = mLogsStub->Export(&context, request, &response); + EXPECT_FALSE(status.ok()); + EXPECT_EQ(status.error_code(), grpc::StatusCode::INVALID_ARGUMENT); + APSARA_TEST_EQUAL(1U, mOtlpService->mLogInEventsTotal->GetValue()); +} + +void OTLPForwardServiceUnittest::TestOTLPMetricsForwardWithConfig() { + ASSERT_TRUE(StartMockServer()); + ASSERT_TRUE(CreateGrpcClient()); + + Json::Value config; + config["QueueKey"] = static_cast(mQueueKey); + config["InputIndex"] = 0; + ASSERT_TRUE(mOtlpService->Update("test-config", config)); + + // Gauge metric + 
ExportMetricsServiceRequest request; + auto* resourceMetrics = request.add_resource_metrics(); + auto* scopeMetrics = resourceMetrics->add_scope_metrics(); + auto* metric = scopeMetrics->add_metrics(); + metric->set_name("test.gauge.metric"); + auto* gauge = metric->mutable_gauge(); + auto* dp = gauge->add_data_points(); + dp->set_as_double(42.5); + dp->set_time_unix_nano(1748313840259486017ULL); + + grpc::ClientContext context; + context.AddMetadata("x-otlp-apm-configname", "test-config"); + ExportMetricsServiceResponse response; + + grpc::Status status = mMetricsStub->Export(&context, request, &response); + EXPECT_TRUE(status.ok()); + APSARA_TEST_EQUAL(1U, mOtlpService->mMetricInEventsTotal->GetValue()); +} + +void OTLPForwardServiceUnittest::TestOTLPTracesForwardWithConfig() { + ASSERT_TRUE(StartMockServer()); + ASSERT_TRUE(CreateGrpcClient()); + + Json::Value config; + config["QueueKey"] = static_cast(mQueueKey); + config["InputIndex"] = 0; + ASSERT_TRUE(mOtlpService->Update("test-config", config)); + + ExportTraceServiceRequest request; + auto* resourceSpans = request.add_resource_spans(); + auto* scopeSpans = resourceSpans->add_scope_spans(); + + auto* span = scopeSpans->add_spans(); + span->set_trace_id("unittest1234567890abcdef12345678"); + span->set_span_id("unittest12345678"); + span->set_parent_span_id("parent12345678901234"); + span->set_name("/unittest/api"); + span->set_kind(opentelemetry::proto::trace::v1::Span::SPAN_KIND_SERVER); + span->set_start_time_unix_nano(1748313840259486017ULL); + span->set_end_time_unix_nano(1748313840259765375ULL); + span->mutable_status()->set_code(opentelemetry::proto::trace::v1::Status::STATUS_CODE_OK); + auto* attr = span->add_attributes(); + attr->set_key("http.method"); + attr->mutable_value()->set_string_value("GET"); + + grpc::ClientContext context; + context.AddMetadata("x-otlp-apm-configname", "test-config"); + ExportTraceServiceResponse response; + + grpc::Status status = mTraceStub->Export(&context, request, &response); + EXPECT_TRUE(status.ok()); + APSARA_TEST_EQUAL(1U, mOtlpService->mTraceInEventsTotal->GetValue()); +} + +void OTLPForwardServiceUnittest::TestOTLPTracesForwardEmptyRequest() { + ASSERT_TRUE(StartMockServer()); + ASSERT_TRUE(CreateGrpcClient()); + + Json::Value config; + config["QueueKey"] = static_cast(mQueueKey); + config["InputIndex"] = 0; + ASSERT_TRUE(mOtlpService->Update("test-config", config)); + + ExportTraceServiceRequest request; + auto* resourceSpans = request.add_resource_spans(); + resourceSpans->add_scope_spans(); + + grpc::ClientContext context; + context.AddMetadata("x-otlp-apm-configname", "test-config"); + ExportTraceServiceResponse response; + + grpc::Status status = mTraceStub->Export(&context, request, &response); + EXPECT_FALSE(status.ok()); + EXPECT_EQ(status.error_code(), grpc::StatusCode::INVALID_ARGUMENT); + APSARA_TEST_EQUAL(1U, mOtlpService->mTraceInEventsTotal->GetValue()); +} + +void OTLPForwardServiceUnittest::TestOTLPMetricsForward() { + ASSERT_TRUE(StartMockServer()); + ASSERT_TRUE(CreateGrpcClient()); + + ExportMetricsServiceRequest request; + auto* resourceMetrics = request.add_resource_metrics(); + auto* scopeMetrics = resourceMetrics->add_scope_metrics(); + + auto* metric = scopeMetrics->add_metrics(); + metric->set_name("test.metric"); + auto* gauge = metric->mutable_gauge(); + auto* dp = gauge->add_data_points(); + dp->set_as_double(42.0); + + grpc::ClientContext context; + context.AddMetadata("x-otlp-apm-configname", "test-config"); + ExportMetricsServiceResponse response; + + // 
No matching config -> should return NOT_FOUND + grpc::Status status = mMetricsStub->Export(&context, request, &response); + EXPECT_FALSE(status.ok()); + EXPECT_EQ(status.error_code(), grpc::StatusCode::NOT_FOUND); + APSARA_TEST_EQUAL(1U, mOtlpService->mDiscardedEventsTotal->GetValue()); +} + +void OTLPForwardServiceUnittest::TestOTLPTracesForward() { + ASSERT_TRUE(StartMockServer()); + ASSERT_TRUE(CreateGrpcClient()); + + ExportTraceServiceRequest request; + auto* resourceSpans = request.add_resource_spans(); + auto* scopeSpans = resourceSpans->add_scope_spans(); + + auto* span = scopeSpans->add_spans(); + span->set_trace_id("unittest1234567890abcdef12345678"); + span->set_span_id("unittest12345678"); + span->set_parent_span_id("parent12345678901234"); + span->set_name("/unittest/api"); + span->set_kind(opentelemetry::proto::trace::v1::Span::SPAN_KIND_SERVER); + span->set_start_time_unix_nano(1748313840259486017ULL); + span->set_end_time_unix_nano(1748313840259765375ULL); + span->mutable_status()->set_code(opentelemetry::proto::trace::v1::Status::STATUS_CODE_OK); + + grpc::ClientContext context; + context.AddMetadata("x-otlp-apm-configname", "test-config"); + ExportTraceServiceResponse response; + + // No matching config -> should return NOT_FOUND, discarded counter incremented + grpc::Status status = mTraceStub->Export(&context, request, &response); + EXPECT_FALSE(status.ok()); + EXPECT_EQ(status.error_code(), grpc::StatusCode::NOT_FOUND); + APSARA_TEST_EQUAL(1U, mOtlpService->mDiscardedEventsTotal->GetValue()); +} + +void OTLPForwardServiceUnittest::TestOTLPConfigUpdate() { + auto service = std::make_unique("0.0.0.0:15899"); + + // Test valid config + Json::Value validConfig; + validConfig["QueueKey"] = 1; + validConfig["InputIndex"] = 0; + EXPECT_TRUE(service->Update("config1", validConfig)); + + // Test missing QueueKey + Json::Value invalidConfig1; + invalidConfig1["InputIndex"] = 0; + EXPECT_FALSE(service->Update("config2", invalidConfig1)); + + // Test missing InputIndex + Json::Value invalidConfig2; + invalidConfig2["QueueKey"] = 1; + EXPECT_FALSE(service->Update("config3", invalidConfig2)); + + // Test remove + Json::Value removeConfig; + EXPECT_TRUE(service->Remove("config1", removeConfig)); + + EXPECT_EQ(service->Name(), "OTLPForwardService"); +} + +UNIT_TEST_CASE(OTLPForwardServiceUnittest, TestOTLPLogsForward) +UNIT_TEST_CASE(OTLPForwardServiceUnittest, TestOTLPLogsForwardWithConfig) +UNIT_TEST_CASE(OTLPForwardServiceUnittest, TestOTLPLogsForwardEmptyRequest) +UNIT_TEST_CASE(OTLPForwardServiceUnittest, TestOTLPMetricsForward) +UNIT_TEST_CASE(OTLPForwardServiceUnittest, TestOTLPMetricsForwardWithConfig) +UNIT_TEST_CASE(OTLPForwardServiceUnittest, TestOTLPTracesForward) +UNIT_TEST_CASE(OTLPForwardServiceUnittest, TestOTLPTracesForwardWithConfig) +UNIT_TEST_CASE(OTLPForwardServiceUnittest, TestOTLPTracesForwardEmptyRequest) +UNIT_TEST_CASE(OTLPForwardServiceUnittest, TestOTLPConfigUpdate) + +} // namespace logtail + +UNIT_TEST_MAIN diff --git a/core/unittest/pipeline/GrpcSinkMock.h b/core/unittest/pipeline/GrpcSinkMock.h new file mode 100644 index 0000000000..ef5488ddf8 --- /dev/null +++ b/core/unittest/pipeline/GrpcSinkMock.h @@ -0,0 +1,137 @@ +/* + * Copyright 2025 iLogtail Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include + +#include "logger/Logger.h" +#include "plugin/flusher/opentelemetry/FlusherOTLPNative.h" +#include "runner/FlusherRunner.h" +#include "runner/sink/grpc/GrpcSink.h" + +namespace logtail { + +class GrpcSinkMock : public GrpcSink { +public: + GrpcSinkMock(const GrpcSinkMock&) = delete; + GrpcSinkMock& operator=(const GrpcSinkMock&) = delete; + + static GrpcSinkMock* GetInstance() { + static GrpcSinkMock instance; + return &instance; + } + + bool Init() override { + mIsFlush = false; + mThreadRes = std::async(std::launch::async, &GrpcSinkMock::Run, this); + return true; + } + + void Stop() override { + mIsFlush = true; + mCV.notify_all(); + if (!mThreadRes.valid()) { + return; + } + std::future_status s = mThreadRes.wait_for(std::chrono::seconds(1)); + if (s == std::future_status::ready) { + LOG_INFO(sLogger, ("grpc sink mock", "stopped successfully")); + } else { + LOG_WARNING(sLogger, ("grpc sink mock", "forced to stopped")); + } + ClearRequests(); + } + + void AddRequest(std::unique_ptr&& ctx) override { + std::lock_guard lock(mMutex); + mPendingRequests.push_back(std::move(ctx)); + mCV.notify_one(); + } + + void Run() { + LOG_INFO(sLogger, ("grpc sink mock", "started")); + while (true) { + std::vector> items; + { + std::unique_lock lock(mMutex); + if (mPendingRequests.empty()) { + if (mIsFlush) { + break; + } + mCV.wait_for(lock, std::chrono::milliseconds(100)); + continue; + } + items = std::move(mPendingRequests); + } + + for (auto& ctx : items) { + auto* rawCtx = ctx.release(); + auto* flusher = static_cast(rawCtx->item->mFlusher); + + grpc::Status status(grpc::StatusCode::OK, "OK"); + { + std::lock_guard lock(mRequestsMutex); + mRequests.push_back(*rawCtx->item); + } + + // Match real GrpcSink behavior: IncInFlight before dispatch + flusher->IncInFlight(); + + // Simulate async callback + rawCtx->context = std::make_unique(); + flusher->HandleGrpcCallback(std::move(status), rawCtx); + } + + if (mIsFlush) { + std::unique_lock lock(mMutex); + if (mPendingRequests.empty()) { + break; + } + } + } + } + + std::vector GetRequests() { + std::lock_guard lock(mRequestsMutex); + return mRequests; + } + + void ClearRequests() { + std::lock_guard lock(mRequestsMutex); + mRequests.clear(); + } + +private: + GrpcSinkMock() = default; + ~GrpcSinkMock() = default; + + std::atomic_bool mIsFlush{false}; + std::mutex mMutex; + std::condition_variable mCV; + std::vector> mPendingRequests; + + std::mutex mRequestsMutex; + std::vector mRequests; + + friend class GrpcSink; +}; + +} // namespace logtail diff --git a/core/unittest/sender/CMakeLists.txt b/core/unittest/sender/CMakeLists.txt index 9477c01be6..b19b8dbb94 100644 --- a/core/unittest/sender/CMakeLists.txt +++ b/core/unittest/sender/CMakeLists.txt @@ -18,5 +18,13 @@ project(sender_unittest) add_executable(flusher_runner_unittest FlusherRunnerUnittest.cpp) target_link_libraries(flusher_runner_unittest ${UT_BASE_TARGET}) +if(LINUX) + add_executable(flusher_runner_grpc_unittest FlusherRunnerGrpcUnittest.cpp) + target_link_libraries(flusher_runner_grpc_unittest 
${UT_BASE_TARGET}) +endif() + include(GoogleTest) gtest_discover_tests(flusher_runner_unittest) +if(LINUX) + gtest_discover_tests(flusher_runner_grpc_unittest) +endif() diff --git a/core/unittest/sender/FlusherRunnerGrpcUnittest.cpp b/core/unittest/sender/FlusherRunnerGrpcUnittest.cpp new file mode 100644 index 0000000000..3ae6e44f00 --- /dev/null +++ b/core/unittest/sender/FlusherRunnerGrpcUnittest.cpp @@ -0,0 +1,152 @@ +/* + * Copyright 2025 iLogtail Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Integration test: PushToGrpcSink -> GrpcSink path via mock. + +#include + +#include +#include +#include +#include +#include + +#include "collection_pipeline/CollectionPipeline.h" +#include "collection_pipeline/CollectionPipelineContext.h" +#include "common/memory/SourceBuffer.h" +#include "models/LogEvent.h" +#include "models/MetricEvent.h" +#include "models/MetricValue.h" +#include "models/PipelineEventGroup.h" +#include "models/SpanEvent.h" +#include "plugin/flusher/opentelemetry/FlusherOTLPNative.h" +#include "runner/FlusherRunner.h" +#include "unittest/Unittest.h" +#include "unittest/pipeline/GrpcSinkMock.h" + +using namespace std; + +namespace logtail { + +class PushToGrpcSinkUnittest : public testing::Test { +public: + void SetUp() override { + mServerAddress = "0.0.0.0:15902"; + ctx.SetConfigName("test_config"); + ctx.SetPipeline(pipeline); + // Initialize the GrpcSinkMock + GrpcSinkMock::GetInstance()->Init(); + } + + void TearDown() override { GrpcSinkMock::GetInstance()->Stop(); } + + void TestPushToGrpcSink_Success(); + void TestPushToGrpcSink_BuildRequestFailure(); + void TestSinkTypeIsGrpc(); + +protected: + string mServerAddress; + CollectionPipeline pipeline; + CollectionPipelineContext ctx; + + unique_ptr CreateAndInitFlusher() { + auto flusher = make_unique(); + flusher->SetContext(ctx); + flusher->CreateMetricsRecordRef(FlusherOTLPNative::sName, "1"); + + Json::Value config; + config["Endpoint"] = mServerAddress; + config["TimeoutMs"] = 5000; + Json::Value optionalGoPipeline; + + if (!flusher->Init(config, optionalGoPipeline)) { + return nullptr; + } + flusher->CommitMetricsRecordRef(); + + if (!flusher->Start()) { + return nullptr; + } + return flusher; + } + + PipelineEventGroup MakeLogGroup() { + auto group = PipelineEventGroup(make_shared()); + auto* logEvent = group.AddLogEvent(true); + logEvent->SetTimestamp(1748313840, 259486017); + logEvent->SetContent(string("message"), string("Integration test log")); + group.SetTag(string("service.name"), string("integration-test")); + return group; + } +}; + +void PushToGrpcSinkUnittest::TestPushToGrpcSink_Success() { + auto flusher = CreateAndInitFlusher(); + APSARA_TEST_TRUE(flusher != nullptr); + + auto group = MakeLogGroup(); + string data; + string errMsg; + APSARA_TEST_TRUE(flusher->SerializeLogsToOTLP(group, data, errMsg)); + + // Create item that PushToGrpcSink will consume + auto item = make_unique( + std::move(data), data.size(), flusher.get(), 1, OTLPGrpcCallContext::DataType::Logs); + 
SenderQueueItem* rawItem = item.get(); + + // Call PushToGrpcSink directly + bool result = FlusherRunner::GetInstance()->PushToGrpcSink(rawItem, false); + APSARA_TEST_TRUE(result); + + // Wait for mock to process + this_thread::sleep_for(chrono::milliseconds(500)); + + // Verify mock captured the request + auto requests = GrpcSinkMock::GetInstance()->GetRequests(); + APSARA_TEST_TRUE(requests.size() >= 1); + + // Verify in-flight returned to 0 + APSARA_TEST_EQUAL(flusher->InFlightCount(), 0); +} + +void PushToGrpcSinkUnittest::TestPushToGrpcSink_BuildRequestFailure() { + auto flusher = CreateAndInitFlusher(); + APSARA_TEST_TRUE(flusher != nullptr); + + // Create an item with invalid data + string badData = "not-valid-protobuf"; + auto item = make_unique( + std::move(badData), badData.size(), flusher.get(), 1, OTLPGrpcCallContext::DataType::Logs); + SenderQueueItem* rawItem = item.get(); + + // BuildGrpcRequest should fail on invalid protobuf + bool result = FlusherRunner::GetInstance()->PushToGrpcSink(rawItem, false); + APSARA_TEST_FALSE(result); +} + +void PushToGrpcSinkUnittest::TestSinkTypeIsGrpc() { + auto flusher = CreateAndInitFlusher(); + APSARA_TEST_TRUE(flusher != nullptr); + APSARA_TEST_TRUE(flusher->GetSinkType() == SinkType::GRPC); +} + +UNIT_TEST_CASE(PushToGrpcSinkUnittest, TestPushToGrpcSink_Success) +UNIT_TEST_CASE(PushToGrpcSinkUnittest, TestPushToGrpcSink_BuildRequestFailure) +UNIT_TEST_CASE(PushToGrpcSinkUnittest, TestSinkTypeIsGrpc) + +} // namespace logtail + +UNIT_TEST_MAIN diff --git a/docker/Dockerfile_development_part b/docker/Dockerfile_development_part index 06cb0678fd..8383263a56 100644 --- a/docker/Dockerfile_development_part +++ b/docker/Dockerfile_development_part @@ -18,30 +18,36 @@ ARG HOST_OS=Linux ARG VERSION=0.0.1 USER root -WORKDIR /loongcollector +ENV container=docker -RUN mkdir -p /loongcollector/conf/instance_config/local -RUN mkdir -p /loongcollector/log -RUN mkdir -p /loongcollector/data -RUN mkdir -p /loongcollector/run +RUN yum update -y && yum -y install systemd initscripts && yum -y clean all && rm -fr /var/cache -COPY --from=build /src/core/build/loongcollector /loongcollector/ +RUN mkdir -p /usr/local/loongcollector/conf/instance_config/local +RUN mkdir -p /usr/local/loongcollector/log +RUN mkdir -p /usr/local/loongcollector/data +RUN mkdir -p /usr/local/loongcollector/run + +COPY --from=build /src/core/build/loongcollector /usr/local/loongcollector/ +COPY ./scripts/loongcollector_control.sh /usr/local/loongcollector/ COPY ./scripts/download_ebpflib.sh /tmp/ -RUN chown -R $(whoami) /loongcollector && \ - chmod 755 /loongcollector/loongcollector && \ - mkdir /loongcollector/data/checkpoint && \ - if [ `uname -m` = "x86_64" ]; then /tmp/download_ebpflib.sh /loongcollector; fi && \ +RUN chown -R $(whoami) /usr/local/loongcollector && \ + chmod 755 /usr/local/loongcollector/loongcollector && \ + mkdir -p /usr/local/loongcollector/data/checkpoint && \ + if [ `uname -m` = "x86_64" ]; then /tmp/download_ebpflib.sh /usr/local/loongcollector; fi && \ rm /tmp/download_ebpflib.sh -COPY --from=build /src/output/libGoPluginBase.so /loongcollector/ -COPY --from=build /src/example_config/quick_start/loongcollector_config.json /loongcollector/conf/instance_config/local/loongcollector_config.json -COPY --from=build /src/core/build/go_pipeline/libGoPluginAdapter.so /loongcollector/ -COPY --from=build /src/core/build/ebpf/driver/libeBPFDriver.so /loongcollector/ +COPY --from=build /src/output/libGoPluginBase.so /usr/local/loongcollector/ +COPY --from=build 
/src/example_config/quick_start/loongcollector_config.json /usr/local/loongcollector/conf/instance_config/local/loongcollector_config.json +COPY --from=build /src/core/build/go_pipeline/libGoPluginAdapter.so /usr/local/loongcollector/ +COPY --from=build /src/core/build/ebpf/driver/libeBPFDriver.so /usr/local/loongcollector/ -ENV HOST_OS=$HOST_OS -ENV LOGTAIL_VERSION=$VERSION +ENV HOST_OS=$HOST_OS \ + LOONGCOLLECTOR_VERSION=$VERSION \ + HTTP_PROBE_PORT=7953 \ + ALIYUN_LOGTAIL_USER_DEFINED_ID=default \ + docker_file_cache_path=checkpoint/docker_path_config.json EXPOSE 18689 -ENTRYPOINT ["/loongcollector/loongcollector"] +CMD ["/usr/local/loongcollector/loongcollector_control.sh", "start_and_block"] diff --git a/docs/cn/plugins/flusher/flushers.md b/docs/cn/plugins/flusher/flushers.md index 1c07d31a03..ef78c35045 100644 --- a/docs/cn/plugins/flusher/flushers.md +++ b/docs/cn/plugins/flusher/flushers.md @@ -22,6 +22,8 @@ LoongCollector 提供两类输出插件: | `flusher_blackhole`
[黑洞](native/flusher-blackhole.md) | SLS 官方 | 丢弃事件,常用于压测或占位。 | | `flusher_file`
[本地文件](native/flusher-file.md) | SLS 官方 | 将数据写入本地文件(如自监控指标落盘)。 | | `flusher_kafka_native`
[Kafka](native/flusher-kafka.md) | [ChaoEcho](https://github.com/ChaoEcho) | 使用 C++ 实现将数据输出到 Kafka。 |
+| `flusher_otlp_native`
[OTLP gRPC](native/flusher-otlp.md) | SLS 官方 | 通过 OTLP/gRPC 协议发送 Logs/Metrics/Traces 到 OTel Collector。 |
+| `flusher_otlp_http_native`
[OTLP HTTP](native/flusher-otlp-http.md) | SLS 官方 | 通过 OTLP/HTTP 协议发送 Logs/Metrics/Traces 到 OTel Collector。 | | `flusher_sls`
[SLS](native/flusher-sls.md) | SLS 官方 | 将数据写入阿里云日志服务(SLS)。 | | `router`
[多 Flusher 路由](native/router.md) | SLS 官方 | 在原生处理链路与支持的 Flusher 上按事件类型或 Tag 分流。 | diff --git a/docs/cn/plugins/flusher/native/flusher-otlp-http.md b/docs/cn/plugins/flusher/native/flusher-otlp-http.md new file mode 100644 index 0000000000..a084fa2b2f --- /dev/null +++ b/docs/cn/plugins/flusher/native/flusher-otlp-http.md @@ -0,0 +1,88 @@ +# OTLP (HTTP) + +## 简介 + +`flusher_otlp_http_native` `flusher`插件将内部 `PipelineEventGroup` 序列化为 OTLP 格式,通过 HTTP POST 发送至 OTel Collector 的 HTTP 端点。支持 `json` 和 `protobuf` 两种序列化格式。[源代码](https://github.com/alibaba/loongcollector/blob/main/core/plugin/flusher/opentelemetry/FlusherOTLPHttpNative.h) + +## 版本 + +[Alpha](../../stability-level.md) + +## 版本说明 + +* 推荐版本:LoongCollector v3.0.5 及以上 + +## 配置参数 + +| 参数 | 类型,默认值 | 说明 | +| - | - | - | +| Type | String,无默认值(必填) | 插件类型,固定为`flusher_otlp_http_native`。 | +| Url | String,无默认值(必填) | 目标 OTel Collector HTTP URL,例如`http://localhost:4318/v1/logs`。 | +| Format | String,`protobuf` | 序列化格式,可选值:`protobuf`(`application/x-protobuf`)、`json`(`application/json`)。 | +| EnableTLS | Bool,false | 是否启用 TLS 加密连接。 | +| Headers | Object,无 | 额外的 HTTP 自定义 Headers,键值对格式。 | + +## 数据流 + +``` +PipelineEventGroup (in) + → Send() + → Format=protobuf: SerializeAndPushProtobuf() + → OTLPEventGroupSerializer::SerializeToBinaryString() + → PipelineEventGroup → OTLP protobuf binary + → Format=json: SerializeAndPush() + → OTLPEventGroupSerializer::DoSerialize() + → PipelineEventGroup → OTLP protobuf → JSON string + → SenderQueueItem 创建 + PushToQueue + → HttpSink 消费队列 + → BuildRequest() 构造 HTTP POST (Content-Type: application/x-protobuf 或 application/json) + → OnSendDone() 处理响应 + → 2xx: 释放数据 + → 非 2xx: 保留重试 +``` + +## 样例 + +使用 protobuf 格式(默认)发送到 OTel Collector: + +```yaml +enable: true +inputs: + - Type: input_forward + Protocol: OTLP + Endpoint: 0.0.0.0:4316 +flushers: + - Type: flusher_otlp_http_native + Url: http://localhost:4318/v1/logs +``` + +使用 JSON 格式发送到 OTel Collector: + +```yaml +enable: true +inputs: + - Type: input_forward + Protocol: OTLP + Endpoint: 0.0.0.0:4316 +flushers: + - Type: flusher_otlp_http_native + Url: http://localhost:4318/v1/logs + Format: json +``` + +发送到 OTel Collector 并添加自定义 Headers 和 TLS: + +```yaml +enable: true +inputs: + - Type: input_file + FilePaths: + - /var/log/app/*.log +flushers: + - Type: flusher_otlp_http_native + Url: https://collector.example.com/v1/logs + EnableTLS: true + Headers: + Authorization: Bearer your-token-here + X-Custom-Header: custom-value +``` diff --git a/docs/cn/plugins/flusher/native/flusher-otlp.md b/docs/cn/plugins/flusher/native/flusher-otlp.md new file mode 100644 index 0000000000..f322e57a1b --- /dev/null +++ b/docs/cn/plugins/flusher/native/flusher-otlp.md @@ -0,0 +1,91 @@ +# OTLP (gRPC) + +## 简介 + +`flusher_otlp_native` `flusher`插件将内部 `PipelineEventGroup` 中的 Log、Metric、Span 事件通过 OTLP/gRPC 协议发送至外部 OTel Collector 或兼容端点。支持 Logs、Metrics、Traces 三种信号类型以及 RawEvent(作为 Log 发送)。[源代码](https://github.com/alibaba/loongcollector/blob/main/core/plugin/flusher/opentelemetry/FlusherOTLPNative.h) + +## 版本 + +[Alpha](../../stability-level.md) + +## 版本说明 + +* 推荐版本:LoongCollector v3.0.5 及以上 + +## 配置参数 + +| 参数 | 类型,默认值 | 说明 | +| - | - | - | +| Type | String,无默认值(必填) | 插件类型,固定为`flusher_otlp_native`。 | +| Endpoint | String,无默认值(必填) | 目标 OTel Collector gRPC 地址,格式为`host:port`,例如`localhost:4317`。 | +| TimeoutMs | Int,30000 | gRPC 请求超时时间,单位为毫秒。 | +| EnableTLS | Bool,false | 是否启用 TLS 加密连接。 | +| Headers | Object,无 | 额外的 gRPC 自定义 Headers,键值对格式。 | + +## 数据映射 + +### Logs + +| 内部 LogEvent 字段 | OTLP LogRecord 字段 | 
+|---|---| +| `GetTimestamp()` + `GetTimestampNanosecond()` | `set_time_unix_nano` | +| `GetContent("content")` | `mutable_body()->set_string_value` | +| `GetLevel()` | `set_severity_text` | +| 遍历 Content | `add_attributes()` | +| PipelineEventGroup Tags(真 tag) | `resource.attributes[]` | + +### Metrics + +| 内部 MetricEvent 字段 | OTLP Metric 字段 | +|---|---| +| `GetName()` | `set_name` | +| `GetValue()->mValue` | `mutable_gauge()->add_data_points()->set_as_double` | +| Tags | `data_points[].attributes[]` | + +### Traces + +| 内部 SpanEvent 字段 | OTLP Span 字段 | +|---|---| +| `GetTraceId()/GetSpanId()/GetParentSpanId()` | `set_trace_id/set_span_id/set_parent_span_id` | +| `GetStartTimeNs()/GetEndTimeNs()` | `set_start_time_unix_nano/set_end_time_unix_nano` | +| `GetStatus()` | `mutable_status()->set_code` | +| `GetKind()` | `set_kind` | +| Tags | `add_attributes()` | +| PipelineEventGroup Tags(真 tag) | `resource.attributes[]` | + +### RawEvents + +RawEvent 会被转换为 LogRecord 发送,其中 `GetContent()` 作为 log body。 + +## 样例 + +接收 OTLP 数据并转发到另一个 OTel Collector: + +```yaml +enable: true +inputs: + - Type: input_forward + Protocol: OTLP + Endpoint: 0.0.0.0:4316 +flushers: + - Type: flusher_otlp_native + Endpoint: localhost:4317 + TimeoutMs: 5000 + EnableTLS: false +``` + +发送日志到 OTel Collector 并添加自定义 Headers: + +```yaml +enable: true +inputs: + - Type: input_file + FilePaths: + - /var/log/app/*.log +flushers: + - Type: flusher_otlp_native + Endpoint: collector.example.com:4317 + EnableTLS: true + Headers: + Authorization: Bearer your-token-here +``` diff --git a/docs/cn/plugins/input/native/input-forward.md b/docs/cn/plugins/input/native/input-forward.md index 4c9440cc27..b465ded582 100644 --- a/docs/cn/plugins/input/native/input-forward.md +++ b/docs/cn/plugins/input/native/input-forward.md @@ -2,7 +2,7 @@ ## 简介 -`input_forward` 插件用于接收来自其他系统的数据转发请求,目前支持LoongSuite协议。该插件可以作为数据转发的接收端,通过配置的匹配规则来处理接收到的数据。[源代码](https://github.com/alibaba/loongcollector/blob/main/core/plugin/input/InputForward.h) +`input_forward` 插件用于接收来自其他系统的数据转发请求,支持 LoongSuite 和 OTLP 协议。该插件可以作为数据转发的接收端,通过配置的匹配规则来处理接收到的数据。[源代码](https://github.com/alibaba/loongcollector/blob/main/core/plugin/input/InputForward.h) ## 版本 @@ -17,7 +17,7 @@ | 参数 | 类型,默认值 | 说明 | | - | - | - | | Type | String,无默认值(必填) | 插件类型,固定为`input_forward`。 | -| Protocol | String,无默认值(必填) | 转发协议类型。目前支持:`LoongSuite`。 | +| Protocol | String,无默认值(必填) | 转发协议类型。目前支持:`LoongSuite`、`OTLP`。 | | Endpoint | String,无默认值(必填) | 监听地址和端口,格式为`IP:PORT`,例如`0.0.0.0:7899`。或者本地通信socket,例如`/root/loongcollector.sock`。 | ## 转发规则 @@ -68,3 +68,77 @@ flushers: "__time__": "1642502400" } ``` + +### OTLP + +接收标准 OpenTelemetry Protocol (OTLP/gRPC) 的 Logs、Metrics、Traces 数据,将其转换为内部 `PipelineEventGroup` 格式并推入采集管道。 + +#### 数据映射 + +**Logs:** +- `time_unix_nano` → LogEvent 时间戳(秒 + 亚秒纳秒) +- `body.string_value` → `SetContent("content", ...)` +- `severity_text` → `SetContent("severity", ...)` +- `trace_id`, `span_id` → `SetContent("trace_id"/"span_id", ...)` +- `log_record.attributes[]` → `SetContent(key, value)` +- `scope.attributes[]` → `SetContent("__tag__:" + key, value)`(假 tag,scope 层不常用) +- `resource.attributes[]` → `eventGroup.SetTag(key, value)`(真 tag) + +**Metrics:** +- `name` → `SetName(...)` +- `gauge/sum/histogram.data_points[].as_double()` → `SetValue(...)` +- `scope.attributes[]` → `SetTag("__tag__:" + key, value)`(假 tag) +- `resource.attributes[]` → `eventGroup.SetTag(key, value)`(真 tag) + +**Traces:** +- `trace_id`, `span_id`, `parent_span_id` → `SetTraceId/SetSpanId/SetParentSpanId` +- `name` → `SetName` +- 
`start_time_unix_nano`, `end_time_unix_nano` → `SetStartTimeNs/SetEndTimeNs`
+- `status.code` → `SetStatus(Ok/Error/Unset)`
+- `kind` → `SetKind(Client/Server/...)`
+- `span.attributes[]` → `SetTag(key, value)`
+- `scope.attributes[]` → `SetTag("__tag__:" + key, value)`(假 tag)
+- `resource.attributes[]` → `eventGroup.SetTag(key, value)`(真 tag)
+
+#### 配置匹配
+
+服务通过 gRPC metadata 中的 `x-otlp-apm-configname` 字段匹配采集配置。如果仅注册了一个配置(如 onetime pipeline 场景),则自动匹配该配置。
+
+返回状态码:
+* `OK`:转发成功
+* `NOT_FOUND`:没有匹配的采集配置
+* `INVALID_ARGUMENT`:请求参数错误(如空请求体)
+* `UNAVAILABLE`:队列已满,请重试
+
+#### 样例
+
+监听 4316 端口的 OTLP gRPC 请求,将数据转发到下游处理:
+
+```yaml
+enable: true
+inputs:
+  - Type: input_forward
+    Protocol: OTLP
+    Endpoint: 0.0.0.0:4316
+flushers:
+  - Type: flusher_sls
+    Project: "your-project"
+    Logstore: "your-logstore"
+    Region: cn-shanghai
+    Endpoint: cn-shanghai.log.aliyuncs.com
+```
+
+结合 `flusher_otlp_native` 实现 OTLP 接收-转发全链路:
+
+```yaml
+enable: true
+inputs:
+  - Type: input_forward
+    Protocol: OTLP
+    Endpoint: 0.0.0.0:4316
+flushers:
+  - Type: flusher_otlp_native
+    Endpoint: localhost:4317
+    TimeoutMs: 5000
+    EnableTLS: false
+```
diff --git a/test/e2e/test_cases/flusher_otlp_http_native/case.feature b/test/e2e/test_cases/flusher_otlp_http_native/case.feature
new file mode 100644
index 0000000000..bbdaf9e277
--- /dev/null
+++ b/test/e2e/test_cases/flusher_otlp_http_native/case.feature
@@ -0,0 +1,66 @@
+@flusher
+Feature: flusher OTLP HTTP native
+  Test flusher_otlp_http_native sending OTLP data via HTTP to an OTel Collector
+
+  @e2e @docker-compose
+  Scenario: TestFlusherOTLPHttpNativeLogs
+    Given {docker-compose} environment
+    Given {otlp-http-input-forward-case} local config as below
+    """
+    enable: true
+    inputs:
+      - Type: input_forward
+        Protocol: OTLP
+        Endpoint: "0.0.0.0:4320"
+    flushers:
+      - Type: flusher_otlp_http_native
+        Url: "http://otel-collector:4318/v1/logs"
+        Format: "protobuf"
+    """
+    When start docker-compose {flusher_otlp_http_native}
+    Then wait {10} seconds
+    When generate {1} OTLP {logs} via otelgen to endpoint {loongcollectorC:4320}, protocol {grpc}
+    Then wait {5} seconds
+    Then otlp collector received at least {1} logs from file {/tmp/otel-export/logs.json}
+
+  @e2e @docker-compose
+  Scenario: TestFlusherOTLPHttpNativeMetrics
+    Given {docker-compose} environment
+    Given {otlp-http-metric-forward-case} local config as below
+    """
+    enable: true
+    inputs:
+      - Type: input_forward
+        Protocol: OTLP
+        Endpoint: "0.0.0.0:4321"
+    flushers:
+      - Type: flusher_otlp_http_native
+        Url: "http://otel-collector:4318/v1/metrics"
+        Format: "protobuf"
+    """
+    When start docker-compose {flusher_otlp_http_native}
+    Then wait {10} seconds
+    When generate {1} OTLP {metrics} via otelgen to endpoint {loongcollectorC:4321}, protocol {grpc}
+    Then wait {5} seconds
+    Then otlp collector received at least {1} metrics from file {/tmp/otel-export/metrics.json}
+
+  @e2e @docker-compose
+  Scenario: TestFlusherOTLPHttpNativeTraces
+    Given {docker-compose} environment
+    Given {otlp-http-trace-forward-case} local config as below
+    """
+    enable: true
+    inputs:
+      - Type: input_forward
+        Protocol: OTLP
+        Endpoint: "0.0.0.0:4322"
+    flushers:
+      - Type: flusher_otlp_http_native
+        Url: "http://otel-collector:4318/v1/traces"
+        Format: "protobuf"
+    """
+    When start docker-compose {flusher_otlp_http_native}
+    Then wait {10} seconds
+    When generate {1} OTLP {traces} via otelgen to endpoint {loongcollectorC:4322}, protocol {grpc}
+    Then wait {5} seconds
+    Then otlp collector received at least {1} traces from file
{/tmp/otel-export/traces.json} diff --git a/test/e2e/test_cases/flusher_otlp_http_native/docker-compose.yaml b/test/e2e/test_cases/flusher_otlp_http_native/docker-compose.yaml new file mode 100644 index 0000000000..949ed71d1e --- /dev/null +++ b/test/e2e/test_cases/flusher_otlp_http_native/docker-compose.yaml @@ -0,0 +1,31 @@ +# Copyright 2025 iLogtail Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +services: + otel-collector: + image: otel/opentelemetry-collector-contrib:latest + hostname: otel-collector + user: "0:0" + ports: + - "4317" + - "4318" + volumes: + - ./otel-collector-config.yaml:/etc/otelcol-contrib/config.yaml + - ./otel-export:/tmp/otel-export + healthcheck: + test: ["CMD", "wget", "--spider", "-q", "http://localhost:13133/"] + interval: 5s + timeout: 3s + retries: 5 + start_period: 10s diff --git a/test/e2e/test_cases/flusher_otlp_http_native/otel-collector-config.yaml b/test/e2e/test_cases/flusher_otlp_http_native/otel-collector-config.yaml new file mode 100644 index 0000000000..3f9566ddfa --- /dev/null +++ b/test/e2e/test_cases/flusher_otlp_http_native/otel-collector-config.yaml @@ -0,0 +1,48 @@ +# Copyright 2025 iLogtail Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
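+
+# The file exporters below write one JSON object per export batch (JSON Lines);
+# the e2e verification step reads these files from the bind-mounted otel-export
+# directory and counts the resourceLogs / resourceMetrics / resourceSpans entries.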
+ +receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 + +exporters: + debug: + verbosity: detailed + file/logs: + path: /tmp/otel-export/logs.json + file/metrics: + path: /tmp/otel-export/metrics.json + file/traces: + path: /tmp/otel-export/traces.json + +extensions: + health_check: + endpoint: 0.0.0.0:13133 + +service: + extensions: [health_check] + pipelines: + logs: + receivers: [otlp] + exporters: [debug, file/logs] + metrics: + receivers: [otlp] + exporters: [debug, file/metrics] + traces: + receivers: [otlp] + exporters: [debug, file/traces] diff --git a/test/e2e/test_cases/flusher_otlp_native/case.feature b/test/e2e/test_cases/flusher_otlp_native/case.feature new file mode 100644 index 0000000000..d4719398af --- /dev/null +++ b/test/e2e/test_cases/flusher_otlp_native/case.feature @@ -0,0 +1,69 @@ +@flusher +Feature: flusher OTLP native gRPC + Test flusher_otlp_native sending OTLP data via gRPC to an OTel Collector + + @e2e @docker-compose + Scenario: TestFlusherOTLPNativeLogs + Given {docker-compose} environment + Given {otlp-grpc-input-forward-case} local config as below + """ + enable: true + inputs: + - Type: input_forward + Protocol: OTLP + Endpoint: "0.0.0.0:4320" + flushers: + - Type: flusher_otlp_native + Endpoint: "otel-collector:4317" + TimeoutMs: 5000 + EnableTLS: false + """ + When start docker-compose {flusher_otlp_native} + Then wait {10} seconds + When generate {1} OTLP {logs} via otelgen to endpoint {loongcollectorC:4320}, protocol {grpc} + Then wait {5} seconds + Then otlp collector received at least {1} logs from file {/tmp/otel-export/logs.json} + + @e2e @docker-compose + Scenario: TestFlusherOTLPNativeMetrics + Given {docker-compose} environment + Given {otlp-grpc-metric-forward-case} local config as below + """ + enable: true + inputs: + - Type: input_forward + Protocol: OTLP + Endpoint: "0.0.0.0:4321" + flushers: + - Type: flusher_otlp_native + Endpoint: "otel-collector:4317" + TimeoutMs: 5000 + EnableTLS: false + """ + When start docker-compose {flusher_otlp_native} + Then wait {10} seconds + When generate {1} OTLP {metrics} via otelgen to endpoint {loongcollectorC:4321}, protocol {grpc} + Then wait {5} seconds + Then otlp collector received at least {1} metrics from file {/tmp/otel-export/metrics.json} + + @e2e @docker-compose + Scenario: TestFlusherOTLPNativeTraces + Given {docker-compose} environment + Given {otlp-grpc-trace-forward-case} local config as below + """ + enable: true + inputs: + - Type: input_forward + Protocol: OTLP + Endpoint: "0.0.0.0:4322" + flushers: + - Type: flusher_otlp_native + Endpoint: "otel-collector:4317" + TimeoutMs: 5000 + EnableTLS: false + """ + When start docker-compose {flusher_otlp_native} + Then wait {10} seconds + When generate {1} OTLP {traces} via otelgen to endpoint {loongcollectorC:4322}, protocol {grpc} + Then wait {5} seconds + Then otlp collector received at least {1} traces from file {/tmp/otel-export/traces.json} diff --git a/test/e2e/test_cases/flusher_otlp_native/docker-compose.yaml b/test/e2e/test_cases/flusher_otlp_native/docker-compose.yaml new file mode 100644 index 0000000000..9d8fa45fb3 --- /dev/null +++ b/test/e2e/test_cases/flusher_otlp_native/docker-compose.yaml @@ -0,0 +1,30 @@ +# Copyright 2025 iLogtail Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +services: + otel-collector: + image: otel/opentelemetry-collector-contrib:latest + hostname: otel-collector + user: "0:0" + ports: + - "4317" + volumes: + - ./otel-collector-config.yaml:/etc/otelcol-contrib/config.yaml + - ./otel-export:/tmp/otel-export + healthcheck: + test: ["CMD", "wget", "--spider", "-q", "http://localhost:13133/"] + interval: 5s + timeout: 3s + retries: 5 + start_period: 10s diff --git a/test/e2e/test_cases/flusher_otlp_native/otel-collector-config.yaml b/test/e2e/test_cases/flusher_otlp_native/otel-collector-config.yaml new file mode 100644 index 0000000000..30b827c826 --- /dev/null +++ b/test/e2e/test_cases/flusher_otlp_native/otel-collector-config.yaml @@ -0,0 +1,46 @@ +# Copyright 2025 iLogtail Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + +exporters: + debug: + verbosity: detailed + file/logs: + path: /tmp/otel-export/logs.json + file/metrics: + path: /tmp/otel-export/metrics.json + file/traces: + path: /tmp/otel-export/traces.json + +extensions: + health_check: + endpoint: 0.0.0.0:13133 + +service: + extensions: [health_check] + pipelines: + logs: + receivers: [otlp] + exporters: [debug, file/logs] + metrics: + receivers: [otlp] + exporters: [debug, file/metrics] + traces: + receivers: [otlp] + exporters: [debug, file/traces] diff --git a/test/engine/setup/controller/docker_compose_boot.go b/test/engine/setup/controller/docker_compose_boot.go index 723a719d27..0ab94659e6 100644 --- a/test/engine/setup/controller/docker_compose_boot.go +++ b/test/engine/setup/controller/docker_compose_boot.go @@ -17,6 +17,7 @@ package controller import ( "context" "os" + "path/filepath" "time" "github.com/alibaba/ilogtail/pkg/logger" @@ -24,6 +25,28 @@ import ( "github.com/alibaba/ilogtail/test/engine/setup/dockercompose" ) +// ensureFlusherFile ensures the FlusherFile exists as a regular file before +// docker-compose starts. If the path does not exist, Docker would create it as +// a directory when bind-mounting, which breaks LoongCollector startup. 
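+// For example, with a compose volume entry such as
+//
+//	./flusher.json:/tmp/flusher.json
+//
+// a missing ./flusher.json on the host would be created by Docker as an empty
+// directory, and the in-container file flusher could then no longer write to it.
+// (Illustrative mount path; the real path comes from config.FlusherFile.)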
+func ensureFlusherFile() error { + if config.FlusherFile == "" { + return nil + } + info, err := os.Stat(config.FlusherFile) + if err == nil && !info.IsDir() { + return nil + } + if err == nil && info.IsDir() { + if removeErr := os.RemoveAll(config.FlusherFile); removeErr != nil { + return removeErr + } + } + if err := os.MkdirAll(filepath.Dir(config.FlusherFile), 0750); err != nil { + return err + } + return os.WriteFile(config.FlusherFile, []byte("{}"), 0600) +} + type BootController struct { } @@ -56,6 +79,10 @@ func (c *BootController) Start(ctx context.Context) error { logger.Error(context.Background(), "BOOT_START_ALARM", "err", err) return err } + if err := ensureFlusherFile(); err != nil { + logger.Error(context.Background(), "BOOT_START_ALARM", "err", err) + return err + } if err := dockercompose.Start(ctx); err != nil { logger.Error(context.Background(), "BOOT_START_ALARM", "err", err) return err diff --git a/test/engine/setup/docker_compose.go b/test/engine/setup/docker_compose.go index 1e8e76ff55..9f6d7cddcb 100644 --- a/test/engine/setup/docker_compose.go +++ b/test/engine/setup/docker_compose.go @@ -54,7 +54,9 @@ func StartDockerComposeEnv(ctx context.Context, dependencyName string) (context. logger.Error(ctx, "BOOT_START_ALARM", "err", err) return ctx, err } - return context.WithValue(ctx, config.StartTimeContextKey, int32(startTime)), nil + ctx = context.WithValue(ctx, config.StartTimeContextKey, int32(startTime)) + // Set agent PID after container starts + return SetAgentPID(ctx) } return ctx, fmt.Errorf("env is not docker-compose") } diff --git a/test/engine/setup/env.go b/test/engine/setup/env.go index 49b375c595..60236c272e 100644 --- a/test/engine/setup/env.go +++ b/test/engine/setup/env.go @@ -38,7 +38,11 @@ func InitEnv(ctx context.Context, envType string) (context.Context, error) { case "deployment": Env = NewDeploymentEnv() } - return SetAgentPID(ctx) + if envType == "host" || envType == "daemonset" || envType == "deployment" { + return SetAgentPID(ctx) + } + // For docker-compose, container hasn't started yet, PID will be set after StartDockerComposeEnv + return ctx, nil } func Mkdir(ctx context.Context, dir string) (context.Context, error) { diff --git a/test/engine/steps.go b/test/engine/steps.go index 8437d5c2a2..ce7f3ea2d0 100644 --- a/test/engine/steps.go +++ b/test/engine/steps.go @@ -16,6 +16,7 @@ import ( "github.com/alibaba/ilogtail/test/engine/trigger" "github.com/alibaba/ilogtail/test/engine/trigger/ebpf" "github.com/alibaba/ilogtail/test/engine/trigger/log" + "github.com/alibaba/ilogtail/test/engine/trigger/otlp" "github.com/alibaba/ilogtail/test/engine/verify" ) @@ -79,6 +80,9 @@ func ScenarioInitializer(ctx *godog.ScenarioContext) { ctx.When(`^wait monitor until log processing finished$`, monitor.WaitMonitorUntilProcessingFinished) ctx.When(`^change log rotate interval to \{(\d+)\}s$`, log.ChangeRotateInterval) + // otlp + ctx.When(`^generate \{(\d+)\} OTLP \{(logs|metrics|traces)\} via otelgen to endpoint \{(.*)\}, protocol \{(grpc|http)\}$`, otlp.OtelgenSend) + // ebpf ctx.When(`^execute \{(\d+)\} commands \{(.*)\} in parallel`, ebpf.ExecveCommandsParallel) ctx.When(`^execute \{(\d+)\} commands \{(.*)\} in sequence`, ebpf.ExecveCommandsSerial) @@ -113,6 +117,9 @@ func ScenarioInitializer(ctx *godog.ScenarioContext) { ctx.Then(`^the logtail log contains \{(\d+)\} times of \{(.*)\}$`, verify.LogtailPluginLog) ctx.Then(`^the log is in order$`, verify.LogOrder) + // otlp collector verification + ctx.Then(`^otlp collector received at least \{(\d+)\} 
(logs|metrics|traces) from file \{(.*)\}$`, verify.OTLPCollectorReceived) + // metric ctx.Then(`^there is more than \{(\d+)\} metrics in \{(\d+)\} seconds$`, verify.MetricCount) @@ -129,7 +136,11 @@ func ScenarioInitializer(ctx *godog.ScenarioContext) { return ctx, nil }) ctx.After(func(ctx context.Context, sc *godog.Scenario, err error) (context.Context, error) { + ctx, verifyErr := verify.AgentNotCrash(ctx) cleanup.All() - return verify.AgentNotCrash(ctx) + if verifyErr != nil { + return ctx, verifyErr + } + return ctx, nil }) } diff --git a/test/engine/trigger/otlp/otelgen.go b/test/engine/trigger/otlp/otelgen.go new file mode 100644 index 0000000000..b7a804632f --- /dev/null +++ b/test/engine/trigger/otlp/otelgen.go @@ -0,0 +1,134 @@ +// Copyright 2025 iLogtail Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package otlp + +import ( + "context" + "fmt" + "os/exec" + "strings" + "time" + + "github.com/alibaba/ilogtail/pkg/logger" +) + +const otelgenImage = "ghcr.io/krzko/otelgen:latest" + +// OtelgenSend generates OTLP data (logs, metrics, or traces) using otelgen +// via `docker run`, sending to the specified endpoint within the docker-compose network. +// The endpoint is resolved against the compose network (e.g., "loongcollector:4320"). +func OtelgenSend(ctx context.Context, count int, dataType string, endpoint string, protocol string) (context.Context, error) { + // Find the docker network used by the compose project + network, err := findComposeNetwork() + if err != nil { + return ctx, fmt.Errorf("failed to find compose network: %v", err) + } + + // Build otelgen args + var args []string + args = append(args, "--otel-exporter-otlp-endpoint", endpoint, "--insecure") + // Use --protocol flag for http transport (supported by all subcommands) + if protocol == "http" { + args = append(args, "--protocol", "http") + } + // logs/traces use 'single' subcommand (one-shot); metrics uses 'sum' (runs continuously, killed by context timeout) + if dataType == "metrics" { + args = append(args, dataType, "sum") + } else { + args = append(args, dataType, "single") + } + + logger.Infof(ctx, "running otelgen on network %s: %s %s", network, otelgenImage, strings.Join(args, " ")) + + for i := 0; i < count; i++ { + dockerArgs := append([]string{"run", "--rm", "--network", network, "--", otelgenImage}, args...) + var cmd *exec.Cmd + var cancel func() + if dataType == "metrics" { + // Metrics subcommand runs continuously; use context timeout to kill after 20s + // to allow at least 2 batch cycles (5s each) for cumulative metrics + timeoutCtx, cancelFn := context.WithTimeout(ctx, 20*time.Second) + cancel = cancelFn + cmd = exec.CommandContext(timeoutCtx, "docker", dockerArgs...) + } else { + cmd = exec.Command("docker", dockerArgs...) 
+ } + output, err := cmd.CombinedOutput() + if cancel != nil { + cancel() + } + if err != nil { + // For metrics, the context timeout kills the process with "signal: killed" + // which is expected behavior, not an error + if dataType == "metrics" && isTimeoutError(err) { + logger.Infof(ctx, "otelgen metrics run killed by timeout (expected), output: %s", string(output)) + } else { + logger.Errorf(ctx, "OTELGEN_SEND_ERROR", + "iteration", i, "output", string(output), "err", err) + return ctx, fmt.Errorf("otelgen failed on iteration %d: %v, output: %s", i, err, string(output)) + } + } else { + logger.Infof(ctx, "otelgen iteration %d output: %s", i, string(output)) + } + // Small delay between iterations + if count > 1 && i < count-1 { + time.Sleep(500 * time.Millisecond) + } + } + + return ctx, nil +} + +// findComposeNetwork discovers the docker-compose network by inspecting any running +// container whose name contains "otel-collector" or "loongcollectorC". +// isTimeoutError checks if the error is from a context deadline exceeded or signal killed +func isTimeoutError(err error) bool { + if err == nil { + return false + } + errStr := err.Error() + return strings.Contains(errStr, "signal: killed") || strings.Contains(errStr, "context deadline exceeded") +} + +func findComposeNetwork() (string, error) { + // Find a container belonging to the compose project + for _, pattern := range []string{"otel-collector", "loongcollectorC"} { + cmd := exec.Command("docker", "ps", "--filter", "name="+pattern, "--format", "{{.ID}}") + output, err := cmd.CombinedOutput() + if err != nil { + continue + } + + containerID := strings.TrimSpace(string(output)) + if containerID == "" { + continue + } + + // Inspect the container's networks + cmd = exec.Command("docker", "inspect", "--format", + `{{range $k, $v := .NetworkSettings.Networks}}{{$k}}{{end}}`, containerID) + output, err = cmd.CombinedOutput() + if err != nil { + continue + } + + network := strings.TrimSpace(string(output)) + if network != "" { + return network, nil + } + } + + return "", fmt.Errorf("no compose network found (no running otel-collector or loongcollectorC container)") +} diff --git a/test/engine/verify/otlp_collector.go b/test/engine/verify/otlp_collector.go new file mode 100644 index 0000000000..91777b4b8e --- /dev/null +++ b/test/engine/verify/otlp_collector.go @@ -0,0 +1,185 @@ +// Copyright 2025 iLogtail Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package verify + +import ( + "bufio" + "context" + "encoding/json" + "fmt" + "os" + "path/filepath" + "strings" + "time" + + "github.com/avast/retry-go/v4" + + "github.com/alibaba/ilogtail/pkg/logger" + "github.com/alibaba/ilogtail/test/config" +) + +// OTLPCollectorData represents the top-level structure of OTel Collector file exporter output. +// The file exporter writes one JSON object per line (JSON Lines format). 
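+//
+// An illustrative line (shape only; actual field contents depend on what was exported):
+//
+//	{"resourceLogs":[{"resource":{"attributes":[...]},"scopeLogs":[{"logRecords":[{"body":{"stringValue":"..."}}]}]}]}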
+type OTLPCollectorData struct { + ResourceLogs []interface{} `json:"resourceLogs,omitempty"` + ResourceMetrics []interface{} `json:"resourceMetrics,omitempty"` + ResourceSpans []interface{} `json:"resourceSpans,omitempty"` +} + +// OTLPCollectorReceived verifies that the OTel Collector's file exporter has received +// at least the expected count of a given data type (logs, metrics, traces). +// It reads the file from the local bind-mounted otel-export directory. +func OTLPCollectorReceived(ctx context.Context, expect int, dataType string, filePath string) (context.Context, error) { + timeoutCtx, cancel := context.WithTimeout(context.TODO(), config.TestConfig.RetryTimeout) + defer cancel() + + // Resolve the local file path from the container path. + // The case.feature passes a container path like /tmp/otel-export/logs.json. + // Since we use a bind mount to ./otel-export/ in the case directory, + // we look for the file relative to the current case home. + localFile := resolveLocalPath(config.CaseHome, filePath) + + var count int + var err error + + err = retry.Do( + func() error { + count, err = countOTLPFileRecords(localFile, dataType) + if err != nil { + return fmt.Errorf("failed to count OTLP %s: %v", dataType, err) + } + if count < expect { + return fmt.Errorf("otlp collector %s count not match, expect at least %d, got %d", dataType, expect, count) + } + return nil + }, + retry.Context(timeoutCtx), + retry.Delay(5*time.Second), + retry.DelayType(retry.FixedDelay), + ) + if err != nil { + return ctx, err + } + return ctx, nil +} + +// resolveLocalPath converts a container-internal path to the local bind-mounted path. +// For example: /tmp/otel-export/logs.json -> /otel-export/logs.json +func resolveLocalPath(caseHome, containerPath string) string { + // Extract the relative path from the known mount target /tmp/otel-export/ + relPath := strings.TrimPrefix(containerPath, "/tmp/otel-export/") + if relPath == containerPath { + // Not under /tmp/otel-export, try /tmp/otel-export + relPath = strings.TrimPrefix(containerPath, "/tmp/otel-export/") + } + localFile := filepath.Join(caseHome, "otel-export", relPath) + logger.Debugf(context.Background(), "resolved OTLP file path: %s -> %s", containerPath, localFile) + return localFile +} + +// countOTLPFileRecords reads the file exporter output and counts records of the given type. +func countOTLPFileRecords(filePath string, dataType string) (int, error) { + cleanPath, err := validateOTLPExportPath(filePath) + if err != nil { + return 0, err + } + + // Read and parse the JSON Lines file + // #nosec G304 -- path is validated by validateOTLPExportPath before opening. 
+ file, err := os.Open(cleanPath) + if err != nil { + // File may not exist yet (no data flushed) + if os.IsNotExist(err) { + return 0, nil + } + return 0, fmt.Errorf("failed to open file: %v", err) + } + defer file.Close() + + count := 0 + scanner := bufio.NewScanner(file) + // Increase buffer size for potentially large lines + scanner.Buffer(make([]byte, 1024*1024), 10*1024*1024) + + for scanner.Scan() { + line := strings.TrimSpace(scanner.Text()) + if line == "" { + continue + } + + var data OTLPCollectorData + if err := json.Unmarshal([]byte(line), &data); err != nil { + // Try parsing as array format + count += countFromArray(line, dataType) + continue + } + + switch dataType { + case "logs": + count += len(data.ResourceLogs) + case "metrics": + count += len(data.ResourceMetrics) + case "traces": + count += len(data.ResourceSpans) + } + } + + if err := scanner.Err(); err != nil { + return 0, fmt.Errorf("scanner error: %v", err) + } + + return count, nil +} + +func validateOTLPExportPath(filePath string) (string, error) { + baseDir := filepath.Clean(filepath.Join(config.CaseHome, "otel-export")) + cleanPath := filepath.Clean(filePath) + + relPath, err := filepath.Rel(baseDir, cleanPath) + if err != nil { + return "", fmt.Errorf("failed to resolve otlp file path: %v", err) + } + if relPath == ".." || strings.HasPrefix(relPath, ".."+string(filepath.Separator)) { + return "", fmt.Errorf("otlp file path is outside allowed directory: %s", cleanPath) + } + return cleanPath, nil +} + +// countFromArray tries to parse the line as an array and count items. +func countFromArray(line string, dataType string) int { + var arr []map[string]interface{} + if err := json.Unmarshal([]byte(line), &arr); err != nil { + return 0 + } + + count := 0 + for _, item := range arr { + switch dataType { + case "logs": + if _, ok := item["resourceLogs"]; ok { + count++ + } + case "metrics": + if _, ok := item["resourceMetrics"]; ok { + count++ + } + case "traces": + if _, ok := item["resourceSpans"]; ok { + count++ + } + } + } + return count +}