diff --git a/.changeset/glob-ignore-files.md b/.changeset/glob-ignore-files.md new file mode 100644 index 000000000..0758faf87 --- /dev/null +++ b/.changeset/glob-ignore-files.md @@ -0,0 +1,5 @@ +--- +"@moonshot-ai/kimi-code": patch +--- + +Fix glob file searches so all patterns continue to respect ignore files. A positive ripgrep `--glob` always overrides ignore logic, so any non-broad user pattern (e.g. `*.ts`, `dist/**/*.js`) could re-include files excluded by `.gitignore`. The pattern is now filtered in-process after `rg --files` enumerates non-ignored files. Also fixes several glob edge cases: the search path is always `.` so derived prefixes cannot override ignore rules or escape the authorized tree; `*` and `?` before literal parentheses are preserved as wildcards (not treated as extglob prefixes); range braces `{N..M}` match rg's single-alternative behavior; malformed character classes (`[]`, `[!]`, `[z-a]`, dangling `\`) are rejected before running ripgrep; and files whose names start with `..` are no longer dropped as escapes. diff --git a/packages/agent-core/src/tools/builtin/file/glob.ts b/packages/agent-core/src/tools/builtin/file/glob.ts index 69d01c602..7b71e3710 100644 --- a/packages/agent-core/src/tools/builtin/file/glob.ts +++ b/packages/agent-core/src/tools/builtin/file/glob.ts @@ -16,7 +16,7 @@ * files (e.g. build outputs, `node_modules`). Sensitive files such * as `.env` are always filtered out. * - Brace expansion (`*.{ts,tsx}`, `{src,test}/**`) is handled by - * ripgrep's glob engine. + * picomatch in-process. * - `path` is validated by `resolvePathAccess` in `absolute-outside-allowed` * mode. Explicit absolute paths outside the workspace are allowed; relative * paths that escape the workspace stay rejected. @@ -25,7 +25,8 @@ */ import type { Kaos } from '@moonshot-ai/kaos'; -import { normalize, resolve } from 'pathe'; +import { dirname, isAbsolute, join, normalize, relative, resolve } from 'pathe'; +import picomatch from 'picomatch'; import { z } from 'zod'; import type { BuiltinTool } from '../../../agent/tool'; @@ -160,6 +161,11 @@ export class GlobTool implements BuiltinTool { ): Promise { const searchRoot = searchRoots[0] ?? this.workspace.workspaceDir; + const patternError = validateGlobPattern(args.pattern); + if (patternError !== undefined) { + return { isError: true, output: patternError }; + } + // Validate the search root is a directory. `rg --files ` exits 0 // and lists the file itself, so without this check a file root would be // returned as its own match instead of rejected. A missing root surfaces @@ -201,19 +207,23 @@ export class GlobTool implements BuiltinTool { // Running from the search root makes glob matching relative to it. const execKaos = this.kaos.withCwd(searchRoot); - let runResult = await runRipgrepOnce( - execKaos, - buildRgArgs(rgPath, args), - signal, - { abortedMessage: 'Glob aborted' }, - ); + const insideGitRepo = + args.include_ignored === true ? true : await isInsideGitRepo(this.kaos, searchRoot); + + const pathClass = this.kaos.pathClass(); + const lineFilter = makeLineFilter(args.pattern, pathClass, searchRoot); + + let runResult = await runRipgrepOnce(execKaos, buildRgArgs(rgPath, args, insideGitRepo), signal, { + abortedMessage: 'Glob aborted', + lineFilter, + }); if (runResult.kind === 'tool-error') return runResult.result; if (shouldRetryRipgrepEagain(runResult)) { runResult = await runRipgrepOnce( execKaos, - buildRgArgs(rgPath, args, true), + buildRgArgs(rgPath, args, insideGitRepo, true), signal, - { abortedMessage: 'Glob aborted' }, + { abortedMessage: 'Glob aborted', lineFilter }, ); if (runResult.kind === 'tool-error') return runResult.result; } @@ -250,10 +260,19 @@ export class GlobTool implements BuiltinTool { resolve(searchRoot, p), ); + // Filter by the user's positive glob pattern in-process. A positive + // --glob would override ignore-file logic, so the pattern is not passed + // to rg; instead rg --files enumerates non-ignored files and we filter + // here to preserve both the pattern match and the ignore-file respect. + // The streaming lineFilter already applied this filter before the cap, + // but we re-check here as a safety net for any paths that slipped + // through (e.g. broad patterns where the filter is skipped). + const patternMatched = filterByPattern(rawPaths, searchRoot, args.pattern, pathClass); + // Authoritative sensitive-file check (the rg prefilter is conservative). const kept: string[] = []; let filteredSensitive = 0; - for (const p of rawPaths) { + for (const p of patternMatched) { if (isSensitiveFile(p)) { filteredSensitive++; } else { @@ -277,7 +296,6 @@ export class GlobTool implements BuiltinTool { // save tokens, but only for the primary workspace. Relative paths are // later resolved against workspaceDir, so additionalDir matches stay // absolute to keep follow-up Read/Edit calls on the same file. - const pathClass = this.kaos.pathClass(); const shouldRelativize = isWithinDirectory(searchRoot, this.workspace.workspaceDir, pathClass); const displayLines = limited.map((p) => shouldRelativize ? relativizeIfUnder(p, searchRoot, pathClass) : p, @@ -312,26 +330,559 @@ export class GlobTool implements BuiltinTool { } } -function buildRgArgs(rgPath: string, args: GlobInput, singleThreaded = false): string[] { +function buildRgArgs( + rgPath: string, + args: GlobInput, + insideGitRepo: boolean, + singleThreaded = false, +): string[] { const cmd: string[] = [rgPath]; if (singleThreaded) cmd.push('-j', '1'); cmd.push('--files', '--hidden', '--sortr=modified'); + if (!insideGitRepo && args.include_ignored !== true) { + cmd.push('--no-require-git'); + } for (const dir of VCS_DIRECTORIES_TO_EXCLUDE) { cmd.push('--glob', `!${dir}`); } - // Positive pattern first, then sensitive-file exclusions so a broad - // pattern cannot re-include a sensitive path. - cmd.push('--glob', args.pattern); + // The user's positive pattern is NOT passed as --glob. A positive --glob + // "always overrides any other ignore logic" (ripgrep docs), so it would + // re-include files excluded by .gitignore/.ignore/.rgignore. Instead, let + // rg --files enumerate non-ignored files and filter the results + // in-process via matchUserPattern(). for (const glob of SENSITIVE_GLOBS_TO_EXCLUDE) { cmd.push('--glob', `!${glob}`); } if (args.include_ignored) cmd.push('--no-ignore'); // Search path is `.` because the process cwd is pinned to the search root - // (see execution()); this keeps `--glob` matching relative to that root. + // (see execution()). Passing a derived subdirectory as the rg PATH would + // override ignore rules — rg treats command-line paths as authoritative, + // so `rg --files dist` traverses a gitignored `dist/` even with `--glob + // !dist`. It would also allow patterns like `../outside/**` to escape the + // authorized search tree, and error out on non-existent prefixes. The + // streaming line filter (makeLineFilter) already prevents the output cap + // from being exhausted by non-matching paths, so narrowing the traversal + // is not needed for correctness. cmd.push('.'); return cmd; } +function isBroadPattern(pattern: string): boolean { + const preprocessed = preprocessGitignoreGlobPattern(pattern); + if (preprocessed === undefined) return true; + pattern = preprocessed; + // rg treats an empty --glob as matching all files (respecting ignores), + // so skip picomatch compilation for it — picomatch throws on empty input. + // A leading `!` (rg's exclusion marker) followed by nothing means + // "exclude the empty glob" → no files → NOT broad. `!*` or `!**/*` also + // needs compilation to negate. + if (pattern.startsWith('!')) return false; + return isBroadPositivePattern(pattern); +} + +function isBroadPositivePattern(pattern: string): boolean { + return pattern === '' || pattern === '*' || pattern === '**' || pattern === '**/*'; +} + +/** + * Compile a glob matcher for the user's pattern, using the same semantics + * ripgrep's `--glob` would have applied: + * - Patterns without `/` match the basename at any depth. + * - Patterns with `/` match the relative path from the search root. + * - A leading `/` is stripped — ripgrep treats `/src/*.ts` as rooted at + * the search root, equivalent to `src/*.ts`. + * - `**` matches zero or more directory levels. + * - Dotfiles are matched (rg runs with `--hidden`). + * - Matching is case-sensitive, matching ripgrep's default (use + * `--glob-case-insensitive` / `--iglob` for case-insensitive mode). + * - Brace expansion (`*.{ts,tsx}`) is handled by picomatch natively. + * + * Returns a function that takes a relative path and returns whether it + * matches. The picomatch matcher is compiled once so large trees don't + * reparse the pattern on every line. + */ +function compileGlobMatcher(pattern: string): (relPath: string) => boolean { + const opts = { dot: true }; + const preprocessed = preprocessGitignoreGlobPattern(pattern); + if (preprocessed === undefined) return () => true; + let normalizedPattern = preprocessed; + // rg treats a leading `!` as a glob exclusion marker: `!(a).ts` means + // "exclude files matching `(a).ts`". Strip the `!` and negate the + // matcher so the in-process filter excludes those files instead of + // treating `!` as a picomatch extglob prefix. + let negated = false; + if (normalizedPattern.startsWith('!')) { + negated = true; + normalizedPattern = normalizedPattern.slice(1); + } + if (negated && normalizedPattern === '') { + return () => false; + } + const rooted = normalizedPattern.startsWith('/'); + if (rooted) { + normalizedPattern = normalizedPattern.slice(1); + } + // Strip a leading `./` — rg's glob subject never has it, and picomatch + // treats `./` as optional, which would broaden matches. Normalizing both + // sides to the un-prefixed form keeps the in-process filter consistent. + if (normalizedPattern.startsWith('./')) { + normalizedPattern = normalizedPattern.slice(2); + } + // Escape picomatch-only extensions that rg --glob does not support, + // so the in-process matcher matches the same files rg would. + const escapedPattern = escapeForPicomatch(normalizedPattern); + // A pattern without `/` matches the basename at any depth — unless the + // original pattern was rooted with a leading `/`, in which case it + // matches only at the search root (gitignore-style rooted globs). + let matcher: (relPath: string) => boolean; + if (!normalizedPattern.includes('/') && !rooted) { + const fn = picomatch(escapedPattern, opts); + matcher = (relPath: string) => fn(relPath.split('/').pop()!); + } else { + const fn = picomatch(escapedPattern, opts); + matcher = (relPath: string) => fn(relPath); + } + return negated ? (relPath: string) => !matcher(relPath) : matcher; +} + +function preprocessGitignoreGlobPattern(pattern: string): string | undefined { + if (pattern.startsWith('#')) return undefined; + let end = pattern.length; + while (end > 0 && pattern[end - 1] === ' ' && !isEscaped(pattern, end - 1)) { + end--; + } + return pattern.slice(0, end); +} + +function isEscaped(pattern: string, index: number): boolean { + let slashCount = 0; + for (let i = index - 1; i >= 0 && pattern[i] === '\\'; i--) { + slashCount++; + } + return slashCount % 2 === 1; +} + +/** + * Escape picomatch-only glob extensions that rg --glob does not support, + * so the in-process matcher matches the same files rg would. + * + * This is a single-pass, character-class-aware scanner that handles: + * + * - **Extglob** (`[@!+](...)`): rg treats `@`, `!`, `+` as literal chars + * and `(`, `)` as literal. The prefix is wrapped in `[...]` and the + * parens/pipe are escaped. `*` and `?` before `(` are wildcards in rg, + * so only the parens/pipe are escaped. + * - **Bare parenthesis alternation** (`(a|b)`): rg treats `(`, `|`, `)` as + * literal. All three are escaped so picomatch does too. + * - **Range braces** (`{1..2}`): rg treats a single-alternative brace as + * the inner text with braces removed. Reduced to `1..2`. + * - **Brace alternatives** (`{ts,tsx}`): left intact (supported by both). + * Empty alternatives (`{,c}`, `{a,,b}`) are dropped to match rg. + * Range-like arms (`{1..2,3}`) have their dots escaped so picomatch + * treats them as literal, not as a range expansion. + * - **Nested braces** (`{a,{b,c}}`): left intact (both expand them). + * - **Character classes**: inside `[]`, braces and extglob constructs are + * literal class members and are never rewritten. `[!` is converted to + * `[^` (picomatch's negation syntax). `[:` is escaped to `[\:` to + * prevent picomatch from interpreting POSIX classes like `[:digit:]`. + * - **Escaped chars** (`\x`): gitignore removes escapes before ordinary + * characters; escapes before picomatch syntax are preserved as literals. + */ +function escapeForPicomatch(pattern: string): string { + let result = ''; + let i = 0; + let inBracket = false; + let bracketContentStart = -1; + while (i < pattern.length) { + const ch = pattern[i]!; + + // --- Inside a character class --- + if (inBracket) { + result += ch; + if (ch === '\\' && i + 1 < pattern.length) { + result += pattern[i + 1]!; + i += 2; + continue; + } + if (ch === ']') { + // A `]` at the first content position is a literal, not the + // terminator (mirrors validateGlobPattern logic). + if (i !== bracketContentStart) inBracket = false; + } + // Escape `:` after `[` inside a char class to prevent picomatch + // from interpreting `[:class:]` as a POSIX character class. + if (ch === ':' && result.length >= 2 && result.at(-2) === '[') { + // Replace the just-added `:` with `\:` + result = result.slice(0, -1) + '\\:'; + } + i++; + continue; + } + + // --- Escaped character (outside char class) --- + if (ch === '\\' && i + 1 < pattern.length) { + const escaped = pattern[i + 1]!; + result += shouldPreserveEscapeForPicomatch(escaped) ? ch + escaped : escaped; + i += 2; + continue; + } + + // --- Character class opening --- + if (ch === '[') { + inBracket = true; + const next = pattern[i + 1]; + bracketContentStart = next === '!' || next === '^' ? i + 2 : i + 1; + // Convert `[!` to `[^` — picomatch uses `[^` for negation, while rg + // (gitignore semantics) uses `[!`. + if (next === '!') { + result += '[^'; + i += 2; + continue; + } + result += ch; + i++; + continue; + } + + // --- Opening parenthesis (extglob or bare alternation) --- + if (ch === '(') { + // Find the matching `)`, respecting escaped chars. + let depth = 1; + let j = i + 1; + while (j < pattern.length && depth > 0) { + const cj = pattern[j]!; + if (cj === '\\' && j + 1 < pattern.length) { + j += 2; + continue; + } + if (cj === '(') depth++; + else if (cj === ')') depth--; + if (depth > 0) j++; + } + if (depth !== 0) { + // Unclosed — treat `(` as literal (validator will catch it). + result += '\\('; + i++; + continue; + } + const inner = pattern.slice(i + 1, j); + const escapedInner = escapePicomatchParenthesisInner(inner); + // Check if the previous char in result is an extglob prefix. + const prev = result.length > 0 ? result.at(-1) : ''; + if (prev === '@' || prev === '!' || prev === '+') { + // Extglob: replace the prefix with [prefix] and escape parens. + result = result.slice(0, -1) + `[${prev}]`; + } + // For `*` and `?` prefixes, the wildcard is preserved and only + // the parens/pipe are escaped. For bare parens (no extglob prefix), + // also just escape the parens/pipe. + result += `\\(${escapedInner}\\)`; + i = j + 1; + continue; + } + + // --- Brace group --- + if (ch === '{') { + // Scan forward to the matching `}`, respecting escaped chars. + let depth = 1; + let j = i + 1; + while (j < pattern.length && depth > 0) { + const cj = pattern[j]!; + if (cj === '\\' && j + 1 < pattern.length) { + j += 2; + continue; + } + if (cj === '{') depth++; + else if (cj === '}') depth--; + if (depth > 0) j++; + } + if (depth !== 0) { + // Unclosed brace — keep as-is (validator will catch it). + result += ch; + i++; + continue; + } + const inner = pattern.slice(i + 1, j); + // Nested brace groups are left intact (both rg and picomatch + // expand them). + if (inner.includes('{') || inner.includes('}')) { + result += pattern.slice(i, j + 1); + i = j + 1; + continue; + } + if (inner.includes(',')) { + // Split on unescaped commas only — `\,` is a literal comma arm, + // not a separator. rg treats `{\,,a}.ts` as matching both `,.ts` + // and `a.ts`. + const arms = splitBraceArms(inner); + const nonEmpty = arms.filter((a) => a !== ''); + if (nonEmpty.length === 0) { + // All alternatives empty — rg strips to empty string. + } else if (nonEmpty.length === 1) { + // Single non-empty arm — braces removed. + result += escapeRangeArms(nonEmpty[0]!); + } else { + // Multiple arms — escape range-like arms (containing `..`) so + // picomatch treats them as literal, not as a range expansion. + const escaped = nonEmpty.map(escapeRangeArms); + result += `{${escaped.join(',')}}`; + } + } else { + // Single alternative — rg strips the braces. + result += inner; + } + i = j + 1; + continue; + } + + result += ch; + i++; + } + return result; +} + +function shouldPreserveEscapeForPicomatch(ch: string): boolean { + return '*?[]{}()!+@,|\\'.includes(ch); +} + +function escapePicomatchParenthesisInner(inner: string): string { + let result = ''; + for (let i = 0; i < inner.length; i++) { + const ch = inner[i]!; + if (ch === '\\' && i + 1 < inner.length) { + result += ch + inner[i + 1]!; + i++; + continue; + } + result += ch === '(' || ch === ')' || ch === '|' ? `\\${ch}` : ch; + } + return result; +} + +/** + * Escape dots in a range-like brace arm (e.g. `1..2`) so picomatch treats + * them as literal characters, not as a range expansion. rg treats `{1..2}` + * as the literal string `1..2`, so inside a comma brace group each arm + * that contains `..` must have its dots escaped. Uses `[.]` instead of + * `\.` because picomatch's brace expansion strips backslashes before + * matching, but preserves bracket classes. + */ +function escapeRangeArms(arm: string): string { + if (!arm.includes('..')) return arm; + return arm.replaceAll('.', '[.]'); +} + +/** + * Split a brace group's inner text on unescaped commas only. A `\,` is a + * literal comma within an arm, not a separator — rg treats `{\,,a}` as + * two arms: `\,` (literal comma) and `a`. + */ +function splitBraceArms(inner: string): string[] { + const arms: string[] = []; + let current = ''; + for (let k = 0; k < inner.length; k++) { + const ck = inner[k]!; + if (ck === '\\' && k + 1 < inner.length) { + current += ck + inner[k + 1]!; + k++; + continue; + } + if (ck === ',') { + arms.push(current); + current = ''; + continue; + } + current += ck; + } + arms.push(current); + return arms; +} + +/** + * Filter absolute paths from `rg --files` against the user's positive glob + * pattern. Broad patterns (star, double-star, star-slash-star) match + * everything, so the filter is skipped for those. Returns the filtered list + * in the same order. On Windows, the search root and absolute paths are + * case-normalized only to compute the relative path boundary — the original- + * case relative path is then used for case-sensitive pattern matching. + */ +function filterByPattern( + absPaths: string[], + searchRoot: string, + pattern: string, + pathClass: PathClass, +): string[] { + if (isBroadPattern(pattern)) return absPaths; + const matches = compileGlobMatcher(pattern); + const result: string[] = []; + for (const absPath of absPaths) { + const rel = relativePath(searchRoot, absPath, pathClass); + if (rel === undefined) continue; + if (matches(rel)) result.push(absPath); + } + return result; +} + +/** + * Compute the relative path from `searchRoot` to `absPath`, preserving the + * original casing of `absPath`. On Windows, the root and path are compared + * case-insensitively to find the boundary, but the returned relative path + * keeps the original case so case-sensitive glob matching works correctly. + * Returns `undefined` if `absPath` is not under `searchRoot`. + */ +function relativePath(searchRoot: string, absPath: string, pathClass: PathClass): string | undefined { + const normAbs = normalize(absPath); + const normRoot = normalize(searchRoot); + if (pathClass !== 'win32') { + const rel = relative(normRoot, normAbs); + // Only reject paths that actually escape the root: `..` (the parent + // itself) or `../…` (something inside the parent). A file whose name + // starts with two dots — e.g. `..config/a.ts` — is under the root and + // must be kept. + return rel === '..' || rel.startsWith('../') ? undefined : rel; + } + const lowerAbs = normAbs.toLowerCase(); + const lowerRoot = normRoot.toLowerCase(); + if (lowerAbs === lowerRoot) return '.'; + const prefix = lowerRoot.endsWith('/') ? lowerRoot : lowerRoot + '/'; + if (!lowerAbs.startsWith(prefix)) return undefined; + return normAbs.slice(prefix.length); +} + +/** + * Build a streaming line filter for `runRipgrepOnce` that applies the user's + * glob pattern to each path line before it counts toward the output cap. + * Returns `undefined` for broad patterns (no filtering needed) so the + * original byte-level cap path is used. + * + * rg runs with cwd pinned to the search root, so each line is normally a + * relative path like `./src/a.ts` (POSIX) or `.\src\a.ts` (Windows). The + * filter strips the leading `./` or `.\` and normalizes backslashes to + * forward slashes on Windows before matching. If the line is an absolute + * path (e.g. from a test mock or an external root), it is made relative to + * the search root first, preserving original case for pattern matching. + */ +function makeLineFilter( + pattern: string, + pathClass: PathClass, + searchRoot: string, +): ((line: string) => boolean) | undefined { + if (isBroadPattern(pattern)) return undefined; + const matches = compileGlobMatcher(pattern); + return (line: string): boolean => { + let relPath = line; + if (pathClass === 'win32') relPath = relPath.replaceAll('\\', '/'); + if (relPath.startsWith('./') || relPath.startsWith('.\\')) { + relPath = relPath.slice(2); + } else if (isAbsolute(relPath)) { + const rel = relativePath(searchRoot, relPath, pathClass); + if (rel === undefined) return false; + relPath = rel; + } + return matches(relPath); + }; +} + +/** + * Validate a glob pattern for common malformed syntax. Returns an error + * message if the pattern is invalid, or `undefined` if it is well-formed. + * This mirrors ripgrep's globset parser, which rejects these with a hard + * error — picomatch would silently treat them as literals and report "No + * matches found" instead. + * + * Detected errors: + * - Unclosed `[` or `{` (balanced tracking). + * - Empty character class: `[]`, `[!]`, `[^]` — a `]` right after the + * opening bracket prefix (`[`, `[!`, `[^`) is a literal `]`, not the + * terminator, so the class is unclosed. + * - Invalid range: `[z-a]` where the start character is greater than the + * end character. + * - Dangling backslash: a trailing `\` with no character to escape. + * + * Braces inside a character class (`[{]foo`) are literal characters and do + * not count toward brace depth. + */ +function validateGlobPattern(pattern: string): string | undefined { + let inBracket = false; + let bracketDepth = 0; + let braceDepth = 0; + // Position of the first content character inside the current bracket + // (after `[`, `[!`, or `[^`). A `]` at this position is a literal, not + // the class terminator. + let bracketContentStart = -1; + // Last unescaped character seen inside the current bracket, for range + // validation. Reset to '' on bracket open or after an escape. + let bracketPrevChar = ''; + for (let i = 0; i < pattern.length; i++) { + const ch = pattern[i]!; + if (ch === '\\') { + if (i === pattern.length - 1) { + return `Invalid glob pattern: dangling '\\' in "${pattern}"`; + } + i++; + bracketPrevChar = ''; + continue; + } + if (inBracket) { + if (ch === ']') { + if (i === bracketContentStart) { + // Literal ] — the class is still open. + bracketPrevChar = ']'; + continue; + } + inBracket = false; + bracketDepth--; + bracketPrevChar = ''; + } else if (ch === '-' && bracketPrevChar !== '' && bracketPrevChar !== '-') { + const nextCh = pattern[i + 1]; + if (nextCh !== undefined && nextCh !== ']' && nextCh !== '\\') { + if (bracketPrevChar > nextCh) { + return `Invalid glob pattern: invalid range '${bracketPrevChar}-${nextCh}' in "${pattern}"`; + } + } + } else { + bracketPrevChar = ch; + } + } else if (ch === '[') { + inBracket = true; + bracketDepth++; + const next = pattern[i + 1]; + bracketContentStart = next === '!' || next === '^' ? i + 2 : i + 1; + bracketPrevChar = ''; + } else if (ch === '{') { + braceDepth++; + } else if (ch === '}') { + if (braceDepth > 0) { + braceDepth--; + } else { + return `Invalid glob pattern: unopened '}' in "${pattern}"`; + } + } + } + if (bracketDepth > 0) return `Invalid glob pattern: unclosed '[' in "${pattern}"`; + if (braceDepth > 0) return `Invalid glob pattern: unclosed '{' in "${pattern}"`; + return undefined; +} + +async function isInsideGitRepo(kaos: Kaos, searchRoot: string): Promise { + let current = kaos.normpath(searchRoot); + for (;;) { + if (await pathExists(kaos, join(current, '.git'))) return true; + const parent = dirname(current); + if (parent === current) return false; + current = parent; + } +} + +async function pathExists(kaos: Kaos, path: string): Promise { + try { + await kaos.stat(path); + return true; + } catch { + return false; + } +} + function formatGlobError(searchRoot: string, stderr: string): string { const trimmed = stderr.trim(); if (/no such file or directory/i.test(trimmed)) { diff --git a/packages/agent-core/src/tools/support/run-rg.ts b/packages/agent-core/src/tools/support/run-rg.ts index f8713cb02..510d2157f 100644 --- a/packages/agent-core/src/tools/support/run-rg.ts +++ b/packages/agent-core/src/tools/support/run-rg.ts @@ -58,6 +58,15 @@ export type RipgrepRunOutcome = export interface RunRipgrepOptions { /** Message surfaced when the run is aborted via `signal`. Defaults to `"Aborted"`. */ readonly abortedMessage?: string; + /** + * Optional line-level filter applied to stdout as lines stream in. When + * provided, only lines that pass the filter count toward the output cap, + * so a selective pattern can't be starved by a large prefix of non-matching + * lines (e.g. `rg --files` returning >10MB of paths before the user's glob + * matches). Lines are split on `\n`; the filter receives the line without + * the trailing newline. + */ + readonly lineFilter?: (line: string) => boolean; } async function disposeProcess(proc: KaosProcess): Promise { @@ -75,6 +84,7 @@ export async function runRipgrepOnce( options: RunRipgrepOptions = {}, ): Promise { const abortedMessage = options.abortedMessage ?? 'Aborted'; + const lineFilter = options.lineFilter; if (signal.aborted) { return { kind: 'tool-error', result: { isError: true, output: abortedMessage } }; } @@ -165,7 +175,7 @@ export async function runRipgrepOnce( try { const isTerminating = (): boolean => timedOut || aborted || killed; const [stdoutResult, stderrResult, code] = await Promise.all([ - readStreamWithCap(proc.stdout, MAX_OUTPUT_BYTES, isTerminating), + readStreamWithCap(proc.stdout, MAX_OUTPUT_BYTES, isTerminating, lineFilter), readStreamWithCap(proc.stderr, MAX_OUTPUT_BYTES, isTerminating), proc.wait(), ]); @@ -229,29 +239,64 @@ async function readStreamWithCap( stream: Readable, maxBytes: number, suppressPrematureClose?: () => boolean, + lineFilter?: (line: string) => boolean, ): Promise { const chunks: Buffer[] = []; let total = 0; let truncated = false; + // Use a streaming TextDecoder so multibyte UTF-8 sequences split across + // chunk boundaries are decoded correctly instead of producing replacement + // characters. + const decoder = lineFilter ? new TextDecoder('utf8', { fatal: false }) : undefined; + let lineBuffer = ''; try { for await (const chunk of stream) { const buf: Buffer = typeof chunk === 'string' ? Buffer.from(chunk, 'utf8') : (chunk as Buffer); if (truncated) continue; - if (total + buf.length > maxBytes) { - const remaining = maxBytes - total; - if (remaining > 0) chunks.push(buf.subarray(0, remaining)); - total = maxBytes; - truncated = true; - continue; + if (lineFilter && decoder) { + lineBuffer += decoder.decode(buf, { stream: true }); + const lines = lineBuffer.split('\n'); + lineBuffer = lines.pop()!; + for (const line of lines) { + if (!lineFilter(line)) continue; + const lineBytes = Buffer.from(line + '\n', 'utf8'); + if (total + lineBytes.length > maxBytes) { + truncated = true; + break; + } + chunks.push(lineBytes); + total += lineBytes.length; + } + } else { + if (total + buf.length > maxBytes) { + const remaining = maxBytes - total; + if (remaining > 0) chunks.push(buf.subarray(0, remaining)); + total = maxBytes; + truncated = true; + continue; + } + chunks.push(buf); + total += buf.length; } - chunks.push(buf); - total += buf.length; } } catch (error) { if (!isPrematureCloseError(error) || suppressPrematureClose?.() !== true) { throw error; } } + if (!truncated && lineBuffer.length > 0) { + // Flush any remaining bytes from the decoder. + if (decoder) lineBuffer += decoder.decode(); + if (!lineFilter || lineFilter(lineBuffer)) { + const lineBytes = Buffer.from(lineBuffer, 'utf8'); + if (total + lineBytes.length > maxBytes) { + truncated = true; + } else { + chunks.push(lineBytes); + total += lineBytes.length; + } + } + } return { text: Buffer.concat(chunks).toString('utf8'), truncated }; } diff --git a/packages/agent-core/test/tools/glob.test.ts b/packages/agent-core/test/tools/glob.test.ts index dbe0119a5..7622f87e4 100644 --- a/packages/agent-core/test/tools/glob.test.ts +++ b/packages/agent-core/test/tools/glob.test.ts @@ -162,7 +162,7 @@ describe('GlobTool', () => { additionalDirs: [], }); - const result = await executeTool(tool, context({ pattern: 'src/**/*.ts', path: 'C:\\WORKSPACE' })); + const result = await executeTool(tool, context({ pattern: 'src/**/*.ts', path: 'C:\\workspace' })); // pathe.normalize renders Windows paths with forward slashes, so the // relativized result keeps `/` regardless of the backend path class. @@ -183,31 +183,37 @@ describe('GlobTool', () => { expect(result.output).toContain(`[Truncated at ${String(MAX_MATCHES)} matches`); }); - it('passes a brace pattern through to a single rg --glob', async () => { - const exec = execReturning('/workspace/a.ts\n/workspace/shared.ts\n/workspace/shared.tsx\n'); + it('filters brace patterns in-process without passing them as a positive --glob', async () => { + const exec = execReturning( + '/workspace/a.ts\n/workspace/shared.ts\n/workspace/shared.tsx\n/workspace/b.js\n', + ); const tool = new GlobTool(kaosWithExec(exec), workspace); const result = await executeTool(tool, context({ pattern: '*.{ts,tsx}' })); expect(result.isError).toBeFalsy(); - expect(execArgs(exec)).toContain('*.{ts,tsx}'); + // The pattern must NOT be passed as a positive --glob — that would + // override ignore-file logic. Filtering happens in-process. + expect(execArgs(exec)).not.toContain('*.{ts,tsx}'); expect(result.output).toContain('a.ts'); expect(result.output).toContain('shared.ts'); expect(result.output).toContain('shared.tsx'); + expect(result.output).not.toContain('b.js'); }); - it('passes an escaped-brace pattern through unchanged so literal-brace files stay matchable', async () => { + it('matches an escaped-brace pattern in-process so literal-brace files stay matchable', async () => { // `\{a,b\}.ts` opts out of brace expansion — the user wants a file - // literally named `{a,b}.ts`. The pattern must reach rg with the escapes - // intact (the tool must not strip or reinterpret the backslashes). - const exec = execReturning('/workspace/{a,b}.ts\n'); + // literally named `{a,b}.ts`. The pattern is matched in-process, so the + // escapes are handled by picomatch, not rg. + const exec = execReturning('/workspace/{a,b}.ts\n/workspace/other.ts\n'); const tool = new GlobTool(kaosWithExec(exec), workspace); const result = await executeTool(tool, context({ pattern: '\\{a,b\\}.ts' })); expect(result.isError).toBeFalsy(); - expect(execArgs(exec)).toContain('\\{a,b\\}.ts'); + expect(execArgs(exec)).not.toContain('\\{a,b\\}.ts'); expect(result.output).toContain('{a,b}.ts'); + expect(result.output).not.toContain('other.ts'); }); it('searches only the current workspace when path is omitted', async () => { @@ -229,6 +235,8 @@ describe('GlobTool', () => { const result = await executeTool(tool, context({ pattern: 'pkg/**/*.ts', path: '/extra' })); expect(result.output).toBe('/extra/pkg/a.ts'); + // The search path is always `.` (pinned to the search root via cwd). + // A derived subdirectory path would override ignore rules in rg. expect(execArgs(exec).at(-1)).toBe('.'); }); @@ -250,6 +258,60 @@ describe('GlobTool', () => { expect(execArgs(exec)).not.toContain('--no-ignore'); }); + it('does not emit a positive --glob for broad all-file patterns', async () => { + for (const pattern of ['*', '**', '**/*'] as const) { + const exec = execReturning('/workspace/a.ts\n'); + const tool = new GlobTool(kaosWithExec(exec), workspace); + + await executeTool(tool, context({ pattern })); + + const args = execArgs(exec); + expect(args).not.toContain(pattern); + expect(args).toContain('--glob'); + expect(args.some((arg) => arg.startsWith('!'))).toBe(true); + } + }); + + it('treats an empty pattern as a broad all-files glob, matching rg --glob', async () => { + // rg treats -g '' as matching all files (respecting ignores). picomatch + // throws on an empty string, so the tool must short-circuit before + // compiling the matcher. + const exec = execReturning('/workspace/a.ts\n/workspace/b.ts\n'); + const tool = new GlobTool(kaosWithExec(exec), workspace); + + const result = await executeTool(tool, context({ pattern: '', path: '/workspace' })); + + expect(result.isError).toBeFalsy(); + expect(result.output).toContain('a.ts'); + expect(result.output).toContain('b.ts'); + }); + + it('filters anchored patterns in-process without a positive --glob', async () => { + const exec = execReturning('/workspace/src/a.ts\n/workspace/other/b.ts\n'); + const tool = new GlobTool(kaosWithExec(exec), workspace); + + const result = await executeTool(tool, context({ pattern: 'src/**/*.ts' })); + + expect(execArgs(exec)).not.toContain('src/**/*.ts'); + expect(result.output).toContain('src/a.ts'); + expect(result.output).not.toContain('other/b.ts'); + }); + + it('adds --no-require-git when the search root is outside a git repo', async () => { + const exec = execReturning('/workspace/a.ts\n'); + const stat = vi.fn(async (candidate: string) => { + if (candidate.endsWith('/.git')) { + throw Object.assign(new Error('ENOENT: no such file or directory'), { code: 'ENOENT' }); + } + return dirStat(); + }); + const tool = new GlobTool(createFakeKaos({ exec, stat }), workspace); + + await executeTool(tool, context({ pattern: '*.ts', path: '/workspace' })); + + expect(execArgs(exec)).toContain('--no-require-git'); + }); + it('caps returned matches and surfaces the truncation header', async () => { const stdout = Array.from({ length: MAX_MATCHES + 1 }, (_, i) => `/workspace/${String(i)}.ts`).join('\n') + @@ -291,7 +353,7 @@ describe('GlobTool', () => { }); it('filters sensitive files from results', async () => { - const exec = execReturning('/workspace/.env\n/workspace/src/a.ts\n'); + const exec = execReturning('/workspace/src/.env\n/workspace/src/a.ts\n'); const tool = new GlobTool(kaosWithExec(exec), workspace); const result = await executeTool(tool, context({ pattern: 'src/**' })); @@ -358,15 +420,16 @@ describe('GlobTool', () => { }); it('walks "**/" prefix patterns with a literal anchor', async () => { - const exec = execReturning('/workspace/a.py\n/workspace/sub/b.py\n'); + const exec = execReturning('/workspace/a.py\n/workspace/sub/b.py\n/workspace/other.txt\n'); const tool = new GlobTool(kaosWithExec(exec), workspace); const result = await executeTool(tool, context({ pattern: '**/*.py' })); expect(result.isError).toBeFalsy(); - expect(execArgs(exec)).toContain('**/*.py'); + expect(execArgs(exec)).not.toContain('**/*.py'); expect(result.output).toContain('a.py'); expect(result.output).toContain('sub/b.py'); + expect(result.output).not.toContain('other.txt'); }); it('walks safe recursive patterns with a literal subdirectory anchor', async () => { @@ -402,6 +465,28 @@ describe('GlobTool', () => { expect(result.output).toContain('No matches found'); }); + it('filters the pattern before the stdout cap so rare matches are not starved', async () => { + // Simulate >10MB of non-matching paths followed by a matching path. + // Without the streaming line filter, the cap would truncate before the + // match and the tool would report "No matches found". + const nonMatching = Array.from( + { length: 200_000 }, + (_, i) => `/workspace/noise_${String(i)}.txt`, + ).join('\n'); + const stdout = nonMatching + '\n/workspace/rare/deep/match.ts\n'; + const exec = execReturning(stdout); + const tool = new GlobTool(kaosWithExec(exec), workspace); + + const result = await executeTool( + tool, + context({ pattern: 'rare/**/*.ts', path: '/workspace' }), + ); + + expect(result.isError).toBeFalsy(); + expect(result.output).toContain('rare/deep/match.ts'); + expect(result.output).not.toContain('noise_'); + }); + it('keeps complete paths and surfaces a warning when rg exits 2 after traversal errors', async () => { const exec = execReturning( '/workspace/a.ts\n/workspace/src/b.ts\n', @@ -419,14 +504,493 @@ describe('GlobTool', () => { expect(result.output).toContain('Permission denied'); }); - it('keeps ripgrep errors hard failures when no complete path is produced', async () => { - const exec = execReturning('', 'error: invalid glob', 2); + it('rejects malformed glob patterns before running ripgrep', async () => { + const exec = vi.fn(); const tool = new GlobTool(kaosWithExec(exec), workspace); const result = await executeTool(tool, context({ pattern: '[', path: '/workspace' })); expect(result).toMatchObject({ isError: true }); - expect(result.output).toContain('Glob failed: error: invalid glob'); + expect(result.output).toContain('Invalid glob pattern'); + expect(result.output).toContain('unclosed'); + expect(exec).not.toHaveBeenCalled(); + }); + + it('rejects malformed glob patterns with unclosed braces', async () => { + const exec = vi.fn(); + const tool = new GlobTool(kaosWithExec(exec), workspace); + + const result = await executeTool(tool, context({ pattern: '*.{ts,tsx', path: '/workspace' })); + + expect(result).toMatchObject({ isError: true }); + expect(result.output).toContain('unclosed'); + expect(exec).not.toHaveBeenCalled(); + }); + + it('rejects empty character classes like []', async () => { + const exec = vi.fn(); + const tool = new GlobTool(kaosWithExec(exec), workspace); + + const result = await executeTool(tool, context({ pattern: '[]', path: '/workspace' })); + + expect(result).toMatchObject({ isError: true }); + expect(result.output).toContain('Invalid glob pattern'); + expect(result.output).toContain('unclosed'); + expect(exec).not.toHaveBeenCalled(); + }); + + it('rejects negated empty character classes like [!]', async () => { + const exec = vi.fn(); + const tool = new GlobTool(kaosWithExec(exec), workspace); + + const result = await executeTool(tool, context({ pattern: '[!]', path: '/workspace' })); + + expect(result).toMatchObject({ isError: true }); + expect(result.output).toContain('Invalid glob pattern'); + expect(result.output).toContain('unclosed'); + expect(exec).not.toHaveBeenCalled(); + }); + + it('rejects caret empty character classes like [^]', async () => { + const exec = vi.fn(); + const tool = new GlobTool(kaosWithExec(exec), workspace); + + const result = await executeTool(tool, context({ pattern: '[^]', path: '/workspace' })); + + expect(result).toMatchObject({ isError: true }); + expect(result.output).toContain('Invalid glob pattern'); + expect(result.output).toContain('unclosed'); + expect(exec).not.toHaveBeenCalled(); + }); + + it('rejects invalid character ranges like [z-a]', async () => { + const exec = vi.fn(); + const tool = new GlobTool(kaosWithExec(exec), workspace); + + const result = await executeTool(tool, context({ pattern: '[z-a]', path: '/workspace' })); + + expect(result).toMatchObject({ isError: true }); + expect(result.output).toContain('Invalid glob pattern'); + expect(result.output).toContain('invalid range'); + expect(exec).not.toHaveBeenCalled(); + }); + + it('rejects a dangling trailing backslash', async () => { + const exec = vi.fn(); + const tool = new GlobTool(kaosWithExec(exec), workspace); + + const result = await executeTool(tool, context({ pattern: 'foo\\', path: '/workspace' })); + + expect(result).toMatchObject({ isError: true }); + expect(result.output).toContain('Invalid glob pattern'); + expect(result.output).toContain('dangling'); + expect(exec).not.toHaveBeenCalled(); + }); + + it('accepts brace-in-bracket patterns like [{]foo.ts', async () => { + const exec = execReturning('/workspace/{foo.ts\n'); + const tool = new GlobTool(kaosWithExec(exec), workspace); + + const result = await executeTool( + tool, + context({ pattern: '[{]foo.ts', path: '/workspace' }), + ); + + expect(result.isError).toBeFalsy(); + expect(result.output).toContain('{foo.ts'); + }); + + it('accepts bracket-in-bracket patterns like [[]foo.ts', async () => { + const exec = execReturning('/workspace/[foo.ts\n'); + const tool = new GlobTool(kaosWithExec(exec), workspace); + + const result = await executeTool( + tool, + context({ pattern: '[[]foo.ts', path: '/workspace' }), + ); + + expect(result.isError).toBeFalsy(); + expect(result.output).toContain('[foo.ts'); + }); + + it('treats extglob syntax as literal, matching rg --glob behavior', async () => { + const exec = execReturning('/workspace/@(a|b).ts\n/workspace/a.ts\n/workspace/b.ts\n'); + const tool = new GlobTool(kaosWithExec(exec), workspace); + + const result = await executeTool( + tool, + context({ pattern: '@(a|b).ts', path: '/workspace' }), + ); + + expect(result.isError).toBeFalsy(); + expect(result.output).toContain('@(a|b).ts'); + expect(result.output).not.toContain('a.ts'); + expect(result.output).not.toContain('b.ts'); + }); + + it('preserves * and ? wildcards before literal parentheses, matching rg --glob', async () => { + // rg treats `*` and `?` as regular wildcards and `(` as a literal + // character — NOT as extglob prefixes. So `*(bar).ts` matches any file + // ending in `(bar).ts` (the `*` matches any prefix), including + // `foo123(bar).ts` which the old `[*]\(bar\).ts` escape would NOT match. + const exec = execReturning( + '/workspace/foo*(bar).ts\n/workspace/foo123(bar).ts\n/workspace/x(bar).ts\n/workspace/(bar).ts\n/workspace/other.ts\n', + ); + const tool = new GlobTool(kaosWithExec(exec), workspace); + + const result = await executeTool( + tool, + context({ pattern: '*(bar).ts', path: '/workspace' }), + ); + + expect(result.isError).toBeFalsy(); + const lines = (result.output as string).split('\n'); + expect(lines).toContain('foo*(bar).ts'); + expect(lines).toContain('foo123(bar).ts'); + expect(lines).toContain('x(bar).ts'); + expect(lines).toContain('(bar).ts'); + expect(lines).not.toContain('other.ts'); + }); + + it('preserves ? wildcard before literal parentheses, matching rg --glob', async () => { + // `?(bar).ts` — `?` is a single-char wildcard, `(` is literal. Matches + // `x(bar).ts` (one char) and `?(bar).ts` (literal ?), but not + // `yz(bar).ts` (two chars) or `(bar).ts` (zero chars). + const exec = execReturning( + '/workspace/x(bar).ts\n/workspace/?(bar).ts\n/workspace/yz(bar).ts\n/workspace/(bar).ts\n', + ); + const tool = new GlobTool(kaosWithExec(exec), workspace); + + const result = await executeTool( + tool, + context({ pattern: '?(bar).ts', path: '/workspace' }), + ); + + expect(result.isError).toBeFalsy(); + const lines = (result.output as string).split('\n'); + expect(lines).toContain('x(bar).ts'); + expect(lines).toContain('?(bar).ts'); + expect(lines).not.toContain('yz(bar).ts'); + expect(lines).not.toContain('(bar).ts'); + }); + + it('treats range braces as a single alternative, matching rg --glob behavior', async () => { + // rg treats `{1..2}` as a single brace alternative, removing the braces + // (matching `1..2`, not `{1..2}` and not the range 1..2). picomatch 4.x + // would either expand the range or treat the braces as literal, so the + // in-process matcher collapses single-alternative braces to match rg. + const exec = execReturning( + '/workspace/1..2.ts\n/workspace/{1..2}.ts\n/workspace/1.ts\n/workspace/2.ts\n', + ); + const tool = new GlobTool(kaosWithExec(exec), workspace); + + const result = await executeTool( + tool, + context({ pattern: '{1..2}.ts', path: '/workspace' }), + ); + + expect(result.isError).toBeFalsy(); + expect(result.output).toContain('1..2.ts'); + expect(result.output).not.toContain('{1..2}.ts'); + // `1.ts` and `2.ts` (standalone) should not match — only `1..2.ts`. + // Use line-level checks to avoid substring false positives. + const lines = (result.output as string).split('\n'); + expect(lines).toContain('1..2.ts'); + expect(lines).not.toContain('1.ts'); + expect(lines).not.toContain('2.ts'); + expect(lines).not.toContain('{1..2}.ts'); + }); + + it('preserves braces inside character classes, matching rg --glob behavior', async () => { + // rg treats `[{a}].ts` as a character class containing `{`, `a`, `}` — + // matching `{.ts`, `}.ts`, and `a.ts`, but NOT `{a}.ts`. The brace + // rewrite must not strip braces inside `[]`. + const exec = execReturning( + '/workspace/{.ts\n/workspace/}.ts\n/workspace/a.ts\n/workspace/{a}.ts\n', + ); + const tool = new GlobTool(kaosWithExec(exec), workspace); + + const result = await executeTool( + tool, + context({ pattern: '[{a}].ts', path: '/workspace' }), + ); + + expect(result.isError).toBeFalsy(); + const lines = (result.output as string).split('\n'); + expect(lines).toContain('{.ts'); + expect(lines).toContain('}.ts'); + expect(lines).toContain('a.ts'); + expect(lines).not.toContain('{a}.ts'); + }); + + it('drops empty brace alternatives, matching rg --glob behavior', async () => { + // rg drops empty alternatives: `ab{,c}` matches `abc` only, not `ab`. + // picomatch expands the empty arm, so the rewrite must filter it out. + const exec = execReturning('/workspace/ab\n/workspace/abc\n/workspace/abd\n'); + const tool = new GlobTool(kaosWithExec(exec), workspace); + + const result = await executeTool( + tool, + context({ pattern: 'ab{,c}', path: '/workspace' }), + ); + + expect(result.isError).toBeFalsy(); + const lines = (result.output as string).split('\n'); + expect(lines).toContain('abc'); + expect(lines).not.toContain('ab'); + }); + + it('drops multiple empty brace alternatives, matching rg --glob behavior', async () => { + // `ab{c,,d}` — rg keeps only `c` and `d`, dropping the empty arm. + const exec = execReturning('/workspace/ab\n/workspace/abc\n/workspace/abd\n'); + const tool = new GlobTool(kaosWithExec(exec), workspace); + + const result = await executeTool( + tool, + context({ pattern: 'ab{c,,d}', path: '/workspace' }), + ); + + expect(result.isError).toBeFalsy(); + const lines = (result.output as string).split('\n'); + expect(lines).toContain('abc'); + expect(lines).toContain('abd'); + expect(lines).not.toContain('ab'); + }); + + it('collapses all-empty brace alternatives to the prefix, matching rg', async () => { + // `ab{,}` — all alternatives empty, rg strips to `ab`. + const exec = execReturning('/workspace/ab\n/workspace/abc\n'); + const tool = new GlobTool(kaosWithExec(exec), workspace); + + const result = await executeTool( + tool, + context({ pattern: 'ab{,}', path: '/workspace' }), + ); + + expect(result.isError).toBeFalsy(); + const lines = (result.output as string).split('\n'); + expect(lines).toContain('ab'); + expect(lines).not.toContain('abc'); + }); + + it('escapes POSIX bracket classes, matching rg --glob behavior', async () => { + // rg treats `[:` inside `[]` as literal characters, not a POSIX class. + // So `[[:digit:]].ts` matches `d].ts` and `:].ts` (char class `[ : d i g i t ]` + // then literal `]`), but NOT `1.ts`. picomatch would interpret `[:digit:]` + // as a POSIX digit class, so the `:` after `[` is escaped. + const exec = execReturning( + '/workspace/1.ts\n/workspace/a.ts\n/workspace/d].ts\n/workspace/:].ts\n', + ); + const tool = new GlobTool(kaosWithExec(exec), workspace); + + const result = await executeTool( + tool, + context({ pattern: '[[:digit:]].ts', path: '/workspace' }), + ); + + expect(result.isError).toBeFalsy(); + const lines = (result.output as string).split('\n'); + expect(lines).toContain('d].ts'); + expect(lines).toContain(':].ts'); + expect(lines).not.toContain('1.ts'); + }); + + it('skips extglob rewrites inside character classes, matching rg', async () => { + // `[@(a)].ts` — rg treats `@`, `(`, `a`, `)` as literal class members. + // The extglob rewrite must not fire inside `[]`. + const exec = execReturning( + '/workspace/@.ts\n/workspace/(.ts\n/workspace/a.ts\n/workspace/).ts\n/workspace/@(a).ts\n', + ); + const tool = new GlobTool(kaosWithExec(exec), workspace); + + const result = await executeTool( + tool, + context({ pattern: '[@(a)].ts', path: '/workspace' }), + ); + + expect(result.isError).toBeFalsy(); + const lines = (result.output as string).split('\n'); + expect(lines).toContain('@.ts'); + expect(lines).toContain('(.ts'); + expect(lines).toContain('a.ts'); + expect(lines).toContain(').ts'); + expect(lines).not.toContain('@(a).ts'); + }); + + it('escapes range arms inside brace alternatives, matching rg', async () => { + // `{1..2,3}.ts` — rg matches `1..2.ts` and `3.ts`, treating `1..2` as + // a literal arm. picomatch would expand `1..2` as a range (1, 2), so + // the dots are replaced with `[.]` to prevent range expansion. + const exec = execReturning( + '/workspace/1.ts\n/workspace/2.ts\n/workspace/3.ts\n/workspace/1..2.ts\n', + ); + const tool = new GlobTool(kaosWithExec(exec), workspace); + + const result = await executeTool( + tool, + context({ pattern: '{1..2,3}.ts', path: '/workspace' }), + ); + + expect(result.isError).toBeFalsy(); + const lines = (result.output as string).split('\n'); + expect(lines).toContain('1..2.ts'); + expect(lines).toContain('3.ts'); + expect(lines).not.toContain('1.ts'); + expect(lines).not.toContain('2.ts'); + }); + + it('converts [! to [^ for negated character classes, matching rg', async () => { + // rg (gitignore semantics) uses `[!` for negation; picomatch uses `[^`. + // `[!a].ts` should match `b.ts`, `c.ts`, etc. but NOT `a.ts`. + const exec = execReturning( + '/workspace/a.ts\n/workspace/b.ts\n/workspace/c.ts\n', + ); + const tool = new GlobTool(kaosWithExec(exec), workspace); + + const result = await executeTool( + tool, + context({ pattern: '[!a].ts', path: '/workspace' }), + ); + + expect(result.isError).toBeFalsy(); + const lines = (result.output as string).split('\n'); + expect(lines).toContain('b.ts'); + expect(lines).toContain('c.ts'); + expect(lines).not.toContain('a.ts'); + }); + + it('rejects unmatched closing braces, matching rg', async () => { + // rg errors on `a}.ts` — unopened alternate group. + const exec = execReturning('/workspace/a.ts\n/workspace/a}.ts\n'); + const tool = new GlobTool(kaosWithExec(exec), workspace); + + const result = await executeTool( + tool, + context({ pattern: 'a}.ts', path: '/workspace' }), + ); + + expect(result.isError).toBe(true); + expect(result.output).toContain('unopened'); + }); + + it('strips ./ prefix from glob patterns, matching rg --glob behavior', async () => { + // rg treats `./src/*.ts` as not matching `src/a.ts` because the glob + // subject is `src/a.ts` (no `./` prefix). Stripping `./` from the + // pattern and matching against the un-prefixed relative path keeps + // the in-process filter consistent with rg. + const exec = execReturning('/workspace/src/a.ts\n/workspace/other/b.ts\n'); + const tool = new GlobTool(kaosWithExec(exec), workspace); + + const result = await executeTool( + tool, + context({ pattern: './src/*.ts', path: '/workspace' }), + ); + + expect(result.isError).toBeFalsy(); + expect(result.output).toContain('src/a.ts'); + expect(result.output).not.toContain('other/b.ts'); + }); + + it('escapes bare parenthesis alternation, matching rg --glob behavior', async () => { + // rg treats `(`, `|`, `)` as literal characters. `(a|b).ts` matches + // only the literal filename `(a|b).ts`, not `a.ts` or `b.ts`. + const exec = execReturning( + '/workspace/(a|b).ts\n/workspace/a.ts\n/workspace/b.ts\n', + ); + const tool = new GlobTool(kaosWithExec(exec), workspace); + + const result = await executeTool( + tool, + context({ pattern: '(a|b).ts', path: '/workspace' }), + ); + + expect(result.isError).toBeFalsy(); + const lines = (result.output as string).split('\n'); + expect(lines).toContain('(a|b).ts'); + expect(lines).not.toContain('a.ts'); + expect(lines).not.toContain('b.ts'); + }); + + it('always searches from `.` so derived paths cannot override ignore rules', async () => { + const exec = execReturning('/workspace/src/a.ts\n/workspace/other/b.ts\n'); + const tool = new GlobTool(kaosWithExec(exec), workspace); + + const result = await executeTool( + tool, + context({ pattern: 'src/**/*.ts', path: '/workspace' }), + ); + + expect(result.isError).toBeFalsy(); + expect(result.output).toContain('src/a.ts'); + // The search path is always `.` — a derived subdirectory path would + // override rg's ignore rules (command-line paths are authoritative). + expect(execArgs(exec).at(-1)).toBe('.'); + }); + + it('matches rooted patterns with a leading slash', async () => { + const exec = execReturning('/workspace/src/a.ts\n/workspace/other/b.ts\n'); + const tool = new GlobTool(kaosWithExec(exec), workspace); + + const result = await executeTool( + tool, + context({ pattern: '/src/*.ts', path: '/workspace' }), + ); + + expect(result.isError).toBeFalsy(); + expect(result.output).toContain('src/a.ts'); + expect(result.output).not.toContain('other/b.ts'); + }); + + it('rooted basename pattern only matches at the search root', async () => { + const exec = execReturning( + '/workspace/foo.ts\n/workspace/sub/foo.ts\n/workspace/deep/nested/foo.ts\n', + ); + const tool = new GlobTool(kaosWithExec(exec), workspace); + + const result = await executeTool( + tool, + context({ pattern: '/foo.ts', path: '/workspace' }), + ); + + expect(result.isError).toBeFalsy(); + expect(result.output).toContain('foo.ts'); + expect(result.output).not.toContain('sub/foo.ts'); + expect(result.output).not.toContain('deep/nested/foo.ts'); + }); + + it('decodes multibyte filenames split across stream chunks', async () => { + // Split a multibyte filename across two chunks so naive buf.toString + // would produce a replacement character. + const fullLine = '/workspace/src/é.ts\n'; + const fullBuf = Buffer.from(fullLine, 'utf8'); + const splitPoint = fullBuf.indexOf(0xc3); // first byte of é (0xc3 0xa9) + const chunk1 = fullBuf.subarray(0, splitPoint + 1); // splits the multibyte char + const chunk2 = fullBuf.subarray(splitPoint + 1); + const stdoutStream = Readable.from([chunk1, chunk2]); + const exec = vi.fn().mockResolvedValue({ + ...processWithOutput(''), + stdout: stdoutStream, + }); + const tool = new GlobTool(kaosWithExec(exec), workspace); + + const result = await executeTool( + tool, + context({ pattern: 'src/*.ts', path: '/workspace' }), + ); + + expect(result.isError).toBeFalsy(); + expect(result.output).toContain('src/é.ts'); + expect(result.output).not.toContain('\uFFFD'); + }); + + it('surfaces ripgrep errors when no complete path is produced', async () => { + const exec = execReturning('', 'error: something went wrong', 2); + const tool = new GlobTool(kaosWithExec(exec), workspace); + + const result = await executeTool(tool, context({ pattern: '*.ts', path: '/workspace' })); + + expect(result).toMatchObject({ isError: true }); + expect(result.output).toContain('Glob failed: error: something went wrong'); }); it('reports "does not exist" when the search directory is missing', async () => { @@ -456,14 +1020,15 @@ describe('GlobTool', () => { }); it('walks "**/" patterns with literal subdirectory anchors after the prefix', async () => { - const exec = execReturning('/workspace/src/main/app.py\n'); + const exec = execReturning('/workspace/src/main/app.py\n/workspace/other/x.py\n'); const tool = new GlobTool(kaosWithExec(exec), workspace); const result = await executeTool(tool, context({ pattern: '**/main/*.py' })); expect(result.isError).toBeFalsy(); - expect(execArgs(exec)).toContain('**/main/*.py'); + expect(execArgs(exec)).not.toContain('**/main/*.py'); expect(result.output).toContain('src/main/app.py'); + expect(result.output).not.toContain('other/x.py'); }); it('matches dotfiles like .gitlab-ci.yml under a simple "*.yml" pattern', async () => { @@ -476,6 +1041,21 @@ describe('GlobTool', () => { expect(result.output).toContain('config.yml'); }); + it('keeps files whose names start with two dots under the search root', async () => { + // A file like `..config/a.ts` is under the root — its relative path + // starts with `..` but is not `..` or `../`. The old `startsWith('..')` + // check would drop it; the fixed check only rejects actual escapes. + const exec = execReturning('/workspace/..config/a.ts\n/workspace/..foo.ts\n/workspace/src/b.ts\n'); + const tool = new GlobTool(kaosWithExec(exec), workspace); + + const result = await executeTool(tool, context({ pattern: '**/*.ts', path: '/workspace' })); + + expect(result.isError).toBeFalsy(); + expect(result.output).toContain('..config/a.ts'); + expect(result.output).toContain('..foo.ts'); + expect(result.output).toContain('src/b.ts'); + }); + it('descends into hidden directories under a recursive pattern', async () => { const exec = execReturning('/workspace/src/.config/settings.yml\n'); const tool = new GlobTool(kaosWithExec(exec), workspace); @@ -576,6 +1156,165 @@ describe('GlobTool', () => { expect(tool.description).toContain('C:\\Users\\foo'); expect(tool.description).toContain('/c/Users/foo'); }); + + it('treats leading ! as rg exclusion marker, not extglob prefix', async () => { + // rg --glob '!(a).ts' excludes files matching `(a).ts` and includes + // everything else. The leading `!` is an exclusion marker, not a + // picomatch extglob prefix. + const exec = execReturning( + '/workspace/(a).ts\n/workspace/b.ts\n/workspace/c.ts\n', + ); + const tool = new GlobTool(kaosWithExec(exec), workspace); + + const result = await executeTool( + tool, + context({ pattern: '!(a).ts', path: '/workspace' }), + ); + + expect(result.isError).toBeFalsy(); + const lines = (result.output as string).split('\n'); + expect(lines).toContain('b.ts'); + expect(lines).toContain('c.ts'); + expect(lines).not.toContain('(a).ts'); + }); + + it('negated glob with * excludes all matching files', async () => { + // !*.ts should exclude all .ts files and include only non-.ts files. + const exec = execReturning( + '/workspace/a.ts\n/workspace/b.ts\n/workspace/c.js\n', + ); + const tool = new GlobTool(kaosWithExec(exec), workspace); + + const result = await executeTool( + tool, + context({ pattern: '!*.ts', path: '/workspace' }), + ); + + expect(result.isError).toBeFalsy(); + const lines = (result.output as string).split('\n'); + expect(lines).toContain('c.js'); + expect(lines).not.toContain('a.ts'); + expect(lines).not.toContain('b.ts'); + }); + + it('treats a bare ! as an empty exclusion glob', async () => { + const exec = execReturning('/workspace/a.ts\n/workspace/b.js\n'); + const tool = new GlobTool(kaosWithExec(exec), workspace); + + const result = await executeTool( + tool, + context({ pattern: '!', path: '/workspace' }), + ); + + expect(result.isError).toBeFalsy(); + expect(result.output).toContain('No matches'); + expect(result.output).not.toContain('a.ts'); + expect(result.output).not.toContain('b.js'); + }); + + it('preserves rooted exclusions after stripping the exclusion marker', async () => { + const exec = execReturning( + '/workspace/foo.ts\n/workspace/src/foo.ts\n/workspace/bar.ts\n', + ); + const tool = new GlobTool(kaosWithExec(exec), workspace); + + const result = await executeTool( + tool, + context({ pattern: '!/foo.ts', path: '/workspace' }), + ); + + expect(result.isError).toBeFalsy(); + const lines = (result.output as string).split('\n'); + expect(lines).toContain('src/foo.ts'); + expect(lines).toContain('bar.ts'); + expect(lines).not.toContain('foo.ts'); + }); + + it('drops escapes before ordinary gitignore glob characters', async () => { + const exec = execReturning( + '/workspace/foobar\n/workspace/foo\\bar\n/workspace/foo/bar\n', + ); + const tool = new GlobTool(kaosWithExec(exec), workspace); + + const result = await executeTool( + tool, + context({ pattern: 'foo\\bar', path: '/workspace' }), + ); + + expect(result.isError).toBeFalsy(); + const lines = (result.output as string).split('\n'); + expect(lines).toContain('foobar'); + expect(lines).not.toContain('foo\\bar'); + expect(lines).not.toContain('foo/bar'); + }); + + it('escapes nested literal parentheses and pipes like rg glob syntax', async () => { + const exec = execReturning( + '/workspace/(a(b|c)).ts\n/workspace/(ab|c).ts\n/workspace/ac.ts\n', + ); + const tool = new GlobTool(kaosWithExec(exec), workspace); + + const result = await executeTool( + tool, + context({ pattern: '(a(b|c)).ts', path: '/workspace' }), + ); + + expect(result.isError).toBeFalsy(); + const lines = (result.output as string).split('\n'); + expect(lines).toContain('(a(b|c)).ts'); + expect(lines).not.toContain('(ab|c).ts'); + expect(lines).not.toContain('ac.ts'); + }); + + it('preprocesses comment and trailing-space globs like gitignore lines', async () => { + const commentExec = execReturning('/workspace/#foo\n/workspace/foo\n'); + const commentTool = new GlobTool(kaosWithExec(commentExec), workspace); + + const commentResult = await executeTool( + commentTool, + context({ pattern: '#foo', path: '/workspace' }), + ); + + expect(commentResult.isError).toBeFalsy(); + expect(commentResult.output).toContain('#foo'); + expect(commentResult.output).toContain('foo'); + + const spaceExec = execReturning('/workspace/foo\n/workspace/foo \n'); + const spaceTool = new GlobTool(kaosWithExec(spaceExec), workspace); + + const spaceResult = await executeTool( + spaceTool, + context({ pattern: 'foo ', path: '/workspace' }), + ); + + expect(spaceResult.isError).toBeFalsy(); + const lines = (spaceResult.output as string).split('\n'); + expect(lines).toContain('foo'); + expect(lines).not.toContain('foo '); + }); + + it('preserves escaped comma as a literal brace arm, matching rg', async () => { + // rg treats `{\,,a}.ts` as two arms: `\,` (literal comma) and `a`. + // It matches `,.ts` and `a.ts`. The naive split on `,` would break + // the escaped comma into an empty arm and corrupt the group. + const exec = execReturning( + '/workspace/,.ts\n/workspace/a.ts\n/workspace/b.ts\n', + ); + const tool = new GlobTool(kaosWithExec(exec), workspace); + + const result = await executeTool( + tool, + context({ pattern: '{\\,,a}.ts', path: '/workspace' }), + ); + + expect(result.isError).toBeFalsy(); + const lines = (result.output as string).split('\n'); + expect(lines).toContain('a.ts'); + // The literal-comma arm matches `,.ts` — picomatch treats `\,` as + // a literal comma after brace expansion. + expect(lines).toContain(',.ts'); + expect(lines).not.toContain('b.ts'); + }); }); describe('splitCompletePaths', () => { @@ -727,4 +1466,138 @@ describe('GlobTool integration (real ripgrep)', () => { await fs.rm(externalDir, { recursive: true, force: true }); } }); + + it('respects .gitignore by default for broad patterns in a git repo', async () => { + await touch('kept.ts', new Date('2024-01-01T00:00:00Z')); + await touch('ignored.log', new Date('2024-01-01T00:00:00Z')); + await fs.writeFile(path.join(tmpDir!, '.gitignore'), '*.log\n'); + await fs.mkdir(path.join(tmpDir!, '.git'), { recursive: true }); + const tool = new GlobTool(kaos, ws()); + + const result = await executeTool(tool, context({ pattern: '*', path: tmpDir! })); + + expect(result.output).toContain('kept.ts'); + expect(result.output).not.toContain('ignored.log'); + }); + + it('respects .gitignore by default in a non-git directory', async () => { + await touch('kept.ts', new Date('2024-01-01T00:00:00Z')); + await touch('ignored.log', new Date('2024-01-01T00:00:00Z')); + await fs.writeFile(path.join(tmpDir!, '.gitignore'), '*.log\n'); + const tool = new GlobTool(kaos, ws()); + + const result = await executeTool(tool, context({ pattern: '*', path: tmpDir! })); + + expect(result.output).toContain('kept.ts'); + expect(result.output).not.toContain('ignored.log'); + }); + + it('respects .gitignore for specific patterns that would re-include ignored files', async () => { + // A positive --glob overrides ignore logic in ripgrep, so + // Glob({ pattern: '*.ts' }) in a repo with .gitignore containing + // *.ts would surface ignored.ts. The in-process filter avoids this + // by letting rg --files enumerate non-ignored files first. + await touch('kept.ts', new Date('2024-01-01T00:00:00Z')); + await touch('ignored.ts', new Date('2024-01-01T00:00:00Z')); + await fs.writeFile(path.join(tmpDir!, '.gitignore'), '*.ts\n'); + await fs.mkdir(path.join(tmpDir!, '.git'), { recursive: true }); + const tool = new GlobTool(kaos, ws()); + + const result = await executeTool(tool, context({ pattern: '*.ts', path: tmpDir! })); + + expect(result.output).toContain('No matches'); + expect(result.output).not.toContain('kept.ts'); + expect(result.output).not.toContain('ignored.ts'); + }); + + it('respects .gitignore for specific patterns in a non-git directory', async () => { + await touch('kept.ts', new Date('2024-01-01T00:00:00Z')); + await touch('ignored.ts', new Date('2024-01-01T00:00:00Z')); + await fs.writeFile(path.join(tmpDir!, '.gitignore'), '*.ts\n'); + const tool = new GlobTool(kaos, ws()); + + const result = await executeTool(tool, context({ pattern: '*.ts', path: tmpDir! })); + + expect(result.output).toContain('No matches'); + expect(result.output).not.toContain('kept.ts'); + expect(result.output).not.toContain('ignored.ts'); + }); + + it('respects .gitignore for anchored patterns pointing at an ignored dir', async () => { + // A pattern like `dist/**/*.js` must not surface files from a gitignored + // `dist/`. Passing the derived prefix `dist` as the rg PATH would override + // ignore rules (command-line paths are authoritative in rg), so the tool + // always searches from `.` and lets the in-process filter narrow results. + await touch('src/a.ts', new Date('2024-01-01T00:00:00Z')); + await touch('dist/bundle.js', new Date('2024-01-01T00:00:00Z')); + await fs.writeFile(path.join(tmpDir!, '.gitignore'), 'dist/\n'); + await fs.mkdir(path.join(tmpDir!, '.git'), { recursive: true }); + const tool = new GlobTool(kaos, ws()); + + const result = await executeTool(tool, context({ pattern: 'dist/**/*.js', path: tmpDir! })); + + expect(result.output).toContain('No matches'); + expect(result.output).not.toContain('dist/bundle.js'); + }); + + it('respects .gitignore for anchored patterns pointing at an ignored dir (include_ignored surfaces them)', async () => { + await touch('src/a.ts', new Date('2024-01-01T00:00:00Z')); + await touch('dist/bundle.js', new Date('2024-01-01T00:00:00Z')); + await fs.writeFile(path.join(tmpDir!, '.gitignore'), 'dist/\n'); + await fs.mkdir(path.join(tmpDir!, '.git'), { recursive: true }); + const tool = new GlobTool(kaos, ws()); + + const result = await executeTool( + tool, + context({ pattern: 'dist/**/*.js', path: tmpDir!, include_ignored: true }), + ); + + expect(result.output).toContain('dist/bundle.js'); + }); + + it('returns no matches for an anchored pattern whose prefix does not exist', async () => { + // `src/**/*.ts` in a repo with no `src` must return "No matches", not + // error out with "does not exist" (which happened when the missing + // prefix was passed as the rg PATH). + await touch('other/a.ts', new Date('2024-01-01T00:00:00Z')); + const tool = new GlobTool(kaos, ws()); + + const result = await executeTool(tool, context({ pattern: 'src/**/*.ts', path: tmpDir! })); + + expect(result.isError).toBeFalsy(); + expect(result.output).toContain('No matches'); + }); + + it('matches wildcard-before-paren patterns like rg --glob (integration)', async () => { + // rg treats `*` as a wildcard and `(` as a literal — `*(bar).ts` matches + // any file ending in `(bar).ts` (the `*` matches any prefix). + await touch('foo*(bar).ts', new Date('2024-01-01T00:00:00Z')); + await touch('foo123(bar).ts', new Date('2023-01-01T00:00:00Z')); + await touch('x(bar).ts', new Date('2022-01-01T00:00:00Z')); + await touch('other.ts', new Date('2021-01-01T00:00:00Z')); + const tool = new GlobTool(kaos, ws()); + + const result = await executeTool(tool, context({ pattern: '*(bar).ts', path: tmpDir! })); + + expect(result.output).toContain('foo*(bar).ts'); + expect(result.output).toContain('foo123(bar).ts'); + expect(result.output).toContain('x(bar).ts'); + expect(result.output).not.toContain('other.ts'); + }); + + it('matches range-brace patterns as a single alternative like rg --glob (integration)', async () => { + // rg treats `{1..2}` as a single brace alternative, matching `1..2` + // (braces removed), not `{1..2}` and not the range 1..2. + await touch('1..2.ts', new Date('2024-01-01T00:00:00Z')); + await touch('{1..2}.ts', new Date('2023-01-01T00:00:00Z')); + await touch('1.ts', new Date('2022-01-01T00:00:00Z')); + const tool = new GlobTool(kaos, ws()); + + const result = await executeTool(tool, context({ pattern: '{1..2}.ts', path: tmpDir! })); + + const lines = (result.output as string).split('\n'); + expect(lines).toContain('1..2.ts'); + expect(lines).not.toContain('{1..2}.ts'); + expect(lines).not.toContain('1.ts'); + }); });