diff --git a/.changeset/redact-secrets-deep-review-followups.md b/.changeset/redact-secrets-deep-review-followups.md new file mode 100644 index 0000000000..0a5f04ab64 --- /dev/null +++ b/.changeset/redact-secrets-deep-review-followups.md @@ -0,0 +1,32 @@ +--- +'@hyperdx/api': patch +--- + +fix(api): tighten redactSecrets after deep-review on #2188 + +Several security/correctness gaps surfaced by deep-review across +two passes on the original redactSecrets PR. + +- The `bearer` value alphabet is now `\S+`. Real-world payloads + carry plenty of opaque non-JWT bearers with `:`, `%`, or quote + chars in them, and any alphabet narrower than `\S+` leaks the + suffix past `[REDACTED]`. RFC 6750's b64token alphabet is a + strict subset of `\S+`. (Same fix subsumes the earlier change + that added `_` to cover JWT signatures.) +- The `basic-auth-url` scheme allowlist now covers + http(s) / ws(s) / ftp / sftp / ssh / postgres(ql) / mysql / + mariadb / mongodb(+srv) / mssql / sqlserver / snowflake / + redis(s) / amqp(s) / kafka(+ssl) / clickhouse / smtp(s) / + ldap(s) / nats. The match is also case-insensitive (RFC 3986 + declares schemes case-insensitive), so `HTTPS://user:pw@host` + no longer bypasses redaction. +- The `llm-vendor-key` pattern now catches OpenAI ("sk-..."), + Anthropic ("sk-ant-..."), and Google Gemini ("AIza..." with 35 + trailing chars). Without Gemini coverage, a Gemini API key in + an observability payload would be exfiltrated to the very + provider that issued it. + +Docstring scopes the redactor explicitly to LLM input. Tests +cover each new shape, the JWT-with-underscore regression, the +opaque-bearer-with-`:` / `%` regressions, the uppercase-scheme +bypass, and the Gemini key shape. diff --git a/packages/api/src/utils/__tests__/redactSecrets.test.ts b/packages/api/src/utils/__tests__/redactSecrets.test.ts index 97589b6448..f829a7a05a 100644 --- a/packages/api/src/utils/__tests__/redactSecrets.test.ts +++ b/packages/api/src/utils/__tests__/redactSecrets.test.ts @@ -110,6 +110,48 @@ describe('redactSecrets', () => { ); }); + it('redacts a JWT-shaped Bearer with underscores in the signature', () => { + // base64url uses both "_" and "-"; a real JWT signature routinely + // contains underscores. \S+ trivially covers them, but historically + // the alphabet excluded "_" and the signature tail leaked past the + // [REDACTED] marker. + const line = + 'Authorization: Bearer eyJhbGciOiJIUzI1NiJ9.eyJzdWIiOiIxIn0.AbC_DeF_GhIjKl'; + const out = redactSecrets(line); + expect(out).toContain('Bearer [REDACTED]'); + expect(out).not.toContain('AbC_DeF_GhIjKl'); + expect(out).not.toContain('GhIjKl'); + }); + + it('redacts an opaque non-JWT bearer that contains a colon', () => { + // Opaque tokens (non-RFC-6750) sometimes use ":" as an internal + // delimiter. An alphabet that excludes ":" terminates the match + // early and leaks the suffix past [REDACTED]. + const out = redactSecrets( + 'Authorization: Bearer abc:def_real-secret-tail', + ); + expect(out).toContain('Authorization: Bearer [REDACTED]'); + expect(out).not.toContain('def_real-secret-tail'); + expect(out).not.toContain('secret-tail'); + }); + + it('redacts a bearer that contains percent-encoded bytes', () => { + // "%" is outside RFC 6750's b64token alphabet but is a legal byte + // inside an opaque bearer. \S+ covers it. + const out = redactSecrets('Authorization: Bearer token%20with%20encoded'); + expect(out).toContain('Authorization: Bearer [REDACTED]'); + expect(out).not.toContain('token%20with%20encoded'); + expect(out).not.toContain('encoded'); + }); + + it('stops a bearer match at whitespace', () => { + // Two tokens separated by whitespace must not collapse into one + // greedy match. Only the first non-whitespace run is the bearer. + const out = redactSecrets('auth: Bearer abc123 trailing-noise'); + expect(out).toContain('Bearer [REDACTED]'); + expect(out).toContain('trailing-noise'); + }); + it('redacts Basic values', () => { expect(redactSecrets('Authorization: Basic dXNlcjpwYXNz')).toContain( 'Basic [REDACTED]', @@ -241,6 +283,100 @@ describe('redactSecrets', () => { const line = 'fetch ssh://git@github.com/acme/repo'; expect(redactSecrets(line)).toBe(line); }); + + it('redacts user:pass in a postgres connection string', () => { + const out = redactSecrets( + 'DATABASE_URL=postgres://app:hunter2@db.internal:5432/orders', + ); + expect(out).toContain( + 'postgres://[REDACTED]:[REDACTED]@db.internal:5432/orders', + ); + expect(out).not.toContain('hunter2'); + }); + + it('redacts user:pass in a postgresql connection string', () => { + const out = redactSecrets( + 'jdbc:postgresql://app:hunter2@db.internal:5432/orders', + ); + expect(out).toContain( + 'postgresql://[REDACTED]:[REDACTED]@db.internal:5432/orders', + ); + expect(out).not.toContain('hunter2'); + }); + + it('redacts user:pass in a mongodb+srv connection string', () => { + const out = redactSecrets( + 'mongo: mongodb+srv://svc:hunter2@cluster.mongodb.net/db', + ); + expect(out).toContain( + 'mongodb+srv://[REDACTED]:[REDACTED]@cluster.mongodb.net/db', + ); + expect(out).not.toContain('hunter2'); + }); + + it('redacts user:pass in mysql, redis, amqp, kafka, clickhouse', () => { + const cases = [ + ['mysql://app:hunter2@db.local/orders', 'mysql'], + ['rediss://default:hunter2@cache.local:6379/0', 'rediss'], + ['amqps://app:hunter2@rabbit.local:5671/vh', 'amqps'], + ['kafka://app:hunter2@broker.local:9092/topic', 'kafka'], + ['clickhouse://default:hunter2@ch.local:9000/default', 'clickhouse'], + ]; + for (const [input, scheme] of cases) { + const out = redactSecrets(input); + expect(out.startsWith(`${scheme}://[REDACTED]:[REDACTED]@`)).toBe(true); + expect(out).not.toContain('hunter2'); + } + }); + + it('redacts user:pass across the extended scheme allowlist', () => { + // Schemes added to keep up with connection strings that show + // up in observability payloads: SQL Server (both spellings), + // Snowflake, Kafka with TLS, WebSocket, mail, SFTP, LDAP, + // NATS, MariaDB. + const cases: Array<[string, string]> = [ + ['mssql://app:hunter2@db.local:1433/orders', 'mssql'], + ['sqlserver://app:hunter2@db.local:1433/orders', 'sqlserver'], + [ + 'snowflake://acct:hunter2@xy12345.snowflakecomputing.com/db', + 'snowflake', + ], + ['kafka+ssl://app:hunter2@broker.local:9093/topic', 'kafka+ssl'], + ['ws://svc:hunter2@ws.local:80/socket', 'ws'], + ['wss://svc:hunter2@ws.local:443/socket', 'wss'], + ['smtp://app:hunter2@mail.local:25', 'smtp'], + ['smtps://app:hunter2@mail.local:465', 'smtps'], + ['sftp://app:hunter2@host.local:22/dir', 'sftp'], + ['ldap://app:hunter2@dir.local:389', 'ldap'], + ['ldaps://app:hunter2@dir.local:636', 'ldaps'], + ['nats://app:hunter2@nats.local:4222', 'nats'], + ['mariadb://app:hunter2@db.local:3306/orders', 'mariadb'], + ]; + for (const [input, scheme] of cases) { + const out = redactSecrets(input); + expect(out.startsWith(`${scheme}://[REDACTED]:[REDACTED]@`)).toBe(true); + expect(out).not.toContain('hunter2'); + } + }); + + it('redacts user:pass when the scheme is uppercase', () => { + // RFC 3986 declares schemes case-insensitive. HTTPS:// must + // redact the same as https:// or an attacker can bypass the + // pattern by upcasing the scheme. + const out = redactSecrets('clone HTTPS://alice:hunter2@example.com/repo'); + expect(out).toContain('HTTPS://[REDACTED]:[REDACTED]@example.com/repo'); + expect(out).not.toContain('hunter2'); + }); + + it('redacts user:pass when the scheme is mixed case', () => { + const out = redactSecrets( + 'load Postgres://app:hunter2@db.internal/orders', + ); + expect(out).toContain( + 'Postgres://[REDACTED]:[REDACTED]@db.internal/orders', + ); + expect(out).not.toContain('hunter2'); + }); }); describe('AWS access keys', () => { @@ -311,14 +447,85 @@ describe('redactSecrets', () => { }); }); + describe('LLM vendor API keys', () => { + it('redacts an OpenAI sk- key', () => { + const key = 'sk-' + 'A'.repeat(48); + const out = redactSecrets(`OPENAI_API_KEY=${key}`); + expect(out).toContain('[REDACTED_LLM_KEY]'); + expect(out).not.toContain(key); + }); + + it('redacts an Anthropic sk-ant- key', () => { + const key = 'sk-ant-' + 'B'.repeat(48); + const out = redactSecrets(`ANTHROPIC_API_KEY=${key}`); + expect(out).toContain('[REDACTED_LLM_KEY]'); + expect(out).not.toContain(key); + }); + + it('redacts a free-floating sk- key in a log line', () => { + const key = 'sk-' + 'C'.repeat(40); + const out = redactSecrets(`request body contained ${key} apparently`); + expect(out).toContain('[REDACTED_LLM_KEY]'); + expect(out).not.toContain(key); + }); + + it('does not match short fragments', () => { + // "sk-" alone, or with very few trailing chars, is a common + // English fragment ("sk-ip", "sk-line"). The pattern requires + // at least 20 chars after the prefix. + const line = 'task: sk-ip-the-rest and sk-line are not tokens'; + expect(redactSecrets(line)).toBe(line); + }); + + it('redacts a Google Gemini AIza- key', () => { + // Real Gemini keys are exactly 39 chars: the literal "AIza" + // prefix plus 35 chars from the [A-Za-z0-9_-] alphabet. Without + // a Gemini-specific pattern, a key in an observability payload + // would be exfiltrated to the very provider that issued it. + const key = 'AIza' + 'B'.repeat(35); + const out = redactSecrets(`GEMINI_API_KEY=${key}`); + expect(out).toContain('[REDACTED_LLM_KEY]'); + expect(out).not.toContain(key); + }); + + it('redacts a free-floating Gemini key with mixed alphabet chars', () => { + // 4-char "AIza" prefix + 35 chars from [A-Za-z0-9_-]; the full + // key is exactly 39 chars wide. The trailing alphabet here + // exercises every char class the pattern accepts: alphas of + // both cases, digits, "-", and "_". + const tail = 'SyA1b2C3d4-_E5f6G7h8I9j0K1l2M3n4O5p'; + expect(tail.length).toBe(35); + const key = `AIza${tail}`; + const out = redactSecrets(`note: leaked ${key} in pretrigger`); + expect(out).toContain('[REDACTED_LLM_KEY]'); + expect(out).not.toContain(key); + }); + + it('does not match an AIza prefix with fewer than 35 trailing chars', () => { + // The fixed length is what differentiates a real Gemini key from + // an arbitrary string starting with "AIza". 34 chars is too short + // and must not match. + const tooShort = 'AIza' + 'B'.repeat(34); + expect(redactSecrets(tooShort)).toBe(tooShort); + }); + + it('does not match the bare AIza prefix', () => { + const line = 'note about the AIza prefix used by Google'; + expect(redactSecrets(line)).toBe(line); + }); + }); + describe('multi-secret payloads', () => { it('redacts every distinct secret in one pass', () => { + const llmKey = 'sk-' + 'L'.repeat(48); const input = [ 'conn url: https://alice:hunter2@db.example.com/app', - 'auth: Bearer eyJhbGciOiJIUzI1NiJ9.eyJ1IjoxfQ.sig', + 'pg url: postgres://svc:hunter2@db.internal/orders', + 'auth: Bearer eyJhbGciOiJIUzI1NiJ9.eyJ1IjoxfQ.AbC_DeF_GhIjKl', 'aws: AKIAIOSFODNN7EXAMPLE', 'gh: ghp_' + 'X'.repeat(36), 'slack: xoxb-1111-aaaaaaaaaa', + `openai: ${llmKey}`, 'env: password=hunter2 api_key=abc', ].join('\n'); @@ -327,11 +534,15 @@ describe('redactSecrets', () => { expect(out).not.toContain('hunter2'); expect(out).not.toContain('AKIAIOSFODNN7EXAMPLE'); expect(out).not.toContain('xoxb-1111-aaaaaaaaaa'); + expect(out).not.toContain(llmKey); + expect(out).not.toContain('GhIjKl'); expect(out).toContain('[REDACTED]:[REDACTED]@db.example.com'); + expect(out).toContain('postgres://[REDACTED]:[REDACTED]@db.internal'); expect(out).toContain('Bearer [REDACTED]'); expect(out).toContain('[REDACTED_AWS_KEY]'); expect(out).toContain('[REDACTED_GITHUB_TOKEN]'); expect(out).toContain('[REDACTED_SLACK_TOKEN]'); + expect(out).toContain('[REDACTED_LLM_KEY]'); expect(out).toContain('password=[REDACTED]'); expect(out).toContain('api_key=[REDACTED]'); }); @@ -349,6 +560,7 @@ describe('redactSecrets', () => { 'aws-access-key', 'slack-token', 'github-token', + 'llm-vendor-key', 'key-value', 'json-quoted', 'http-header', diff --git a/packages/api/src/utils/redactSecrets.ts b/packages/api/src/utils/redactSecrets.ts index f364089f65..34b64cb8f9 100644 --- a/packages/api/src/utils/redactSecrets.ts +++ b/packages/api/src/utils/redactSecrets.ts @@ -2,6 +2,12 @@ // originates from observability data (log bodies, span attributes, // pattern samples, alert payloads). // +// **LLM-input only.** Do not use as a general-purpose secret redactor: +// the patterns are tuned for the shapes that show up in observability +// payloads, not for export pipelines, audit logs, or anywhere a missed +// secret has compliance consequences. A redactor for those callers +// needs a different threat model and probably entropy-based scanning. +// // Design rule: any LLM input derived from observability data goes // through redactSecrets before leaving the API process. User-authored // prose (e.g. the chart-builder assistant where the user types their @@ -15,22 +21,35 @@ // // Patterns covered: // pem PEM key blocks (-----BEGIN ... PRIVATE KEY-----) -// basic-auth-url https://user:pass@host +// basic-auth-url scheme://user:pass@host. Schemes: http(s), +// ws(s), ftp, sftp, ssh, postgres(ql), mysql, +// mariadb, mongodb(+srv), mssql, sqlserver, +// snowflake, redis(s), amqp(s), kafka(+ssl), +// clickhouse, smtp(s), ldap(s), nats. Match is +// case-insensitive (RFC 3986 schemes are). // key-value password=secret, api_key=abc // json-quoted {"password":"secret"} and similar // http-header X-Api-Key: abc, Api-Key: abc -// bearer Authorization: Bearer xxx +// bearer Authorization: Bearer xxx. Token is \S+ so +// opaque non-JWT bearers (with ":", "%", or +// quote chars in them) round-trip cleanly. // basic Authorization: Basic xxx // jwt eyJ... three dot-separated base64 segments // aws-access-key AKIA[16 chars], ASIA[16 chars] // slack-token xox[a-z]-... shape // github-token ghp_, gho_, ghu_, ghs_, ghr_ prefixes +// llm-vendor-key sk-... (OpenAI), sk-ant-... (Anthropic), +// AIza... (Google Gemini). This redactor +// specifically fronts an LLM-provider call, so +// vendor-shaped keys must not leak to the very +// provider that issued them. // // Known gaps (extend when seen in production): -// URL-percent-encoded values, vendor-specific tokens (Stripe, Twilio, -// Datadog, etc.), generic high-entropy hex blobs (too many false -// positives without surrounding context), basic-auth URLs with raw -// "@" in the username (ambiguous to parse without percent-encoding). +// URL-percent-encoded values; non-LLM vendor tokens (Stripe, Twilio, +// Datadog, GCP / Google Maps, generic OAuth refresh shapes); generic +// high-entropy hex blobs (too many false positives without +// surrounding context); basic-auth URLs with raw "@" in the username +// (ambiguous to parse without percent-encoding). const SECRET_KEY_TOKENS = 'password|passwd|pwd|secret|token|api[_-]?key|apikey|access[_-]?key|private[_-]?key|client[_-]?secret|authorization|auth'; @@ -57,16 +76,28 @@ const PATTERNS: RedactionPattern[] = [ }, // scheme://user:pass@host. Password may contain "@"; the engine // backtracks to the last "@" before the host. Host is captured and - // preserved in the replacement. + // preserved in the replacement. Match is case-insensitive (RFC 3986 + // declares schemes case-insensitive, so HTTPS://user:pw@host is the + // same authority as the lowercase form). The scheme group covers + // HTTP-family protocols plus the database / queue / messaging + // connection strings most likely to land in observability payloads + // with embedded credentials. Schemes listed alphabetically. { name: 'basic-auth-url', - re: /\b(https?|ftp|ssh):\/\/([^/\s:@]+):([^/\s]+)@([^/\s@]+)/g, + re: /\b(amqps?|clickhouse|ftp|https?|kafka(?:\+ssl)?|ldaps?|mariadb|mongodb(?:\+srv)?|mssql|mysql|nats|postgres(?:ql)?|rediss?|sftp|smtps?|snowflake|sqlserver|ssh|wss?):\/\/([^/\s:@]+):([^/\s]+)@([^/\s@]+)/gi, replace: '$1://[REDACTED]:[REDACTED]@$4', }, - // Authorization: Bearer xxx + // Authorization: Bearer xxx. RFC 6750's b64token alphabet + // (alphanumerics, "-._~+/", optional "=" padding) is a strict + // subset of \S+, and observability payloads regularly carry + // opaque non-JWT bearers with ":", "%", or quote chars in them. + // Anything narrower than \S+ leaks the suffix past [REDACTED]; + // \S+ stops at whitespace, which is what actually terminates a + // bearer token in practice (header line break, log delimiter, + // JSON field boundary). { name: 'bearer', - re: /Bearer\s+[A-Za-z0-9._~+/=-]+/gi, + re: /Bearer\s+\S+/gi, replace: 'Bearer [REDACTED]', }, // Authorization: Basic xxx (base64 user:pass) @@ -102,6 +133,20 @@ const PATTERNS: RedactionPattern[] = [ re: /\bgh[pousr]_[A-Za-z0-9_]{20,}\b/g, replace: '[REDACTED_GITHUB_TOKEN]', }, + // LLM-vendor API keys: OpenAI ("sk-..."), Anthropic ("sk-ant-..."), + // and adjacent vendors that follow the "sk-" convention; plus + // Google Gemini ("AIza..." with 35 trailing chars, 39 total). + // This redactor specifically fronts an LLM call; a leaked vendor + // key would be exfiltrated to the very provider that issued it. + // The "sk-" branch floors at 20 chars after the prefix to avoid + // catching English words like "sk-ip-line" or short test fixtures + // while still covering OpenAI's 48+ char and Anthropic's longer + // formats. Gemini keys are exactly 39 chars wide. + { + name: 'llm-vendor-key', + re: /\b(?:sk-(?:ant-)?[A-Za-z0-9_-]{20,}|AIza[A-Za-z0-9_-]{35})\b/g, + replace: '[REDACTED_LLM_KEY]', + }, // key=value with secret-ish key. Three value forms: double-quoted, // single-quoted, or unquoted (stops at whitespace, comma, semicolon, // ampersand, quote so URL query-string boundaries are preserved).