From cd28c03e4da0d591d18b0f3123e0a6fd8dffdacb Mon Sep 17 00:00:00 2001 From: Your Name Date: Wed, 25 Mar 2026 13:37:25 -0700 Subject: [PATCH 1/5] Add HTML decoder for secret detection in HTML-formatted sources Sources like MS Teams and Confluence emit HTML rather than plain text, causing secrets split across tags or embedded in attributes to be missed. This adds an HTML decoder to the pipeline that extracts text nodes, high-signal attribute values, script/style/comment content, and code blocks. It handles syntax-highlight boundary detection, zero-width character stripping, and double-encoded HTML entity decoding. Made-with: Cursor --- pkg/decoders/decoders.go | 1 + pkg/decoders/html.go | 251 +++++++++++++++++ pkg/decoders/html_test.go | 433 +++++++++++++++++++++++++++++ pkg/pb/detectorspb/detectors.pb.go | 3 + proto/detectors.proto | 1 + 5 files changed, 689 insertions(+) create mode 100644 pkg/decoders/html.go create mode 100644 pkg/decoders/html_test.go diff --git a/pkg/decoders/decoders.go b/pkg/decoders/decoders.go index c49eee403ff2..3cac3c36c833 100644 --- a/pkg/decoders/decoders.go +++ b/pkg/decoders/decoders.go @@ -12,6 +12,7 @@ func DefaultDecoders() []Decoder { &Base64{}, &UTF16{}, &EscapedUnicode{}, + &HTML{}, } } diff --git a/pkg/decoders/html.go b/pkg/decoders/html.go new file mode 100644 index 000000000000..c3f782ed8f3a --- /dev/null +++ b/pkg/decoders/html.go @@ -0,0 +1,251 @@ +package decoders + +import ( + "bytes" + "net/url" + "regexp" + "strings" + + "golang.org/x/net/html" + + "github.com/trufflesecurity/trufflehog/v3/pkg/pb/detectorspb" + "github.com/trufflesecurity/trufflehog/v3/pkg/sources" +) + +// HTML is a decoder that extracts textual content from HTML documents. +// It produces a normalized view containing visible text, attribute values, +// script/style content, and HTML comments with entities and URL-encoding decoded. +type HTML struct { + // Enabled controls whether the decoder is active. When nil, the decoder + // is always active. 
Inject a function that checks a feature flag to + // allow dynamic toggling without restarting the scanner. + Enabled func() bool +} + +func (d *HTML) Type() detectorspb.DecoderType { + return detectorspb.DecoderType_HTML +} + +var htmlTagPattern = regexp.MustCompile(`<[a-zA-Z][a-zA-Z0-9]*[\s>/]`) + +// highSignalAttrs are attribute names whose values are extracted into the +// decoded output because they commonly contain URLs, tokens, or other secrets. +var highSignalAttrs = map[string]bool{ + "href": true, + "src": true, + "action": true, + "value": true, + "content": true, + "alt": true, + "title": true, + "xlink:href": true, +} + +// syntaxHighlightPrefixes lists CSS class prefixes used by syntax highlighting +// libraries. Elements with these classes mark logical line boundaries in code +// blocks where the platform (e.g. Teams) strips actual newlines. +var syntaxHighlightPrefixes = []string{"hljs-"} + +// residualEntityReplacer decodes common HTML entities that survive double-encoding. +// When content is entity-encoded twice (e.g. &amp;), the parser's first pass +// leaves residual entity sequences that this replacer cleans up. +var residualEntityReplacer = strings.NewReplacer( + "&", "&", + "<", "<", + ">", ">", + """, `"`, + "'", "'", + "'", "'", +) + +// invisibleReplacer strips zero-width and invisible Unicode codepoints that +// rich text editors may insert between characters, breaking detector regexes. +var invisibleReplacer = strings.NewReplacer( + "\u200B", "", // zero-width space + "\u200C", "", // zero-width non-joiner + "\u200D", "", // zero-width joiner + "\uFEFF", "", // byte order mark / zero-width no-break space + "\u00AD", "", // soft hyphen + "\u2060", "", // word joiner + "\u200E", "", // left-to-right mark + "\u200F", "", // right-to-left mark +) + +// blockElements insert newline boundaries when encountered during extraction. 
+var blockElements = map[string]bool{ + "p": true, "div": true, "br": true, "hr": true, + "h1": true, "h2": true, "h3": true, "h4": true, "h5": true, "h6": true, + "li": true, "ol": true, "ul": true, + "tr": true, "td": true, "th": true, "table": true, "thead": true, "tbody": true, "tfoot": true, + "blockquote": true, "section": true, "article": true, "header": true, "footer": true, + "pre": true, "address": true, "figcaption": true, "figure": true, + "details": true, "summary": true, "main": true, "nav": true, "aside": true, + "form": true, "fieldset": true, "legend": true, + "dd": true, "dt": true, "dl": true, +} + +func (d *HTML) FromChunk(chunk *sources.Chunk) *DecodableChunk { + if d.Enabled != nil && !d.Enabled() { + return nil + } + if chunk == nil || len(chunk.Data) == 0 { + return nil + } + + if !looksLikeHTML(chunk.Data) { + return nil + } + + extracted := extractHTML(chunk.Data) + if len(extracted) == 0 { + return nil + } + + if bytes.Equal(chunk.Data, extracted) { + return nil + } + + chunk.Data = extracted + return &DecodableChunk{Chunk: chunk, DecoderType: d.Type()} +} + +func looksLikeHTML(data []byte) bool { + return htmlTagPattern.Match(data) +} + +func extractHTML(data []byte) []byte { + doc, err := html.Parse(bytes.NewReader(data)) + if err != nil { + return nil + } + + var buf bytes.Buffer + buf.Grow(len(data)) + + walkNode(&buf, doc) + + result := stripInvisible(buf.Bytes()) + result = decodeResidualEntities(result) + return normalizeWhitespace(result) +} + +func walkNode(buf *bytes.Buffer, n *html.Node) { + switch n.Type { + case html.TextNode: + text := n.Data + if text != "" { + buf.WriteString(text) + } + + case html.CommentNode: + if content := strings.TrimSpace(n.Data); content != "" { + ensureNewline(buf) + buf.WriteString(content) + ensureNewline(buf) + } + + case html.ElementNode: + isBlock := blockElements[n.Data] + + if isBlock { + ensureNewline(buf) + } else if hasSyntaxHighlightClass(n) { + ensureNewline(buf) + } + + 
emitAttributes(buf, n) + + for c := n.FirstChild; c != nil; c = c.NextSibling { + walkNode(buf, c) + } + + if isBlock || n.Data == "br" { + ensureNewline(buf) + } + + default: + for c := n.FirstChild; c != nil; c = c.NextSibling { + walkNode(buf, c) + } + } +} + +func hasSyntaxHighlightClass(n *html.Node) bool { + for _, attr := range n.Attr { + if attr.Key != "class" { + continue + } + for _, cls := range strings.Fields(attr.Val) { + for _, prefix := range syntaxHighlightPrefixes { + if strings.HasPrefix(cls, prefix) { + return true + } + } + } + } + return false +} + +func emitAttributes(buf *bytes.Buffer, n *html.Node) { + for _, attr := range n.Attr { + isDataAttr := strings.HasPrefix(attr.Key, "data-") + if !highSignalAttrs[attr.Key] && !isDataAttr { + continue + } + val := strings.TrimSpace(attr.Val) + if val == "" || val == "#" { + continue + } + decoded, err := url.QueryUnescape(val) + if err == nil && decoded != val { + val = decoded + } + ensureNewline(buf) + buf.WriteString(val) + ensureNewline(buf) + } +} + +func ensureNewline(buf *bytes.Buffer) { + if buf.Len() == 0 { + return + } + if buf.Bytes()[buf.Len()-1] != '\n' { + buf.WriteByte('\n') + } +} + +func stripInvisible(data []byte) []byte { + return []byte(invisibleReplacer.Replace(string(data))) +} + +func decodeResidualEntities(data []byte) []byte { + s := string(data) + decoded := residualEntityReplacer.Replace(s) + if decoded == s { + return data + } + return []byte(decoded) +} + +// normalizeWhitespace collapses runs of blank lines and trims leading/trailing whitespace. 
+func normalizeWhitespace(data []byte) []byte { + lines := bytes.Split(data, []byte("\n")) + var result [][]byte + prevBlank := true + for _, line := range lines { + trimmed := bytes.TrimSpace(line) + if len(trimmed) == 0 { + if !prevBlank { + prevBlank = true + } + continue + } + if prevBlank && len(result) > 0 { + result = append(result, []byte("")) + } + result = append(result, trimmed) + prevBlank = false + } + return bytes.Join(result, []byte("\n")) +} diff --git a/pkg/decoders/html_test.go b/pkg/decoders/html_test.go new file mode 100644 index 000000000000..8d9245851bd9 --- /dev/null +++ b/pkg/decoders/html_test.go @@ -0,0 +1,433 @@ +package decoders + +import ( + "testing" + + "github.com/trufflesecurity/trufflehog/v3/pkg/pb/detectorspb" + "github.com/trufflesecurity/trufflehog/v3/pkg/sources" +) + +func TestHTML_Type(t *testing.T) { + d := &HTML{} + if got := d.Type(); got != detectorspb.DecoderType_HTML { + t.Errorf("Type() = %v, want %v", got, detectorspb.DecoderType_HTML) + } +} + +// TestHTML_FromChunk verifies the HTML decoder extracts secrets from HTML content +// that sources like MS Teams and Confluence emit. The test cases are grouped by +// the category of extraction they exercise: +// +// - Guard clauses: nil, empty, and non-HTML input return nil. +// - Text node extraction: secrets split across inline tags are rejoined; +// HTML entities (&) are decoded by the parser. +// - Attribute value extraction: high-signal attrs (href, src, data-*, value, +// content, alt, title, action) are emitted; URL percent-encoding is decoded; +// empty/anchor-only hrefs are skipped. +// - Script / style / comment content: all included because they frequently +// contain embedded credentials. +// - Code and pre blocks: preserved verbatim (common secret location). +// - Whitespace and token boundaries: block elements (p, div, br, tr, td, li) +// insert newlines; inline elements preserve text continuity to avoid +// accidental token joins. 
+// - Real-world formats: Confluence storage-format HTML and Teams message HTML +// with secrets in typical positions. +// - Integration: a mixed-content case exercises text nodes, URL-decoded attrs, +// script content, and HTML comments in a single chunk. +func TestHTML_FromChunk(t *testing.T) { + tests := []struct { + name string + chunk *sources.Chunk + want string + wantNil bool + }{ + // --- Guard clauses: decoder returns nil for non-applicable input --- + { + name: "nil chunk", + chunk: nil, + wantNil: true, + }, + { + name: "empty data", + chunk: &sources.Chunk{Data: []byte{}}, + wantNil: true, + }, + { + name: "plain text (no HTML)", + chunk: &sources.Chunk{Data: []byte("just some plain text with no tags")}, + wantNil: true, + }, + + // --- Text node extraction --- + { + // Core scenario: a secret is split across formatting tags by the + // rich-text editor. The parser concatenates adjacent text nodes. + name: "secret split across span tags", + chunk: &sources.Chunk{Data: []byte(`

AKIA1234567890ABCDEF

`)}, + want: "AKIA1234567890ABCDEF", + }, + { + // Confluence/Teams encode '&' as '&'. The HTML parser + // automatically unescapes entities so detector regexes can match. + name: "HTML entities decoded", + chunk: &sources.Chunk{Data: []byte(`

key=abc&secret=hunter2

`)}, + want: "key=abc&secret=hunter2", + }, + + // --- Attribute value extraction --- + { + // Secrets in href URLs (e.g. tokens in query params). + name: "attribute value extraction - href", + chunk: &sources.Chunk{Data: []byte(`link`)}, + want: "https://api.example.com?token=sk-live-1234\nlink", + }, + { + // Secrets in src URLs (e.g. image CDN tokens). + name: "attribute value extraction - src", + chunk: &sources.Chunk{Data: []byte(``)}, + want: "https://cdn.example.com/img?key=secret123", + }, + { + // Percent-encoded characters in attribute values (%2D -> '-', + // %5F -> '_') are decoded so detectors see the actual secret. + name: "URL-encoded attribute values decoded", + chunk: &sources.Chunk{Data: []byte(`docs`)}, + want: "https://api.example.com?token=sk-live_1234\ndocs", + }, + { + // data-* attributes are often used for JS-consumed config values. + name: "data-* attributes extracted", + chunk: &sources.Chunk{Data: []byte(`
content
`)}, + want: "ghp_abc123def456\ncontent", + }, + { + // src, alt, and title on a single element all extracted. + name: "multiple high-signal attributes on one element", + chunk: &sources.Chunk{Data: []byte(`secret: abc123`)}, + want: "https://api.com/img?key=k1\nsecret: abc123\ntoken: def456", + }, + { + // Anchors with href="#" carry no signal and are skipped. + name: "empty href skipped", + chunk: &sources.Chunk{Data: []byte(`click`)}, + want: "click", + }, + { + // Hidden inputs often carry CSRF tokens or API keys. + name: "value attribute on input", + chunk: &sources.Chunk{Data: []byte(``)}, + want: "sk_test_EXAMPLEKEYEXAMPLEKEYEX", + }, + { + // content attributes may carry API keys for client-side SDKs. + name: "meta content attribute", + chunk: &sources.Chunk{Data: []byte(``)}, + want: "pk_live_abcdefghij1234567890", + }, + { + // Form action URLs can embed secrets in query strings. + name: "action attribute on form", + chunk: &sources.Chunk{Data: []byte(`
`)}, + want: "https://api.stripe.com/v1/charges?key=sk_live_123\nPay", + }, + + // --- Script / style / comment content --- + { + // Inline `)}, + want: "hello\nvar secret = \"ghp_abc123def456\";", + }, + { + // CSS can contain secrets in background-image URLs, @import, etc. + name: "style content included", + chunk: &sources.Chunk{Data: []byte(`

text

`)}, + want: "text\nbody { background: url(\"https://cdn.com?key=secret\"); }", + }, + { + // HTML comments are a common place for debug credentials and + // TODO notes with hardcoded passwords. + name: "HTML comment content included", + chunk: &sources.Chunk{Data: []byte(`

visible

`)}, + want: "visible\nTODO: remove hardcoded password=hunter2", + }, + + // --- Code and pre blocks --- + { + //
<pre>/<code> content is preserved verbatim; these blocks are a
+			// top location for pasted credentials and key exports.
+			name:  "code/pre blocks preserved",
+			chunk: &sources.Chunk{Data: []byte(`
export AWS_SECRET_KEY=wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY
`)}, + want: "export AWS_SECRET_KEY=wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY", + }, + { + // Multi-line PEM private keys in
<pre> blocks with <br>
line breaks + // are reconstructed with proper newlines for detector matching. + name: "private key in pre block", + chunk: &sources.Chunk{Data: []byte(`
-----BEGIN RSA PRIVATE KEY-----
MIIEpAIBAAKCAQEA04up8h
-----END RSA PRIVATE KEY-----
`)}, + want: "-----BEGIN RSA PRIVATE KEY-----\nMIIEpAIBAAKCAQEA04up8h\n-----END RSA PRIVATE KEY-----", + }, + + // --- Whitespace and token boundaries --- + { + // Block elements (

) produce newline boundaries so adjacent + // paragraphs don't merge tokens. + name: "block elements produce newlines", + chunk: &sources.Chunk{Data: []byte(`

first

second

`)}, + want: "first\nsecond", + }, + { + // All
variants produce newlines. + name: "br tags produce newlines", + chunk: &sources.Chunk{Data: []byte(`

line1
line2
line3

`)}, + want: "line1\nline2\nline3", + }, + { + // Nested inline elements (, ) do not break the token; + // text flows continuously so "token=sk-live-abc123" stays intact. + name: "nested inline elements preserve text continuity", + chunk: &sources.Chunk{Data: []byte(`

token=sk-live-abc123

`)}, + want: "token=sk-live-abc123", + }, + { + // elements are block-level: each cell gets its own line, + // keeping key/value pairs from merging. + name: "table with secrets", + chunk: &sources.Chunk{Data: []byte( + `` + + `
API KeyAKIAIOSFODNN7EXAMPLE
SecretwJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY
`, + )}, + want: "API Key\nAKIAIOSFODNN7EXAMPLE\nSecret\nwJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY", + }, + { + // Even without wrappers, still inserts block boundaries. + name: "td cells without enclosing tr still get block boundaries", + chunk: &sources.Chunk{Data: []byte( + `
keyvalue
`, + )}, + want: "key\nvalue", + }, + { + //
  • elements produce separate lines. + name: "list items produce separate lines", + chunk: &sources.Chunk{Data: []byte( + `
    • token: abc123
    • secret: def456
    `, + )}, + want: "token: abc123\nsecret: def456", + }, + + // --- Real-world source formats --- + { + // Confluence storage format: secrets split across tags, + // an AWS key in plain text, and an href with a URL. Exercises text + // node concatenation, attribute extraction, and block boundaries + // together. + name: "confluence storage format - real world", + chunk: &sources.Chunk{Data: []byte( + `

    Our API credentials:

    ` + + `

    Key: AKIAIOSFODNN7EXAMPLE

    ` + + `

    Secret: wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY

    ` + + `

    See AWS Console

    `, + )}, + want: "Our API credentials:\nKey: AKIAIOSFODNN7EXAMPLE\nSecret: wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY\nSee\nhttps://console.aws.amazon.com\nAWS Console", + }, + { + // Teams message HTML: nested
    <div> wrappers around <p>

    tags + // containing a GitHub PAT. Verifies that redundant block wrappers + // collapse to clean newlines. + name: "teams message HTML - real world", + chunk: &sources.Chunk{Data: []byte( + `

    ` + + `

    Here is the token for the staging env:

    ` + + `

    ghp_ABCDEFghijklmnop1234567890abcde

    ` + + `
    `, + )}, + want: "Here is the token for the staging env:\nghp_ABCDEFghijklmnop1234567890abcde", + }, + + // --- Syntax highlight boundary detection --- + { + // Teams renders code blocks as adjacent elements within a + // single

    , using highlight.js classes for syntax coloring. + // Newlines from the original code are lost. The decoder detects + // hljs-* classes and inserts newlines at those boundaries while + // still concatenating non-hljs sibling spans (preserving + // mid-token color splits like the value below split across 3 spans). + name: "teams code block with hljs syntax highlighting", + chunk: &sources.Chunk{Data: []byte( + `

    ` + + `[header]` + + `key_one` + + ` = FIRST_VALUE_ABCDEFGH` + + `key_two` + + ` = SECOND_VAL_PART_` + + `X` + + `_END_OF_VALUE` + + `format` + + ` = json` + + `

    `, + )}, + want: "[header]\nkey_one = FIRST_VALUE_ABCDEFGH\nkey_two = SECOND_VAL_PART_X_END_OF_VALUE\nformat = json", + }, + { + // Spans without hljs classes must still concatenate, preserving + // the existing split-secret behavior even when hljs spans are + // present elsewhere in the document. + name: "non-hljs sibling spans still concatenate", + chunk: &sources.Chunk{Data: []byte( + `

    SECRET_FIRST_HALF_1234

    `, + )}, + want: "SECRET_FIRST_HALF_1234", + }, + { + // Various hljs-* class names (not just hljs-function) should + // all trigger line boundaries. + name: "multiple hljs class variants trigger boundaries", + chunk: &sources.Chunk{Data: []byte( + `

    ` + + `const` + + ` x = ` + + `"value_one"` + + `const` + + ` y = ` + + `"value_two"` + + `

    `, + )}, + want: "const x =\n\"value_one\"\nconst y =\n\"value_two\"", + }, + + // --- Zero-width / invisible character stripping --- + { + // Zero-width spaces inserted between characters by rich text editors + // are stripped so detector regexes can match the full token. + name: "zero-width space stripped from secret", + chunk: &sources.Chunk{Data: []byte("

    TOKEN_\u200BABCDEF_1234

    ")}, + want: "TOKEN_ABCDEF_1234", + }, + { + // Multiple invisible codepoint types mixed into a single token. + name: "multiple invisible character types stripped", + chunk: &sources.Chunk{Data: []byte("

    SECRET\u200C_VALUE\u00AD_HERE\u2060_END\uFEFF

    ")}, + want: "SECRET_VALUE_HERE_END", + }, + + // --- SVG xlink:href attribute extraction --- + { + // SVG elements use xlink:href for URLs which may contain tokens. + name: "xlink:href extracted from SVG element", + chunk: &sources.Chunk{Data: []byte(`icon`)}, + want: "https://api.example.com?token=secret_value_123\nicon", + }, + + // --- Double-encoded HTML entity decoding --- + { + // Content double-encoded as &amp; becomes & after the parser's + // first pass; the residual entity replacer decodes it to &. + name: "double-encoded ampersand decoded", + chunk: &sources.Chunk{Data: []byte(`

    key=abc&amp;secret=val

    `)}, + want: "key=abc&secret=val", + }, + { + // Single-encoded entities are handled by the parser; verify the + // residual replacer does not corrupt already-decoded content. + name: "single-encoded entities not double-decoded", + chunk: &sources.Chunk{Data: []byte(`

    5 > 3 & 2 < 4

    `)}, + want: "5 > 3 & 2 < 4", + }, + + // --- Integration: all extraction types in one chunk --- + { + // Combines text nodes (split across spans), URL-decoded attribute + // values, inline script content, and an HTML comment -- all in a + // single chunk. Verifies the decoder handles the full extraction + // surface simultaneously. + name: "mixed content with all extraction types", + chunk: &sources.Chunk{Data: []byte( + `

    API key: AKIA1234567890ABCDEF

    ` + + `

    See docs

    ` + + `` + + ``, + )}, + want: "API key: AKIA1234567890ABCDEF\nSee\nhttps://api.example.com?token=sk-live_1234\ndocs\nvar secret = \"ghp_abc123def456\";\nTODO: remove hardcoded password=hunter2", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + d := &HTML{} + got := d.FromChunk(tt.chunk) + + if tt.wantNil { + if got != nil { + t.Errorf("FromChunk() = %q, want nil", string(got.Chunk.Data)) + } + return + } + + if got == nil { + t.Fatalf("FromChunk() returned nil, want %q", tt.want) + } + if got.DecoderType != detectorspb.DecoderType_HTML { + t.Errorf("DecoderType = %v, want %v", got.DecoderType, detectorspb.DecoderType_HTML) + } + if string(got.Chunk.Data) != tt.want { + t.Errorf("FromChunk() data =\n%q\nwant:\n%q", string(got.Chunk.Data), tt.want) + } + }) + } +} + +// TestHTML_FeatureFlagDisabled verifies that the decoder is a no-op when the +// Enabled callback returns false, allowing the HTMLDecoder feature flag to +// gate the decoder at runtime without removing it from the pipeline. +func TestHTML_FeatureFlagDisabled(t *testing.T) { + d := &HTML{Enabled: func() bool { return false }} + chunk := &sources.Chunk{Data: []byte(`

    secret: hunter2

    `)} + if got := d.FromChunk(chunk); got != nil { + t.Errorf("FromChunk() should return nil when disabled, got %q", string(got.Chunk.Data)) + } +} + +// TestHTML_FeatureFlagEnabled verifies that the decoder processes HTML normally +// when the Enabled callback returns true. +func TestHTML_FeatureFlagEnabled(t *testing.T) { + d := &HTML{Enabled: func() bool { return true }} + chunk := &sources.Chunk{Data: []byte(`

    secret: hunter2

    `)} + got := d.FromChunk(chunk) + if got == nil { + t.Fatal("FromChunk() returned nil, want decoded chunk") + } + if string(got.Chunk.Data) != "secret: hunter2" { + t.Errorf("FromChunk() data = %q, want %q", string(got.Chunk.Data), "secret: hunter2") + } +} + +// TestLooksLikeHTML verifies the fast heuristic that decides whether chunk data +// is worth parsing as HTML. It must accept valid HTML tags (including self-closing +// and attribute-bearing) while rejecting plain text, arithmetic comparisons, and +// bare HTML entities -- all of which could appear in non-HTML source content. +func TestLooksLikeHTML(t *testing.T) { + tests := []struct { + name string + data string + want bool + }{ + {"simple tag", "

    hello

    ", true}, + {"self-closing", "
    ", true}, + {"with attributes", `
    `, true}, + {"plain text", "no html here", false}, + {"angle brackets but not HTML", "5 < 10 and 20 > 15", false}, + {"XML-like", "content", true}, + {"just less-than", "a < b", false}, + {"html entity only", "& <", false}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := looksLikeHTML([]byte(tt.data)); got != tt.want { + t.Errorf("looksLikeHTML(%q) = %v, want %v", tt.data, got, tt.want) + } + }) + } +} diff --git a/pkg/pb/detectorspb/detectors.pb.go b/pkg/pb/detectorspb/detectors.pb.go index 3ec2468eb95d..b942e1d75dc7 100644 --- a/pkg/pb/detectorspb/detectors.pb.go +++ b/pkg/pb/detectorspb/detectors.pb.go @@ -28,6 +28,7 @@ const ( DecoderType_BASE64 DecoderType = 2 DecoderType_UTF16 DecoderType = 3 DecoderType_ESCAPED_UNICODE DecoderType = 4 + DecoderType_HTML DecoderType = 5 ) // Enum value maps for DecoderType. @@ -38,6 +39,7 @@ var ( 2: "BASE64", 3: "UTF16", 4: "ESCAPED_UNICODE", + 5: "HTML", } DecoderType_value = map[string]int32{ "UNKNOWN": 0, @@ -45,6 +47,7 @@ var ( "BASE64": 2, "UTF16": 3, "ESCAPED_UNICODE": 4, + "HTML": 5, } ) diff --git a/proto/detectors.proto b/proto/detectors.proto index 88829dd17d3c..076b848a6c09 100644 --- a/proto/detectors.proto +++ b/proto/detectors.proto @@ -10,6 +10,7 @@ enum DecoderType { BASE64 = 2; UTF16 = 3; ESCAPED_UNICODE = 4; + HTML = 5; } enum DetectorType { From e4330922d83a1f6a897fed617a41f7163054c740 Mon Sep 17 00:00:00 2001 From: Your Name Date: Wed, 1 Apr 2026 13:01:46 -0700 Subject: [PATCH 2/5] Fix dead code and plus-sign corruption in HTML decoder - Remove unreachable "xlink:href" map entry: the html parser splits namespace-prefixed attributes into separate Namespace/Key fields, so attr.Key is "href" (already in the map), never "xlink:href". - Switch url.QueryUnescape to url.PathUnescape: QueryUnescape converts '+' to space per form-encoding spec, corrupting secrets that contain literal '+' characters (e.g. base64 values, API keys). 
Made-with: Cursor --- pkg/decoders/html.go | 7 +++---- pkg/decoders/html_test.go | 8 ++++++++ 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/pkg/decoders/html.go b/pkg/decoders/html.go index c3f782ed8f3a..016bacb0f87d 100644 --- a/pkg/decoders/html.go +++ b/pkg/decoders/html.go @@ -36,9 +36,8 @@ var highSignalAttrs = map[string]bool{ "action": true, "value": true, "content": true, - "alt": true, - "title": true, - "xlink:href": true, + "alt": true, + "title": true, } // syntaxHighlightPrefixes lists CSS class prefixes used by syntax highlighting @@ -196,7 +195,7 @@ func emitAttributes(buf *bytes.Buffer, n *html.Node) { if val == "" || val == "#" { continue } - decoded, err := url.QueryUnescape(val) + decoded, err := url.PathUnescape(val) if err == nil && decoded != val { val = decoded } diff --git a/pkg/decoders/html_test.go b/pkg/decoders/html_test.go index 8d9245851bd9..6a8205555520 100644 --- a/pkg/decoders/html_test.go +++ b/pkg/decoders/html_test.go @@ -131,6 +131,14 @@ func TestHTML_FromChunk(t *testing.T) { want: "https://api.stripe.com/v1/charges?key=sk_live_123\nPay", }, + { + // '+' is a literal character in attribute values (not a space). + // PathUnescape preserves it while still decoding %XX sequences. + name: "plus sign preserved in attribute value", + chunk: &sources.Chunk{Data: []byte(``)}, + want: "sk_test_abc+def/123", + }, + // --- Script / style / comment content --- { // Inline `)}, + want: "text\nvar key=\"secret\";", + }, + { + // Style following an inline element must NOT concatenate. + name: "style adjacent to inline text gets boundary", + chunk: &sources.Chunk{Data: []byte(`text`)}, + want: "text\n.x { color: red; }", + }, + { + // Entity-like sequences in script content are raw text and must + // NOT be decoded by the residual entity replacer. 
+ name: "entities in script preserved as raw text", + chunk: &sources.Chunk{Data: []byte(``)}, + want: `var url = "a=1&b=2";`, + }, + { + // Entity-like sequences in style content are raw text. + name: "entities in style preserved as raw text", + chunk: &sources.Chunk{Data: []byte(``)}, + want: `body::after { content: "&copy"; }`, + }, { // HTML comments are a common place for debug credentials and // TODO notes with hardcoded passwords.