diff --git a/pkg/decoders/decoders.go b/pkg/decoders/decoders.go index c49eee403ff2..3cac3c36c833 100644 --- a/pkg/decoders/decoders.go +++ b/pkg/decoders/decoders.go @@ -12,6 +12,7 @@ func DefaultDecoders() []Decoder { &Base64{}, &UTF16{}, &EscapedUnicode{}, + &HTML{}, } } diff --git a/pkg/decoders/html.go b/pkg/decoders/html.go new file mode 100644 index 000000000000..93e7366bd1c9 --- /dev/null +++ b/pkg/decoders/html.go @@ -0,0 +1,251 @@ +package decoders + +import ( + "bytes" + "net/url" + "regexp" + "strings" + + "golang.org/x/net/html" + + "github.com/trufflesecurity/trufflehog/v3/pkg/feature" + "github.com/trufflesecurity/trufflehog/v3/pkg/pb/detectorspb" + "github.com/trufflesecurity/trufflehog/v3/pkg/sources" +) + +// HTML is a decoder that extracts textual content from HTML documents. +// It produces a normalized view containing visible text, attribute values, +// script/style content, and HTML comments with entities and URL-encoding decoded. +// Gated at runtime by feature.HTMLDecoderEnabled. +type HTML struct{} + +func (d *HTML) Type() detectorspb.DecoderType { + return detectorspb.DecoderType_HTML +} + +var htmlTagPattern = regexp.MustCompile(`<[a-zA-Z][a-zA-Z0-9]*[\s>/]`) + +// highSignalAttrs are attribute names whose values are extracted into the +// decoded output because they commonly contain URLs, tokens, or other secrets. +var highSignalAttrs = map[string]bool{ + "href": true, + "src": true, + "action": true, + "value": true, + "content": true, + "alt": true, + "title": true, +} + +// syntaxHighlightPrefixes lists CSS class prefixes used by syntax highlighting +// libraries. Elements with these classes mark logical line boundaries in code +// blocks where the platform (e.g. Teams) strips actual newlines. +var syntaxHighlightPrefixes = []string{"hljs-"} + +// residualEntityReplacer decodes common HTML entities that survive double-encoding. +// When content is entity-encoded twice (e.g. 
&amp;amp;), the parser's first pass
+// leaves residual entity sequences that this replacer cleans up.
+var residualEntityReplacer = strings.NewReplacer(
+	"&amp;", "&",
+	"&lt;", "<",
+	"&gt;", ">",
+	"&quot;", `"`,
+	"&#39;", "'",
+	"&apos;", "'",
+)
+
+// invisibleReplacer strips zero-width and invisible Unicode codepoints that
+// rich text editors may insert between characters, breaking detector regexes.
+var invisibleReplacer = strings.NewReplacer(
+	"\u200B", "", // zero-width space
+	"\u200C", "", // zero-width non-joiner
+	"\u200D", "", // zero-width joiner
+	"\uFEFF", "", // byte order mark / zero-width no-break space
+	"\u00AD", "", // soft hyphen
+	"\u2060", "", // word joiner
+	"\u200E", "", // left-to-right mark
+	"\u200F", "", // right-to-left mark
+)
+
+// blockElements insert newline boundaries when encountered during extraction.
+var blockElements = map[string]bool{
+	"p": true, "div": true, "br": true, "hr": true,
+	"h1": true, "h2": true, "h3": true, "h4": true, "h5": true, "h6": true,
+	"li": true, "ol": true, "ul": true,
+	"tr": true, "td": true, "th": true, "table": true, "thead": true, "tbody": true, "tfoot": true,
+	"blockquote": true, "section": true, "article": true, "header": true, "footer": true,
+	"pre": true, "address": true, "figcaption": true, "figure": true,
+	"details": true, "summary": true, "main": true, "nav": true, "aside": true,
+	"form": true, "fieldset": true, "legend": true,
+	"dd": true, "dt": true, "dl": true,
+	"script": true, "style": true,
+}
+
+// rawTextElements are elements whose content the HTML parser treats as raw
+// text (entities are NOT decoded). Residual entity decoding must be skipped
+// for text nodes inside these elements to avoid corrupting literal sequences
+// like &amp; in JavaScript.
+var rawTextElements = map[string]bool{ + "script": true, + "style": true, +} + +func (d *HTML) FromChunk(chunk *sources.Chunk) *DecodableChunk { + if !feature.HTMLDecoderEnabled.Load() { + return nil + } + if chunk == nil || len(chunk.Data) == 0 { + return nil + } + + if !looksLikeHTML(chunk.Data) { + return nil + } + + extracted := extractHTML(chunk.Data) + if len(extracted) == 0 { + return nil + } + + if bytes.Equal(chunk.Data, extracted) { + return nil + } + + chunk.Data = extracted + return &DecodableChunk{Chunk: chunk, DecoderType: d.Type()} +} + +func looksLikeHTML(data []byte) bool { + return htmlTagPattern.Match(data) +} + +func extractHTML(data []byte) []byte { + doc, err := html.Parse(bytes.NewReader(data)) + if err != nil { + return nil + } + + var buf bytes.Buffer + buf.Grow(len(data)) + + walkNode(&buf, doc, false) + + result := stripInvisible(buf.Bytes()) + return normalizeWhitespace(result) +} + +func walkNode(buf *bytes.Buffer, n *html.Node, inRawText bool) { + switch n.Type { + case html.TextNode: + text := n.Data + if text != "" { + if !inRawText { + text = residualEntityReplacer.Replace(text) + } + buf.WriteString(text) + } + + case html.CommentNode: + if content := strings.TrimSpace(n.Data); content != "" { + ensureNewline(buf) + buf.WriteString(content) + ensureNewline(buf) + } + + case html.ElementNode: + isBlock := blockElements[n.Data] + + if isBlock { + ensureNewline(buf) + } else if hasSyntaxHighlightClass(n) { + ensureNewline(buf) + } + + emitAttributes(buf, n) + + childRaw := inRawText || rawTextElements[n.Data] + for c := n.FirstChild; c != nil; c = c.NextSibling { + walkNode(buf, c, childRaw) + } + + if isBlock { + ensureNewline(buf) + } + + default: + for c := n.FirstChild; c != nil; c = c.NextSibling { + walkNode(buf, c, inRawText) + } + } +} + +func hasSyntaxHighlightClass(n *html.Node) bool { + for _, attr := range n.Attr { + if attr.Key != "class" { + continue + } + for _, cls := range strings.Fields(attr.Val) { + for _, prefix 
:= range syntaxHighlightPrefixes { + if strings.HasPrefix(cls, prefix) { + return true + } + } + } + } + return false +} + +func emitAttributes(buf *bytes.Buffer, n *html.Node) { + for _, attr := range n.Attr { + isDataAttr := strings.HasPrefix(attr.Key, "data-") + if !highSignalAttrs[attr.Key] && !isDataAttr { + continue + } + val := strings.TrimSpace(attr.Val) + if val == "" || val == "#" { + continue + } + decoded, err := url.PathUnescape(val) + if err == nil && decoded != val { + val = decoded + } + ensureNewline(buf) + buf.WriteString(val) + ensureNewline(buf) + } +} + +func ensureNewline(buf *bytes.Buffer) { + if buf.Len() == 0 { + return + } + if buf.Bytes()[buf.Len()-1] != '\n' { + buf.WriteByte('\n') + } +} + +func stripInvisible(data []byte) []byte { + return []byte(invisibleReplacer.Replace(string(data))) +} + +// normalizeWhitespace collapses runs of blank lines and trims leading/trailing whitespace. +func normalizeWhitespace(data []byte) []byte { + lines := bytes.Split(data, []byte("\n")) + var result [][]byte + prevBlank := true + for _, line := range lines { + trimmed := bytes.TrimSpace(line) + if len(trimmed) == 0 { + if !prevBlank { + prevBlank = true + } + continue + } + if prevBlank && len(result) > 0 { + result = append(result, []byte("")) + } + result = append(result, trimmed) + prevBlank = false + } + return bytes.Join(result, []byte("\n")) +} diff --git a/pkg/decoders/html_test.go b/pkg/decoders/html_test.go new file mode 100644 index 000000000000..403ebdb2f93a --- /dev/null +++ b/pkg/decoders/html_test.go @@ -0,0 +1,474 @@ +package decoders + +import ( + "testing" + + "github.com/trufflesecurity/trufflehog/v3/pkg/feature" + "github.com/trufflesecurity/trufflehog/v3/pkg/pb/detectorspb" + "github.com/trufflesecurity/trufflehog/v3/pkg/sources" +) + +func TestHTML_Type(t *testing.T) { + d := &HTML{} + if got := d.Type(); got != detectorspb.DecoderType_HTML { + t.Errorf("Type() = %v, want %v", got, detectorspb.DecoderType_HTML) + } +} + +// 
TestHTML_FromChunk verifies the HTML decoder extracts secrets from HTML content +// that sources like MS Teams and Confluence emit. The test cases are grouped by +// the category of extraction they exercise: +// +// - Guard clauses: nil, empty, and non-HTML input return nil. +// - Text node extraction: secrets split across inline tags are rejoined; +// HTML entities (&) are decoded by the parser. +// - Attribute value extraction: high-signal attrs (href, src, data-*, value, +// content, alt, title, action) are emitted; URL percent-encoding is decoded; +// empty/anchor-only hrefs are skipped. +// - Script / style / comment content: all included because they frequently +// contain embedded credentials. +// - Code and pre blocks: preserved verbatim (common secret location). +// - Whitespace and token boundaries: block elements (p, div, br, tr, td, li) +// insert newlines; inline elements preserve text continuity to avoid +// accidental token joins. +// - Real-world formats: Confluence storage-format HTML and Teams message HTML +// with secrets in typical positions. +// - Integration: a mixed-content case exercises text nodes, URL-decoded attrs, +// script content, and HTML comments in a single chunk. +func TestHTML_FromChunk(t *testing.T) { + tests := []struct { + name string + chunk *sources.Chunk + want string + wantNil bool + }{ + // --- Guard clauses: decoder returns nil for non-applicable input --- + { + name: "nil chunk", + chunk: nil, + wantNil: true, + }, + { + name: "empty data", + chunk: &sources.Chunk{Data: []byte{}}, + wantNil: true, + }, + { + name: "plain text (no HTML)", + chunk: &sources.Chunk{Data: []byte("just some plain text with no tags")}, + wantNil: true, + }, + + // --- Text node extraction --- + { + // Core scenario: a secret is split across formatting tags by the + // rich-text editor. The parser concatenates adjacent text nodes. + name: "secret split across span tags", + chunk: &sources.Chunk{Data: []byte(`

AKIA1234567890ABCDEF

`)}, + want: "AKIA1234567890ABCDEF", + }, + { + // Confluence/Teams encode '&' as '&'. The HTML parser + // automatically unescapes entities so detector regexes can match. + name: "HTML entities decoded", + chunk: &sources.Chunk{Data: []byte(`

key=abc&amp;secret=hunter2

`)}, + want: "key=abc&secret=hunter2", + }, + + // --- Attribute value extraction --- + { + // Secrets in href URLs (e.g. tokens in query params). + name: "attribute value extraction - href", + chunk: &sources.Chunk{Data: []byte(`link`)}, + want: "https://api.example.com?token=sk-live-1234\nlink", + }, + { + // Secrets in src URLs (e.g. image CDN tokens). + name: "attribute value extraction - src", + chunk: &sources.Chunk{Data: []byte(``)}, + want: "https://cdn.example.com/img?key=secret123", + }, + { + // Percent-encoded characters in attribute values (%2D -> '-', + // %5F -> '_') are decoded so detectors see the actual secret. + name: "URL-encoded attribute values decoded", + chunk: &sources.Chunk{Data: []byte(`docs`)}, + want: "https://api.example.com?token=sk-live_1234\ndocs", + }, + { + // data-* attributes are often used for JS-consumed config values. + name: "data-* attributes extracted", + chunk: &sources.Chunk{Data: []byte(`
content
`)}, + want: "ghp_abc123def456\ncontent", + }, + { + // src, alt, and title on a single element all extracted. + name: "multiple high-signal attributes on one element", + chunk: &sources.Chunk{Data: []byte(`secret: abc123`)}, + want: "https://api.com/img?key=k1\nsecret: abc123\ntoken: def456", + }, + { + // Anchors with href="#" carry no signal and are skipped. + name: "empty href skipped", + chunk: &sources.Chunk{Data: []byte(`click`)}, + want: "click", + }, + { + // Hidden inputs often carry CSRF tokens or API keys. + name: "value attribute on input", + chunk: &sources.Chunk{Data: []byte(``)}, + want: "sk_test_EXAMPLEKEYEXAMPLEKEYEX", + }, + { + // content attributes may carry API keys for client-side SDKs. + name: "meta content attribute", + chunk: &sources.Chunk{Data: []byte(``)}, + want: "pk_live_abcdefghij1234567890", + }, + { + // Form action URLs can embed secrets in query strings. + name: "action attribute on form", + chunk: &sources.Chunk{Data: []byte(`
`)}, + want: "https://api.stripe.com/v1/charges?key=sk_live_123\nPay", + }, + + { + // '+' is a literal character in attribute values (not a space). + // PathUnescape preserves it while still decoding %XX sequences. + name: "plus sign preserved in attribute value", + chunk: &sources.Chunk{Data: []byte(``)}, + want: "sk_test_abc+def/123", + }, + + // --- Script / style / comment content --- + { + // Inline `)}, + want: "hello\nvar secret = \"ghp_abc123def456\";", + }, + { + // CSS can contain secrets in background-image URLs, @import, etc. + name: "style content included", + chunk: &sources.Chunk{Data: []byte(`

text

`)}, + want: "text\nbody { background: url(\"https://cdn.com?key=secret\"); }", + }, + { + // Script following an inline element must NOT concatenate with + // the preceding text; it needs its own newline boundary. + name: "script adjacent to inline text gets boundary", + chunk: &sources.Chunk{Data: []byte(`text`)}, + want: "text\nvar key=\"secret\";", + }, + { + // Style following an inline element must NOT concatenate. + name: "style adjacent to inline text gets boundary", + chunk: &sources.Chunk{Data: []byte(`text`)}, + want: "text\n.x { color: red; }", + }, + { + // Entity-like sequences in script content are raw text and must + // NOT be decoded by the residual entity replacer. + name: "entities in script preserved as raw text", + chunk: &sources.Chunk{Data: []byte(``)}, + want: `var url = "a=1&b=2";`, + }, + { + // Entity-like sequences in style content are raw text. + name: "entities in style preserved as raw text", + chunk: &sources.Chunk{Data: []byte(``)}, + want: `body::after { content: "&copy"; }`, + }, + { + // HTML comments are a common place for debug credentials and + // TODO notes with hardcoded passwords. + name: "HTML comment content included", + chunk: &sources.Chunk{Data: []byte(`

visible

`)}, + want: "visible\nTODO: remove hardcoded password=hunter2", + }, + + // --- Code and pre blocks --- + { + //
<pre>/<code> content is preserved verbatim; these blocks are a
+			// top location for pasted credentials and key exports.
+			name:  "code/pre blocks preserved",
+			chunk: &sources.Chunk{Data: []byte(`
export AWS_SECRET_KEY=wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY
`)}, + want: "export AWS_SECRET_KEY=wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY", + }, + { + // Multi-line PEM private keys in
<pre> blocks with <br/>
line breaks + // are reconstructed with proper newlines for detector matching. + name: "private key in pre block", + chunk: &sources.Chunk{Data: []byte(`
-----BEGIN RSA PRIVATE KEY-----
MIIEpAIBAAKCAQEA04up8h
-----END RSA PRIVATE KEY-----
`)}, + want: "-----BEGIN RSA PRIVATE KEY-----\nMIIEpAIBAAKCAQEA04up8h\n-----END RSA PRIVATE KEY-----", + }, + + // --- Whitespace and token boundaries --- + { + // Block elements (

) produce newline boundaries so adjacent + // paragraphs don't merge tokens. + name: "block elements produce newlines", + chunk: &sources.Chunk{Data: []byte(`

first

second

`)}, + want: "first\nsecond", + }, + { + // All
variants produce newlines. + name: "br tags produce newlines", + chunk: &sources.Chunk{Data: []byte(`

line1
line2
line3

`)}, + want: "line1\nline2\nline3", + }, + { + // Nested inline elements (, ) do not break the token; + // text flows continuously so "token=sk-live-abc123" stays intact. + name: "nested inline elements preserve text continuity", + chunk: &sources.Chunk{Data: []byte(`

token=sk-live-abc123

`)}, + want: "token=sk-live-abc123", + }, + { + // elements are block-level: each cell gets its own line, + // keeping key/value pairs from merging. + name: "table with secrets", + chunk: &sources.Chunk{Data: []byte( + `` + + `
API KeyAKIAIOSFODNN7EXAMPLE
SecretwJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY
`, + )}, + want: "API Key\nAKIAIOSFODNN7EXAMPLE\nSecret\nwJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY", + }, + { + // Even without wrappers, still inserts block boundaries. + name: "td cells without enclosing tr still get block boundaries", + chunk: &sources.Chunk{Data: []byte( + `
keyvalue
`, + )}, + want: "key\nvalue", + }, + { + //
  • elements produce separate lines. + name: "list items produce separate lines", + chunk: &sources.Chunk{Data: []byte( + `
    • token: abc123
    • secret: def456
    `, + )}, + want: "token: abc123\nsecret: def456", + }, + + // --- Real-world source formats --- + { + // Confluence storage format: secrets split across tags, + // an AWS key in plain text, and an href with a URL. Exercises text + // node concatenation, attribute extraction, and block boundaries + // together. + name: "confluence storage format - real world", + chunk: &sources.Chunk{Data: []byte( + `

    Our API credentials:

    ` + + `

    Key: AKIAIOSFODNN7EXAMPLE

    ` + + `

    Secret: wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY

    ` + + `

    See AWS Console

    `, + )}, + want: "Our API credentials:\nKey: AKIAIOSFODNN7EXAMPLE\nSecret: wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY\nSee\nhttps://console.aws.amazon.com\nAWS Console", + }, + { + // Teams message HTML: nested
    wrappers around

    tags + // containing a GitHub PAT. Verifies that redundant block wrappers + // collapse to clean newlines. + name: "teams message HTML - real world", + chunk: &sources.Chunk{Data: []byte( + `

    ` + + `

    Here is the token for the staging env:

    ` + + `

    ghp_ABCDEFghijklmnop1234567890abcde

    ` + + `
    `, + )}, + want: "Here is the token for the staging env:\nghp_ABCDEFghijklmnop1234567890abcde", + }, + + // --- Syntax highlight boundary detection --- + { + // Teams renders code blocks as adjacent elements within a + // single

    , using highlight.js classes for syntax coloring. + // Newlines from the original code are lost. The decoder detects + // hljs-* classes and inserts newlines at those boundaries while + // still concatenating non-hljs sibling spans (preserving + // mid-token color splits like the value below split across 3 spans). + name: "teams code block with hljs syntax highlighting", + chunk: &sources.Chunk{Data: []byte( + `

    ` + + `[header]` + + `key_one` + + ` = FIRST_VALUE_ABCDEFGH` + + `key_two` + + ` = SECOND_VAL_PART_` + + `X` + + `_END_OF_VALUE` + + `format` + + ` = json` + + `

    `, + )}, + want: "[header]\nkey_one = FIRST_VALUE_ABCDEFGH\nkey_two = SECOND_VAL_PART_X_END_OF_VALUE\nformat = json", + }, + { + // Spans without hljs classes must still concatenate, preserving + // the existing split-secret behavior even when hljs spans are + // present elsewhere in the document. + name: "non-hljs sibling spans still concatenate", + chunk: &sources.Chunk{Data: []byte( + `

    SECRET_FIRST_HALF_1234

    `, + )}, + want: "SECRET_FIRST_HALF_1234", + }, + { + // Various hljs-* class names (not just hljs-function) should + // all trigger line boundaries. + name: "multiple hljs class variants trigger boundaries", + chunk: &sources.Chunk{Data: []byte( + `

    ` + + `const` + + ` x = ` + + `"value_one"` + + `const` + + ` y = ` + + `"value_two"` + + `

    `, + )}, + want: "const x =\n\"value_one\"\nconst y =\n\"value_two\"", + }, + + // --- Zero-width / invisible character stripping --- + { + // Zero-width spaces inserted between characters by rich text editors + // are stripped so detector regexes can match the full token. + name: "zero-width space stripped from secret", + chunk: &sources.Chunk{Data: []byte("

    TOKEN_\u200BABCDEF_1234

    ")}, + want: "TOKEN_ABCDEF_1234", + }, + { + // Multiple invisible codepoint types mixed into a single token. + name: "multiple invisible character types stripped", + chunk: &sources.Chunk{Data: []byte("

    SECRET\u200C_VALUE\u00AD_HERE\u2060_END\uFEFF

    ")}, + want: "SECRET_VALUE_HERE_END", + }, + + // --- SVG xlink:href attribute extraction --- + { + // SVG elements use xlink:href for URLs which may contain tokens. + name: "xlink:href extracted from SVG element", + chunk: &sources.Chunk{Data: []byte(`icon`)}, + want: "https://api.example.com?token=secret_value_123\nicon", + }, + + // --- Double-encoded HTML entity decoding --- + { + // Content double-encoded as &amp; becomes & after the parser's + // first pass; the residual entity replacer decodes it to &. + name: "double-encoded ampersand decoded", + chunk: &sources.Chunk{Data: []byte(`

key=abc&amp;amp;secret=val

    `)}, + want: "key=abc&secret=val", + }, + { + // Single-encoded entities are handled by the parser; verify the + // residual replacer does not corrupt already-decoded content. + name: "single-encoded entities not double-decoded", + chunk: &sources.Chunk{Data: []byte(`

5 &gt; 3 &amp; 2 &lt; 4

    `)}, + want: "5 > 3 & 2 < 4", + }, + + // --- Integration: all extraction types in one chunk --- + { + // Combines text nodes (split across spans), URL-decoded attribute + // values, inline script content, and an HTML comment -- all in a + // single chunk. Verifies the decoder handles the full extraction + // surface simultaneously. + name: "mixed content with all extraction types", + chunk: &sources.Chunk{Data: []byte( + `

    API key: AKIA1234567890ABCDEF

    ` + + `

    See docs

    ` + + `` + + ``, + )}, + want: "API key: AKIA1234567890ABCDEF\nSee\nhttps://api.example.com?token=sk-live_1234\ndocs\nvar secret = \"ghp_abc123def456\";\nTODO: remove hardcoded password=hunter2", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + feature.HTMLDecoderEnabled.Store(true) + defer feature.HTMLDecoderEnabled.Store(false) + + d := &HTML{} + got := d.FromChunk(tt.chunk) + + if tt.wantNil { + if got != nil { + t.Errorf("FromChunk() = %q, want nil", string(got.Chunk.Data)) + } + return + } + + if got == nil { + t.Fatalf("FromChunk() returned nil, want %q", tt.want) + } + if got.DecoderType != detectorspb.DecoderType_HTML { + t.Errorf("DecoderType = %v, want %v", got.DecoderType, detectorspb.DecoderType_HTML) + } + if string(got.Chunk.Data) != tt.want { + t.Errorf("FromChunk() data =\n%q\nwant:\n%q", string(got.Chunk.Data), tt.want) + } + }) + } +} + +// TestHTML_FeatureFlagDisabled verifies that the decoder is a no-op when +// feature.HTMLDecoderEnabled is false. +func TestHTML_FeatureFlagDisabled(t *testing.T) { + feature.HTMLDecoderEnabled.Store(false) + d := &HTML{} + chunk := &sources.Chunk{Data: []byte(`

    secret: hunter2

    `)} + if got := d.FromChunk(chunk); got != nil { + t.Errorf("FromChunk() should return nil when disabled, got %q", string(got.Chunk.Data)) + } +} + +// TestHTML_FeatureFlagEnabled verifies that the decoder processes HTML normally +// when feature.HTMLDecoderEnabled is true. +func TestHTML_FeatureFlagEnabled(t *testing.T) { + feature.HTMLDecoderEnabled.Store(true) + defer feature.HTMLDecoderEnabled.Store(false) + + d := &HTML{} + chunk := &sources.Chunk{Data: []byte(`

    secret: hunter2

    `)} + got := d.FromChunk(chunk) + if got == nil { + t.Fatal("FromChunk() returned nil, want decoded chunk") + } + if string(got.Chunk.Data) != "secret: hunter2" { + t.Errorf("FromChunk() data = %q, want %q", string(got.Chunk.Data), "secret: hunter2") + } +} + +// TestLooksLikeHTML verifies the fast heuristic that decides whether chunk data +// is worth parsing as HTML. It must accept valid HTML tags (including self-closing +// and attribute-bearing) while rejecting plain text, arithmetic comparisons, and +// bare HTML entities -- all of which could appear in non-HTML source content. +func TestLooksLikeHTML(t *testing.T) { + tests := []struct { + name string + data string + want bool + }{ + {"simple tag", "

    hello

    ", true}, + {"self-closing", "
    ", true}, + {"with attributes", `
    `, true}, + {"plain text", "no html here", false}, + {"angle brackets but not HTML", "5 < 10 and 20 > 15", false}, + {"XML-like", "content", true}, + {"just less-than", "a < b", false}, + {"html entity only", "& <", false}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := looksLikeHTML([]byte(tt.data)); got != tt.want { + t.Errorf("looksLikeHTML(%q) = %v, want %v", tt.data, got, tt.want) + } + }) + } +} diff --git a/pkg/feature/feature.go b/pkg/feature/feature.go index 080788c0218c..3aa92a3759a6 100644 --- a/pkg/feature/feature.go +++ b/pkg/feature/feature.go @@ -15,6 +15,7 @@ var ( UseGitMirror atomic.Bool GitlabProjectsPerPage atomic.Int64 UseGithubGraphQLAPI atomic.Bool // use github graphql api to fetch issues, pr's and comments + HTMLDecoderEnabled atomic.Bool ) type AtomicString struct { diff --git a/pkg/pb/detectorspb/detectors.pb.go b/pkg/pb/detectorspb/detectors.pb.go index 3ec2468eb95d..b942e1d75dc7 100644 --- a/pkg/pb/detectorspb/detectors.pb.go +++ b/pkg/pb/detectorspb/detectors.pb.go @@ -28,6 +28,7 @@ const ( DecoderType_BASE64 DecoderType = 2 DecoderType_UTF16 DecoderType = 3 DecoderType_ESCAPED_UNICODE DecoderType = 4 + DecoderType_HTML DecoderType = 5 ) // Enum value maps for DecoderType. @@ -38,6 +39,7 @@ var ( 2: "BASE64", 3: "UTF16", 4: "ESCAPED_UNICODE", + 5: "HTML", } DecoderType_value = map[string]int32{ "UNKNOWN": 0, @@ -45,6 +47,7 @@ var ( "BASE64": 2, "UTF16": 3, "ESCAPED_UNICODE": 4, + "HTML": 5, } ) diff --git a/proto/detectors.proto b/proto/detectors.proto index 88829dd17d3c..076b848a6c09 100644 --- a/proto/detectors.proto +++ b/proto/detectors.proto @@ -10,6 +10,7 @@ enum DecoderType { BASE64 = 2; UTF16 = 3; ESCAPED_UNICODE = 4; + HTML = 5; } enum DetectorType {