From cd28c03e4da0d591d18b0f3123e0a6fd8dffdacb Mon Sep 17 00:00:00 2001
From: Your Name <drew.lafiandra@trufflesec.com>
Date: Wed, 25 Mar 2026 13:37:25 -0700
Subject: [PATCH 1/5] Add HTML decoder for secret detection in HTML-formatted
 sources

Sources like MS Teams and Confluence emit HTML rather than plain text,
causing secrets split across tags or embedded in attributes to be missed.
This adds an HTML decoder to the pipeline that extracts text nodes,
high-signal attribute values, script/style/comment content, and code blocks.
It handles syntax-highlight boundary detection, zero-width character stripping,
and double-encoded HTML entity decoding.

Made-with: Cursor
---
 pkg/decoders/decoders.go           |   1 +
 pkg/decoders/html.go               | 251 +++++++++++++++++
 pkg/decoders/html_test.go          | 433 +++++++++++++++++++++++++++++
 pkg/pb/detectorspb/detectors.pb.go |   3 +
 proto/detectors.proto              |   1 +
 5 files changed, 689 insertions(+)
 create mode 100644 pkg/decoders/html.go
 create mode 100644 pkg/decoders/html_test.go

diff --git a/pkg/decoders/decoders.go b/pkg/decoders/decoders.go
index c49eee403ff2..3cac3c36c833 100644
--- a/pkg/decoders/decoders.go
+++ b/pkg/decoders/decoders.go
@@ -12,6 +12,7 @@ func DefaultDecoders() []Decoder {
 		&Base64{},
 		&UTF16{},
 		&EscapedUnicode{},
+		&HTML{},
 	}
 }
 
diff --git a/pkg/decoders/html.go b/pkg/decoders/html.go
new file mode 100644
index 000000000000..c3f782ed8f3a
--- /dev/null
+++ b/pkg/decoders/html.go
@@ -0,0 +1,251 @@
+package decoders
+
+import (
+	"bytes"
+	"net/url"
+	"regexp"
+	"strings"
+
+	"golang.org/x/net/html"
+
+	"github.com/trufflesecurity/trufflehog/v3/pkg/pb/detectorspb"
+	"github.com/trufflesecurity/trufflehog/v3/pkg/sources"
+)
+
+// HTML is a decoder that extracts textual content from HTML documents.
+// It produces a normalized view containing visible text, attribute values,
+// script/style content, and HTML comments with entities and URL-encoding decoded.
+type HTML struct {
+	// Enabled controls whether the decoder is active. When nil, the decoder
+	// is always active. Inject a function that checks a feature flag to
+	// allow dynamic toggling without restarting the scanner.
+	Enabled func() bool
+}
+
+func (d *HTML) Type() detectorspb.DecoderType {
+	return detectorspb.DecoderType_HTML
+}
+
+var htmlTagPattern = regexp.MustCompile(`<[a-zA-Z][a-zA-Z0-9]*[\s>/]`)
+
+// highSignalAttrs are attribute names whose values are extracted into the
+// decoded output because they commonly contain URLs, tokens, or other secrets.
+var highSignalAttrs = map[string]bool{
+	"href":       true,
+	"src":        true,
+	"action":     true,
+	"value":      true,
+	"content":    true,
+	"alt":        true,
+	"title":      true,
+	"xlink:href": true,
+}
+
+// syntaxHighlightPrefixes lists CSS class prefixes used by syntax highlighting
+// libraries. Elements with these classes mark logical line boundaries in code
+// blocks where the platform (e.g. Teams) strips actual newlines.
+var syntaxHighlightPrefixes = []string{"hljs-"}
+
+// residualEntityReplacer decodes common HTML entities that survive double-encoding.
+// When content is entity-encoded twice (e.g. &amp;amp;), the parser's first pass
+// leaves residual entity sequences that this replacer cleans up.
+var residualEntityReplacer = strings.NewReplacer(
+	"&amp;", "&",
+	"&lt;", "<",
+	"&gt;", ">",
+	"&quot;", `"`,
+	"&#39;", "'",
+	"&apos;", "'",
+)
+
+// invisibleReplacer strips zero-width and invisible Unicode codepoints that
+// rich text editors may insert between characters, breaking detector regexes.
+var invisibleReplacer = strings.NewReplacer(
+	"\u200B", "", // zero-width space
+	"\u200C", "", // zero-width non-joiner
+	"\u200D", "", // zero-width joiner
+	"\uFEFF", "", // byte order mark / zero-width no-break space
+	"\u00AD", "", // soft hyphen
+	"\u2060", "", // word joiner
+	"\u200E", "", // left-to-right mark
+	"\u200F", "", // right-to-left mark
+)
+
+// blockElements insert newline boundaries when encountered during extraction.
+var blockElements = map[string]bool{
+	"p": true, "div": true, "br": true, "hr": true,
+	"h1": true, "h2": true, "h3": true, "h4": true, "h5": true, "h6": true,
+	"li": true, "ol": true, "ul": true,
+	"tr": true, "td": true, "th": true, "table": true, "thead": true, "tbody": true, "tfoot": true,
+	"blockquote": true, "section": true, "article": true, "header": true, "footer": true,
+	"pre": true, "address": true, "figcaption": true, "figure": true,
+	"details": true, "summary": true, "main": true, "nav": true, "aside": true,
+	"form": true, "fieldset": true, "legend": true,
+	"dd": true, "dt": true, "dl": true,
+}
+
+func (d *HTML) FromChunk(chunk *sources.Chunk) *DecodableChunk {
+	if d.Enabled != nil && !d.Enabled() {
+		return nil
+	}
+	if chunk == nil || len(chunk.Data) == 0 {
+		return nil
+	}
+
+	if !looksLikeHTML(chunk.Data) {
+		return nil
+	}
+
+	extracted := extractHTML(chunk.Data)
+	if len(extracted) == 0 {
+		return nil
+	}
+
+	if bytes.Equal(chunk.Data, extracted) {
+		return nil
+	}
+
+	chunk.Data = extracted
+	return &DecodableChunk{Chunk: chunk, DecoderType: d.Type()}
+}
+
+func looksLikeHTML(data []byte) bool {
+	return htmlTagPattern.Match(data)
+}
+
+func extractHTML(data []byte) []byte {
+	doc, err := html.Parse(bytes.NewReader(data))
+	if err != nil {
+		return nil
+	}
+
+	var buf bytes.Buffer
+	buf.Grow(len(data))
+
+	walkNode(&buf, doc)
+
+	result := stripInvisible(buf.Bytes())
+	result = decodeResidualEntities(result)
+	return normalizeWhitespace(result)
+}
+
+func walkNode(buf *bytes.Buffer, n *html.Node) {
+	switch n.Type {
+	case html.TextNode:
+		text := n.Data
+		if text != "" {
+			buf.WriteString(text)
+		}
+
+	case html.CommentNode:
+		if content := strings.TrimSpace(n.Data); content != "" {
+			ensureNewline(buf)
+			buf.WriteString(content)
+			ensureNewline(buf)
+		}
+
+	case html.ElementNode:
+		isBlock := blockElements[n.Data]
+
+		if isBlock {
+			ensureNewline(buf)
+		} else if hasSyntaxHighlightClass(n) {
+			ensureNewline(buf)
+		}
+
+		emitAttributes(buf, n)
+
+		for c := n.FirstChild; c != nil; c = c.NextSibling {
+			walkNode(buf, c)
+		}
+
+		if isBlock || n.Data == "br" {
+			ensureNewline(buf)
+		}
+
+	default:
+		for c := n.FirstChild; c != nil; c = c.NextSibling {
+			walkNode(buf, c)
+		}
+	}
+}
+
+func hasSyntaxHighlightClass(n *html.Node) bool {
+	for _, attr := range n.Attr {
+		if attr.Key != "class" {
+			continue
+		}
+		for _, cls := range strings.Fields(attr.Val) {
+			for _, prefix := range syntaxHighlightPrefixes {
+				if strings.HasPrefix(cls, prefix) {
+					return true
+				}
+			}
+		}
+	}
+	return false
+}
+
+func emitAttributes(buf *bytes.Buffer, n *html.Node) {
+	for _, attr := range n.Attr {
+		isDataAttr := strings.HasPrefix(attr.Key, "data-")
+		if !highSignalAttrs[attr.Key] && !isDataAttr {
+			continue
+		}
+		val := strings.TrimSpace(attr.Val)
+		if val == "" || val == "#" {
+			continue
+		}
+		decoded, err := url.QueryUnescape(val)
+		if err == nil && decoded != val {
+			val = decoded
+		}
+		ensureNewline(buf)
+		buf.WriteString(val)
+		ensureNewline(buf)
+	}
+}
+
+func ensureNewline(buf *bytes.Buffer) {
+	if buf.Len() == 0 {
+		return
+	}
+	if buf.Bytes()[buf.Len()-1] != '\n' {
+		buf.WriteByte('\n')
+	}
+}
+
+func stripInvisible(data []byte) []byte {
+	return []byte(invisibleReplacer.Replace(string(data)))
+}
+
+func decodeResidualEntities(data []byte) []byte {
+	s := string(data)
+	decoded := residualEntityReplacer.Replace(s)
+	if decoded == s {
+		return data
+	}
+	return []byte(decoded)
+}
+
+// normalizeWhitespace trims every line, drops leading/trailing blank lines, and collapses blank-line runs to one.
+func normalizeWhitespace(data []byte) []byte {
+	lines := bytes.Split(data, []byte("\n"))
+	var result [][]byte
+	prevBlank := true
+	for _, line := range lines {
+		trimmed := bytes.TrimSpace(line)
+		if len(trimmed) == 0 {
+			if !prevBlank {
+				prevBlank = true
+			}
+			continue
+		}
+		if prevBlank && len(result) > 0 {
+			result = append(result, []byte(""))
+		}
+		result = append(result, trimmed)
+		prevBlank = false
+	}
+	return bytes.Join(result, []byte("\n"))
+}
diff --git a/pkg/decoders/html_test.go b/pkg/decoders/html_test.go
new file mode 100644
index 000000000000..8d9245851bd9
--- /dev/null
+++ b/pkg/decoders/html_test.go
@@ -0,0 +1,433 @@
+package decoders
+
+import (
+	"testing"
+
+	"github.com/trufflesecurity/trufflehog/v3/pkg/pb/detectorspb"
+	"github.com/trufflesecurity/trufflehog/v3/pkg/sources"
+)
+
+func TestHTML_Type(t *testing.T) {
+	d := &HTML{}
+	if got := d.Type(); got != detectorspb.DecoderType_HTML {
+		t.Errorf("Type() = %v, want %v", got, detectorspb.DecoderType_HTML)
+	}
+}
+
+// TestHTML_FromChunk verifies the HTML decoder extracts secrets from HTML content
+// that sources like MS Teams and Confluence emit. The test cases are grouped by
+// the category of extraction they exercise:
+//
+//   - Guard clauses: nil, empty, and non-HTML input return nil.
+//   - Text node extraction: secrets split across inline tags are rejoined;
+//     HTML entities (&amp;) are decoded by the parser.
+//   - Attribute value extraction: high-signal attrs (href, src, data-*, value,
+//     content, alt, title, action) are emitted; URL percent-encoding is decoded;
+//     empty/anchor-only hrefs are skipped.
+//   - Script / style / comment content: all included because they frequently
+//     contain embedded credentials.
+//   - Code and pre blocks: preserved verbatim (common secret location).
+//   - Whitespace and token boundaries: block elements (p, div, br, tr, td, li)
+//     insert newlines; inline elements preserve text continuity to avoid
+//     accidental token joins.
+//   - Real-world formats: Confluence storage-format HTML and Teams message HTML
+//     with secrets in typical positions.
+//   - Integration: a mixed-content case exercises text nodes, URL-decoded attrs,
+//     script content, and HTML comments in a single chunk.
+func TestHTML_FromChunk(t *testing.T) {
+	tests := []struct {
+		name    string
+		chunk   *sources.Chunk
+		want    string
+		wantNil bool
+	}{
+		// --- Guard clauses: decoder returns nil for non-applicable input ---
+		{
+			name:    "nil chunk",
+			chunk:   nil,
+			wantNil: true,
+		},
+		{
+			name:    "empty data",
+			chunk:   &sources.Chunk{Data: []byte{}},
+			wantNil: true,
+		},
+		{
+			name:    "plain text (no HTML)",
+			chunk:   &sources.Chunk{Data: []byte("just some plain text with no tags")},
+			wantNil: true,
+		},
+
+		// --- Text node extraction ---
+		{
+			// Core scenario: a secret is split across formatting tags by the
+			// rich-text editor. The parser concatenates adjacent text nodes.
+			name:  "secret split across span tags",
+			chunk: &sources.Chunk{Data: []byte(`<p><span>AKIA</span><span>1234567890ABCDEF</span></p>`)},
+			want:  "AKIA1234567890ABCDEF",
+		},
+		{
+			// Confluence/Teams encode '&' as '&amp;'. The HTML parser
+			// automatically unescapes entities so detector regexes can match.
+			name:  "HTML entities decoded",
+			chunk: &sources.Chunk{Data: []byte(`<p>key=abc&amp;secret=hunter2</p>`)},
+			want:  "key=abc&secret=hunter2",
+		},
+
+		// --- Attribute value extraction ---
+		{
+			// Secrets in href URLs (e.g. tokens in query params).
+			name:  "attribute value extraction - href",
+			chunk: &sources.Chunk{Data: []byte(`<a href="https://api.example.com?token=sk-live-1234">link</a>`)},
+			want:  "https://api.example.com?token=sk-live-1234\nlink",
+		},
+		{
+			// Secrets in src URLs (e.g. image CDN tokens).
+			name:  "attribute value extraction - src",
+			chunk: &sources.Chunk{Data: []byte(`<img src="https://cdn.example.com/img?key=secret123"/>`)},
+			want:  "https://cdn.example.com/img?key=secret123",
+		},
+		{
+			// Percent-encoded characters in attribute values (%2D -> '-',
+			// %5F -> '_') are decoded so detectors see the actual secret.
+			name:  "URL-encoded attribute values decoded",
+			chunk: &sources.Chunk{Data: []byte(`<a href="https://api.example.com?token=sk%2Dlive%5F1234">docs</a>`)},
+			want:  "https://api.example.com?token=sk-live_1234\ndocs",
+		},
+		{
+			// data-* attributes are often used for JS-consumed config values.
+			name:  "data-* attributes extracted",
+			chunk: &sources.Chunk{Data: []byte(`<div data-api-key="ghp_abc123def456">content</div>`)},
+			want:  "ghp_abc123def456\ncontent",
+		},
+		{
+			// src, alt, and title on a single element all extracted.
+			name:  "multiple high-signal attributes on one element",
+			chunk: &sources.Chunk{Data: []byte(`<img src="https://api.com/img?key=k1" alt="secret: abc123" title="token: def456"/>`)},
+			want:  "https://api.com/img?key=k1\nsecret: abc123\ntoken: def456",
+		},
+		{
+			// Anchors with href="#" carry no signal and are skipped.
+			name:  "empty href skipped",
+			chunk: &sources.Chunk{Data: []byte(`<a href="#">click</a>`)},
+			want:  "click",
+		},
+		{
+			// Hidden inputs often carry CSRF tokens or API keys.
+			name:  "value attribute on input",
+			chunk: &sources.Chunk{Data: []byte(`<input type="hidden" value="sk_test_EXAMPLEKEYEXAMPLEKEYEX"/>`)},
+			want:  "sk_test_EXAMPLEKEYEXAMPLEKEYEX",
+		},
+		{
+			// <meta> content attributes may carry API keys for client-side SDKs.
+			name:  "meta content attribute",
+			chunk: &sources.Chunk{Data: []byte(`<meta name="api-key" content="pk_live_abcdefghij1234567890"/>`)},
+			want:  "pk_live_abcdefghij1234567890",
+		},
+		{
+			// Form action URLs can embed secrets in query strings.
+			name:  "action attribute on form",
+			chunk: &sources.Chunk{Data: []byte(`<form action="https://api.stripe.com/v1/charges?key=sk_live_123"><button>Pay</button></form>`)},
+			want:  "https://api.stripe.com/v1/charges?key=sk_live_123\nPay",
+		},
+
+		// --- Script / style / comment content ---
+		{
+			// Inline <script> blocks frequently contain API keys, tokens,
+			// and configuration objects with secrets.
+			name:  "script content included",
+			chunk: &sources.Chunk{Data: []byte(`<p>hello</p><script>var secret = "ghp_abc123def456";</script>`)},
+			want:  "hello\nvar secret = \"ghp_abc123def456\";",
+		},
+		{
+			// CSS can contain secrets in background-image URLs, @import, etc.
+			name:  "style content included",
+			chunk: &sources.Chunk{Data: []byte(`<p>text</p><style>body { background: url("https://cdn.com?key=secret"); }</style>`)},
+			want:  "text\nbody { background: url(\"https://cdn.com?key=secret\"); }",
+		},
+		{
+			// HTML comments are a common place for debug credentials and
+			// TODO notes with hardcoded passwords.
+			name:  "HTML comment content included",
+			chunk: &sources.Chunk{Data: []byte(`<p>visible</p><!-- TODO: remove hardcoded password=hunter2 -->`)},
+			want:  "visible\nTODO: remove hardcoded password=hunter2",
+		},
+
+		// --- Code and pre blocks ---
+		{
+			// <pre>/<code> text is kept (only per-line whitespace is trimmed);
+			// these blocks are a top location for pasted credentials and key exports.
+			name:  "code/pre blocks preserved",
+			chunk: &sources.Chunk{Data: []byte(`<pre><code>export AWS_SECRET_KEY=wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY</code></pre>`)},
+			want:  "export AWS_SECRET_KEY=wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY",
+		},
+		{
+			// Multi-line PEM private keys in <pre> blocks with <br> line breaks
+			// are reconstructed with proper newlines for detector matching.
+			name:  "private key in pre block",
+			chunk: &sources.Chunk{Data: []byte(`<pre>-----BEGIN RSA PRIVATE KEY-----<br>MIIEpAIBAAKCAQEA04up8h<br>-----END RSA PRIVATE KEY-----</pre>`)},
+			want:  "-----BEGIN RSA PRIVATE KEY-----\nMIIEpAIBAAKCAQEA04up8h\n-----END RSA PRIVATE KEY-----",
+		},
+
+		// --- Whitespace and token boundaries ---
+		{
+			// Block elements (<p>) produce newline boundaries so adjacent
+			// paragraphs don't merge tokens.
+			name: "block elements produce newlines",
+			chunk: &sources.Chunk{Data: []byte(`<div><p>first</p><p>second</p></div>`)},
+			want:  "first\nsecond",
+		},
+		{
+			// All <br> variants produce newlines.
+			name:  "br tags produce newlines",
+			chunk: &sources.Chunk{Data: []byte(`<p>line1<br>line2<br/>line3</p>`)},
+			want:  "line1\nline2\nline3",
+		},
+		{
+			// Nested inline elements (<em>, <strong>) do not break the token;
+			// text flows continuously so "token=sk-live-abc123" stays intact.
+			name:  "nested inline elements preserve text continuity",
+			chunk: &sources.Chunk{Data: []byte(`<p>token=<em><strong>sk-live-abc123</strong></em></p>`)},
+			want:  "token=sk-live-abc123",
+		},
+		{
+			// <td> elements are block-level: each cell gets its own line,
+			// keeping key/value pairs from merging.
+			name: "table with secrets",
+			chunk: &sources.Chunk{Data: []byte(
+				`<table><tr><td>API Key</td><td>AKIAIOSFODNN7EXAMPLE</td></tr>` +
+					`<tr><td>Secret</td><td>wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY</td></tr></table>`,
+			)},
+			want: "API Key\nAKIAIOSFODNN7EXAMPLE\nSecret\nwJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY",
+		},
+		{
+			// Even without <tr> wrappers, <td> still inserts block boundaries.
+			name: "td cells without enclosing tr still get block boundaries",
+			chunk: &sources.Chunk{Data: []byte(
+				`<table><td>key</td><td>value</td></table>`,
+			)},
+			want: "key\nvalue",
+		},
+		{
+			// <li> elements produce separate lines.
+			name: "list items produce separate lines",
+			chunk: &sources.Chunk{Data: []byte(
+				`<ul><li>token: abc123</li><li>secret: def456</li></ul>`,
+			)},
+			want: "token: abc123\nsecret: def456",
+		},
+
+		// --- Real-world source formats ---
+		{
+			// Confluence storage format: secrets split across <strong> tags,
+			// an AWS key in plain text, and an href with a URL. Exercises text
+			// node concatenation, attribute extraction, and block boundaries
+			// together.
+			name: "confluence storage format - real world",
+			chunk: &sources.Chunk{Data: []byte(
+				`<p>Our API credentials:</p>` +
+					`<p>Key: <strong>AKIA</strong><strong>IOSFODNN7EXAMPLE</strong></p>` +
+					`<p>Secret: wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY</p>` +
+					`<p>See <a href="https://console.aws.amazon.com">AWS Console</a></p>`,
+			)},
+			want: "Our API credentials:\nKey: AKIAIOSFODNN7EXAMPLE\nSecret: wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY\nSee\nhttps://console.aws.amazon.com\nAWS Console",
+		},
+		{
+			// Teams message HTML: nested <div> wrappers around <p> tags
+			// containing a GitHub PAT. Verifies that redundant block wrappers
+			// collapse to clean newlines.
+			name: "teams message HTML - real world",
+			chunk: &sources.Chunk{Data: []byte(
+				`<div><div>` +
+					`<p>Here is the token for the staging env:</p>` +
+					`<p>ghp_ABCDEFghijklmnop1234567890abcde</p>` +
+					`</div></div>`,
+			)},
+			want: "Here is the token for the staging env:\nghp_ABCDEFghijklmnop1234567890abcde",
+		},
+
+		// --- Syntax highlight boundary detection ---
+		{
+			// Teams renders code blocks as adjacent <span> elements within a
+			// single <p>, using highlight.js classes for syntax coloring.
+			// Newlines from the original code are lost. The decoder detects
+			// hljs-* classes and inserts newlines at those boundaries while
+			// still concatenating non-hljs sibling spans (preserving
+			// mid-token color splits like the value below split across 3 spans).
+			name: "teams code block with hljs syntax highlighting",
+			chunk: &sources.Chunk{Data: []byte(
+				`<p>` +
+					`<span style="color:#1E53A3"><strong>[header]</strong></span>` +
+					`<span class="hljs-function" style="color:#1E53A3">key_one</span>` +
+					`<span> = FIRST_VALUE_ABCDEFGH</span>` +
+					`<span class="hljs-function" style="color:#1E53A3">key_two</span>` +
+					`<span> = SECOND_VAL_PART_</span>` +
+					`<span style="color:#1E53A3">X</span>` +
+					`<span>_END_OF_VALUE</span>` +
+					`<span class="hljs-function" style="color:#1E53A3">format</span>` +
+					`<span> = json</span>` +
+					`</p>`,
+			)},
+			want: "[header]\nkey_one = FIRST_VALUE_ABCDEFGH\nkey_two = SECOND_VAL_PART_X_END_OF_VALUE\nformat = json",
+		},
+		{
+			// Spans without hljs classes must still concatenate, preserving
+			// the existing split-secret behavior even when hljs spans are
+			// present elsewhere in the document.
+			name: "non-hljs sibling spans still concatenate",
+			chunk: &sources.Chunk{Data: []byte(
+				`<p><span style="color:red">SECRET_</span><span>FIRST_HALF_1234</span></p>`,
+			)},
+			want: "SECRET_FIRST_HALF_1234",
+		},
+		{
+			// Various hljs-* class names (not just hljs-function) should
+			// all trigger line boundaries.
+			name: "multiple hljs class variants trigger boundaries",
+			chunk: &sources.Chunk{Data: []byte(
+				`<p>` +
+					`<span class="hljs-keyword">const</span>` +
+					`<span> x = </span>` +
+					`<span class="hljs-string">"value_one"</span>` +
+					`<span class="hljs-keyword">const</span>` +
+					`<span> y = </span>` +
+					`<span class="hljs-string">"value_two"</span>` +
+					`</p>`,
+			)},
+			want: "const x =\n\"value_one\"\nconst y =\n\"value_two\"",
+		},
+
+		// --- Zero-width / invisible character stripping ---
+		{
+			// Zero-width spaces inserted between characters by rich text editors
+			// are stripped so detector regexes can match the full token.
+			name: "zero-width space stripped from secret",
+			chunk: &sources.Chunk{Data: []byte("<p>TOKEN_\u200BABCDEF_1234</p>")},
+			want: "TOKEN_ABCDEF_1234",
+		},
+		{
+			// Multiple invisible codepoint types mixed into a single token.
+			name: "multiple invisible character types stripped",
+			chunk: &sources.Chunk{Data: []byte("<p>SECRET\u200C_VALUE\u00AD_HERE\u2060_END\uFEFF</p>")},
+			want: "SECRET_VALUE_HERE_END",
+		},
+
+		// --- SVG xlink:href attribute extraction ---
+		{
+			// SVG elements use xlink:href for URLs which may contain tokens.
+			name: "xlink:href extracted from SVG element",
+			chunk: &sources.Chunk{Data: []byte(`<svg><a xlink:href="https://api.example.com?token=secret_value_123">icon</a></svg>`)},
+			want: "https://api.example.com?token=secret_value_123\nicon",
+		},
+
+		// --- Double-encoded HTML entity decoding ---
+		{
+			// Content double-encoded as &amp;amp; becomes &amp; after the parser's
+			// first pass; the residual entity replacer decodes it to &.
+			name: "double-encoded ampersand decoded",
+			chunk: &sources.Chunk{Data: []byte(`<p>key=abc&amp;amp;secret=val</p>`)},
+			want: "key=abc&secret=val",
+		},
+		{
+			// Single-encoded entities are handled by the parser; verify the
+			// residual replacer does not corrupt already-decoded content.
+			name: "single-encoded entities not double-decoded",
+			chunk: &sources.Chunk{Data: []byte(`<p>5 &gt; 3 &amp; 2 &lt; 4</p>`)},
+			want: "5 > 3 & 2 < 4",
+		},
+
+		// --- Integration: all extraction types in one chunk ---
+		{
+			// Combines text nodes (split across spans), URL-decoded attribute
+			// values, inline script content, and an HTML comment -- all in a
+			// single chunk. Verifies the decoder handles the full extraction
+			// surface simultaneously.
+			name: "mixed content with all extraction types",
+			chunk: &sources.Chunk{Data: []byte(
+				`<p>API key: <span style="color:red">AKIA</span><span>1234567890ABCDEF</span></p>` +
+					`<p>See <a href="https://api.example.com?token=sk%2Dlive%5F1234">docs</a></p>` +
+					`<script>var secret = "ghp_abc123def456";</script>` +
+					`<!-- TODO: remove hardcoded password=hunter2 -->`,
+			)},
+			want: "API key: AKIA1234567890ABCDEF\nSee\nhttps://api.example.com?token=sk-live_1234\ndocs\nvar secret = \"ghp_abc123def456\";\nTODO: remove hardcoded password=hunter2",
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			d := &HTML{}
+			got := d.FromChunk(tt.chunk)
+
+			if tt.wantNil {
+				if got != nil {
+					t.Errorf("FromChunk() = %q, want nil", string(got.Chunk.Data))
+				}
+				return
+			}
+
+			if got == nil {
+				t.Fatalf("FromChunk() returned nil, want %q", tt.want)
+			}
+			if got.DecoderType != detectorspb.DecoderType_HTML {
+				t.Errorf("DecoderType = %v, want %v", got.DecoderType, detectorspb.DecoderType_HTML)
+			}
+			if string(got.Chunk.Data) != tt.want {
+				t.Errorf("FromChunk() data =\n%q\nwant:\n%q", string(got.Chunk.Data), tt.want)
+			}
+		})
+	}
+}
+
+// TestHTML_FeatureFlagDisabled verifies that the decoder is a no-op when the
+// Enabled callback returns false, allowing the HTMLDecoder feature flag to
+// gate the decoder at runtime without removing it from the pipeline.
+func TestHTML_FeatureFlagDisabled(t *testing.T) {
+	d := &HTML{Enabled: func() bool { return false }}
+	chunk := &sources.Chunk{Data: []byte(`<p>secret: hunter2</p>`)}
+	if got := d.FromChunk(chunk); got != nil {
+		t.Errorf("FromChunk() should return nil when disabled, got %q", string(got.Chunk.Data))
+	}
+}
+
+// TestHTML_FeatureFlagEnabled verifies that the decoder processes HTML normally
+// when the Enabled callback returns true.
+func TestHTML_FeatureFlagEnabled(t *testing.T) {
+	d := &HTML{Enabled: func() bool { return true }}
+	chunk := &sources.Chunk{Data: []byte(`<p>secret: hunter2</p>`)}
+	got := d.FromChunk(chunk)
+	if got == nil {
+		t.Fatal("FromChunk() returned nil, want decoded chunk")
+	}
+	if string(got.Chunk.Data) != "secret: hunter2" {
+		t.Errorf("FromChunk() data = %q, want %q", string(got.Chunk.Data), "secret: hunter2")
+	}
+}
+
+// TestLooksLikeHTML verifies the fast heuristic that decides whether chunk data
+// is worth parsing as HTML. It must accept valid HTML tags (including self-closing
+// and attribute-bearing) while rejecting plain text, arithmetic comparisons, and
+// bare HTML entities -- all of which could appear in non-HTML source content.
+func TestLooksLikeHTML(t *testing.T) {
+	tests := []struct {
+		name string
+		data string
+		want bool
+	}{
+		{"simple tag", "<p>hello</p>", true},
+		{"self-closing", "<br/>", true},
+		{"with attributes", `<div class="foo">`, true},
+		{"plain text", "no html here", false},
+		{"angle brackets but not HTML", "5 < 10 and 20 > 15", false},
+		{"XML-like", "<root>content</root>", true},
+		{"just less-than", "a < b", false},
+		{"html entity only", "&amp; &lt;", false},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			if got := looksLikeHTML([]byte(tt.data)); got != tt.want {
+				t.Errorf("looksLikeHTML(%q) = %v, want %v", tt.data, got, tt.want)
+			}
+		})
+	}
+}
diff --git a/pkg/pb/detectorspb/detectors.pb.go b/pkg/pb/detectorspb/detectors.pb.go
index 3ec2468eb95d..b942e1d75dc7 100644
--- a/pkg/pb/detectorspb/detectors.pb.go
+++ b/pkg/pb/detectorspb/detectors.pb.go
@@ -28,6 +28,7 @@ const (
 	DecoderType_BASE64          DecoderType = 2
 	DecoderType_UTF16           DecoderType = 3
 	DecoderType_ESCAPED_UNICODE DecoderType = 4
+	DecoderType_HTML            DecoderType = 5
 )
 
 // Enum value maps for DecoderType.
@@ -38,6 +39,7 @@ var (
 		2: "BASE64",
 		3: "UTF16",
 		4: "ESCAPED_UNICODE",
+		5: "HTML",
 	}
 	DecoderType_value = map[string]int32{
 		"UNKNOWN":         0,
@@ -45,6 +47,7 @@ var (
 		"BASE64":          2,
 		"UTF16":           3,
 		"ESCAPED_UNICODE": 4,
+		"HTML":            5,
 	}
 )
 
diff --git a/proto/detectors.proto b/proto/detectors.proto
index 88829dd17d3c..076b848a6c09 100644
--- a/proto/detectors.proto
+++ b/proto/detectors.proto
@@ -10,6 +10,7 @@ enum DecoderType {
   BASE64 = 2;
   UTF16 = 3;
   ESCAPED_UNICODE = 4;
+  HTML = 5;
 }
 
 enum DetectorType {

From e4330922d83a1f6a897fed617a41f7163054c740 Mon Sep 17 00:00:00 2001
From: Your Name <drew.lafiandra@trufflesec.com>
Date: Wed, 1 Apr 2026 13:01:46 -0700
Subject: [PATCH 2/5] Fix dead code and plus-sign corruption in HTML decoder

- Remove unreachable "xlink:href" map entry: the html parser splits
  namespace-prefixed attributes into separate Namespace/Key fields,
  so attr.Key is "href" (already in the map), never "xlink:href".
- Switch url.QueryUnescape to url.PathUnescape: QueryUnescape converts
  '+' to space per form-encoding spec, corrupting secrets that contain
  literal '+' characters (e.g. base64 values, API keys).

Made-with: Cursor
---
 pkg/decoders/html.go      | 7 +++----
 pkg/decoders/html_test.go | 8 ++++++++
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/pkg/decoders/html.go b/pkg/decoders/html.go
index c3f782ed8f3a..016bacb0f87d 100644
--- a/pkg/decoders/html.go
+++ b/pkg/decoders/html.go
@@ -36,9 +36,8 @@ var highSignalAttrs = map[string]bool{
 	"action":     true,
 	"value":      true,
 	"content":    true,
-	"alt":        true,
-	"title":      true,
-	"xlink:href": true,
+	"alt":   true,
+	"title": true,
 }
 
 // syntaxHighlightPrefixes lists CSS class prefixes used by syntax highlighting
@@ -196,7 +195,7 @@ func emitAttributes(buf *bytes.Buffer, n *html.Node) {
 		if val == "" || val == "#" {
 			continue
 		}
-		decoded, err := url.QueryUnescape(val)
+		decoded, err := url.PathUnescape(val)
 		if err == nil && decoded != val {
 			val = decoded
 		}
diff --git a/pkg/decoders/html_test.go b/pkg/decoders/html_test.go
index 8d9245851bd9..6a8205555520 100644
--- a/pkg/decoders/html_test.go
+++ b/pkg/decoders/html_test.go
@@ -131,6 +131,14 @@ func TestHTML_FromChunk(t *testing.T) {
 			want:  "https://api.stripe.com/v1/charges?key=sk_live_123\nPay",
 		},
 
+		{
+			// '+' is a literal character in attribute values (not a space).
+			// PathUnescape preserves it while still decoding %XX sequences.
+			name:  "plus sign preserved in attribute value",
+			chunk: &sources.Chunk{Data: []byte(`<input type="hidden" value="sk_test_abc+def/123"/>`)},
+			want:  "sk_test_abc+def/123",
+		},
+
 		// --- Script / style / comment content ---
 		{
 			// Inline <script> blocks frequently contain API keys, tokens,

From 38a176316bf444155f0aeff7b6780ac64afe3d55 Mon Sep 17 00:00:00 2001
From: Your Name <drew.lafiandra@trufflesec.com>
Date: Thu, 2 Apr 2026 14:43:52 -0700
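Subject: [PATCH 3/5] Update syntaxHighlightPrefixes comment to guide future
 additions

Also realign the highSignalAttrs map entries so the block is gofmt-clean
after the "xlink:href" entry was removed.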

---
 pkg/decoders/html.go | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/pkg/decoders/html.go b/pkg/decoders/html.go
index 016bacb0f87d..cda734939138 100644
--- a/pkg/decoders/html.go
+++ b/pkg/decoders/html.go
@@ -31,18 +31,19 @@ var htmlTagPattern = regexp.MustCompile(`<[a-zA-Z][a-zA-Z0-9]*[\s>/]`)
 // highSignalAttrs are attribute names whose values are extracted into the
 // decoded output because they commonly contain URLs, tokens, or other secrets.
 var highSignalAttrs = map[string]bool{
-	"href":       true,
-	"src":        true,
-	"action":     true,
-	"value":      true,
-	"content":    true,
-	"alt":   true,
-	"title": true,
+	"href":    true,
+	"src":     true,
+	"action":  true,
+	"value":   true,
+	"content": true,
+	"alt":     true,
+	"title":   true,
 }
 
-// syntaxHighlightPrefixes lists CSS class prefixes used by syntax highlighting
-// libraries. Elements with these classes mark logical line boundaries in code
-// blocks where the platform (e.g. Teams) strips actual newlines.
+// syntaxHighlightPrefixes lists CSS class prefixes that mark logical line
+// boundaries in code blocks. MS Teams uses highlight.js (hljs-* classes) to
+// render code, stripping the original newlines in the process. Add prefixes
+// here if other sources use different syntax highlighting libraries.
 var syntaxHighlightPrefixes = []string{"hljs-"}
 
 // residualEntityReplacer decodes common HTML entities that survive double-encoding.

From 2a2997c519b2861581f938465905364d607fa93e Mon Sep 17 00:00:00 2001
From: Your Name <drew.lafiandra@trufflesec.com>
Date: Fri, 3 Apr 2026 06:51:59 -0700
Subject: [PATCH 4/5] Remove Enabled func from HTML struct to follow normal
 flag conventions

---
 pkg/decoders/html.go      | 32 ++++++++++++++------------------
 pkg/decoders/html_test.go | 19 +++++++++++++------
 pkg/feature/feature.go    |  1 +
 3 files changed, 28 insertions(+), 24 deletions(-)

diff --git a/pkg/decoders/html.go b/pkg/decoders/html.go
index cda734939138..424f894a7217 100644
--- a/pkg/decoders/html.go
+++ b/pkg/decoders/html.go
@@ -8,6 +8,7 @@ import (
 
 	"golang.org/x/net/html"
 
+	"github.com/trufflesecurity/trufflehog/v3/pkg/feature"
 	"github.com/trufflesecurity/trufflehog/v3/pkg/pb/detectorspb"
 	"github.com/trufflesecurity/trufflehog/v3/pkg/sources"
 )
@@ -15,12 +16,8 @@ import (
 // HTML is a decoder that extracts textual content from HTML documents.
 // It produces a normalized view containing visible text, attribute values,
 // script/style content, and HTML comments with entities and URL-encoding decoded.
-type HTML struct {
-	// Enabled controls whether the decoder is active. When nil, the decoder
-	// is always active. Inject a function that checks a feature flag to
-	// allow dynamic toggling without restarting the scanner.
-	Enabled func() bool
-}
+// Gated at runtime by feature.HTMLDecoderEnabled; FromChunk returns nil when it is false.
+type HTML struct{}
 
 func (d *HTML) Type() detectorspb.DecoderType {
 	return detectorspb.DecoderType_HTML
@@ -31,19 +28,18 @@ var htmlTagPattern = regexp.MustCompile(`<[a-zA-Z][a-zA-Z0-9]*[\s>/]`)
 // highSignalAttrs are attribute names whose values are extracted into the
 // decoded output because they commonly contain URLs, tokens, or other secrets.
 var highSignalAttrs = map[string]bool{
-	"href":    true,
-	"src":     true,
-	"action":  true,
-	"value":   true,
-	"content": true,
-	"alt":     true,
-	"title":   true,
+	"href":       true,
+	"src":        true,
+	"action":     true,
+	"value":      true,
+	"content":    true,
+	"alt":   true,
+	"title": true,
 }
 
-// syntaxHighlightPrefixes lists CSS class prefixes that mark logical line
-// boundaries in code blocks. MS Teams uses highlight.js (hljs-* classes) to
-// render code, stripping the original newlines in the process. Add prefixes
-// here if other sources use different syntax highlighting libraries.
+// syntaxHighlightPrefixes lists CSS class prefixes used by syntax highlighting
+// libraries. Elements with these classes mark logical line boundaries in code
+// blocks where the platform (e.g. Teams) strips actual newlines.
 var syntaxHighlightPrefixes = []string{"hljs-"}
 
 // residualEntityReplacer decodes common HTML entities that survive double-encoding.
@@ -85,7 +81,7 @@ var blockElements = map[string]bool{
 }
 
 func (d *HTML) FromChunk(chunk *sources.Chunk) *DecodableChunk {
-	if d.Enabled != nil && !d.Enabled() {
+	if !feature.HTMLDecoderEnabled.Load() {
 		return nil
 	}
 	if chunk == nil || len(chunk.Data) == 0 {
diff --git a/pkg/decoders/html_test.go b/pkg/decoders/html_test.go
index 6a8205555520..e6a97f576ca4 100644
--- a/pkg/decoders/html_test.go
+++ b/pkg/decoders/html_test.go
@@ -3,6 +3,7 @@ package decoders
 import (
 	"testing"
 
+	"github.com/trufflesecurity/trufflehog/v3/pkg/feature"
 	"github.com/trufflesecurity/trufflehog/v3/pkg/pb/detectorspb"
 	"github.com/trufflesecurity/trufflehog/v3/pkg/sources"
 )
@@ -363,6 +364,9 @@ func TestHTML_FromChunk(t *testing.T) {
 
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
+			feature.HTMLDecoderEnabled.Store(true)
+			defer feature.HTMLDecoderEnabled.Store(false)
+
 			d := &HTML{}
 			got := d.FromChunk(tt.chunk)
 
@@ -386,11 +390,11 @@ func TestHTML_FromChunk(t *testing.T) {
 	}
 }
 
-// TestHTML_FeatureFlagDisabled verifies that the decoder is a no-op when the
-// Enabled callback returns false, allowing the HTMLDecoder feature flag to
-// gate the decoder at runtime without removing it from the pipeline.
+// TestHTML_FeatureFlagDisabled verifies that the decoder is a no-op when
+// feature.HTMLDecoderEnabled is false.
 func TestHTML_FeatureFlagDisabled(t *testing.T) {
-	d := &HTML{Enabled: func() bool { return false }}
+	feature.HTMLDecoderEnabled.Store(false)
+	d := &HTML{}
 	chunk := &sources.Chunk{Data: []byte(`<p>secret: hunter2</p>`)}
 	if got := d.FromChunk(chunk); got != nil {
 		t.Errorf("FromChunk() should return nil when disabled, got %q", string(got.Chunk.Data))
@@ -398,9 +402,12 @@ func TestHTML_FeatureFlagDisabled(t *testing.T) {
 }
 
 // TestHTML_FeatureFlagEnabled verifies that the decoder processes HTML normally
-// when the Enabled callback returns true.
+// when feature.HTMLDecoderEnabled is true.
 func TestHTML_FeatureFlagEnabled(t *testing.T) {
-	d := &HTML{Enabled: func() bool { return true }}
+	feature.HTMLDecoderEnabled.Store(true)
+	defer feature.HTMLDecoderEnabled.Store(false)
+
+	d := &HTML{}
 	chunk := &sources.Chunk{Data: []byte(`<p>secret: hunter2</p>`)}
 	got := d.FromChunk(chunk)
 	if got == nil {
diff --git a/pkg/feature/feature.go b/pkg/feature/feature.go
index 080788c0218c..3aa92a3759a6 100644
--- a/pkg/feature/feature.go
+++ b/pkg/feature/feature.go
@@ -15,6 +15,7 @@ var (
 	UseGitMirror                   atomic.Bool
 	GitlabProjectsPerPage          atomic.Int64
 	UseGithubGraphQLAPI            atomic.Bool // use github graphql api to fetch issues, pr's and comments
+	HTMLDecoderEnabled             atomic.Bool
 )
 
 type AtomicString struct {

From c0d437a36102ec61ce4f684dc4fdd03f7aa74cc4 Mon Sep 17 00:00:00 2001
From: Your Name <drew.lafiandra@trufflesec.com>
Date: Mon, 6 Apr 2026 10:23:51 -0700
Subject: [PATCH 5/5] Fix script/style boundary, redundant br check, and
 raw-text entity corruption

- Add script/style to blockElements so they get newline boundaries
  instead of concatenating with adjacent inline text.
- Remove redundant `|| n.Data == "br"` since br is already in blockElements.
- Move residual entity decoding into walkNode per text node, skipping
  it for script/style raw-text content where the HTML parser does not
  decode entities.

Made-with: Cursor
---
 pkg/decoders/html.go      | 34 +++++++++++++++++++---------------
 pkg/decoders/html_test.go | 26 ++++++++++++++++++++++++++
 2 files changed, 45 insertions(+), 15 deletions(-)

diff --git a/pkg/decoders/html.go b/pkg/decoders/html.go
index 424f894a7217..93e7366bd1c9 100644
--- a/pkg/decoders/html.go
+++ b/pkg/decoders/html.go
@@ -78,6 +78,16 @@ var blockElements = map[string]bool{
 	"details": true, "summary": true, "main": true, "nav": true, "aside": true,
 	"form": true, "fieldset": true, "legend": true,
 	"dd": true, "dt": true, "dl": true,
+	"script": true, "style": true,
+}
+
+// rawTextElements are elements whose content the HTML parser treats as raw
+// text (entities are NOT decoded). Residual entity decoding must be skipped
+// for text nodes inside these elements to avoid corrupting literal sequences
+// like &amp; in JavaScript or CSS.
+var rawTextElements = map[string]bool{
+	"script": true,
+	"style":  true,
 }
 
 func (d *HTML) FromChunk(chunk *sources.Chunk) *DecodableChunk {
@@ -118,18 +128,20 @@ func extractHTML(data []byte) []byte {
 	var buf bytes.Buffer
 	buf.Grow(len(data))
 
-	walkNode(&buf, doc)
+	walkNode(&buf, doc, false)
 
 	result := stripInvisible(buf.Bytes())
-	result = decodeResidualEntities(result)
 	return normalizeWhitespace(result)
 }
 
-func walkNode(buf *bytes.Buffer, n *html.Node) {
+func walkNode(buf *bytes.Buffer, n *html.Node, inRawText bool) {
 	switch n.Type {
 	case html.TextNode:
 		text := n.Data
 		if text != "" {
+			if !inRawText {
+				text = residualEntityReplacer.Replace(text)
+			}
 			buf.WriteString(text)
 		}
 
@@ -151,17 +163,18 @@ func walkNode(buf *bytes.Buffer, n *html.Node) {
 
 		emitAttributes(buf, n)
 
+		childRaw := inRawText || rawTextElements[n.Data]
 		for c := n.FirstChild; c != nil; c = c.NextSibling {
-			walkNode(buf, c)
+			walkNode(buf, c, childRaw)
 		}
 
-		if isBlock || n.Data == "br" {
+		if isBlock {
 			ensureNewline(buf)
 		}
 
 	default:
 		for c := n.FirstChild; c != nil; c = c.NextSibling {
-			walkNode(buf, c)
+			walkNode(buf, c, inRawText)
 		}
 	}
 }
@@ -215,15 +228,6 @@ func stripInvisible(data []byte) []byte {
 	return []byte(invisibleReplacer.Replace(string(data)))
 }
 
-func decodeResidualEntities(data []byte) []byte {
-	s := string(data)
-	decoded := residualEntityReplacer.Replace(s)
-	if decoded == s {
-		return data
-	}
-	return []byte(decoded)
-}
-
 // normalizeWhitespace collapses runs of blank lines and trims leading/trailing whitespace.
 func normalizeWhitespace(data []byte) []byte {
 	lines := bytes.Split(data, []byte("\n"))
diff --git a/pkg/decoders/html_test.go b/pkg/decoders/html_test.go
index e6a97f576ca4..403ebdb2f93a 100644
--- a/pkg/decoders/html_test.go
+++ b/pkg/decoders/html_test.go
@@ -154,6 +154,32 @@ func TestHTML_FromChunk(t *testing.T) {
 			chunk: &sources.Chunk{Data: []byte(`<p>text</p><style>body { background: url("https://cdn.com?key=secret"); }</style>`)},
 			want:  "text\nbody { background: url(\"https://cdn.com?key=secret\"); }",
 		},
+		{
+			// Script following an inline element must NOT concatenate with
+			// the preceding text; it needs its own newline boundary.
+			name:  "script adjacent to inline text gets boundary",
+			chunk: &sources.Chunk{Data: []byte(`<span>text</span><script>var key="secret";</script>`)},
+			want:  "text\nvar key=\"secret\";",
+		},
+		{
+			// Style following an inline element must NOT concatenate.
+			name:  "style adjacent to inline text gets boundary",
+			chunk: &sources.Chunk{Data: []byte(`<span>text</span><style>.x { color: red; }</style>`)},
+			want:  "text\n.x { color: red; }",
+		},
+		{
+			// Entity-like sequences in script content are raw text and must
+			// NOT be decoded by the residual entity replacer.
+			name:  "entities in script preserved as raw text",
+			chunk: &sources.Chunk{Data: []byte(`<script>var url = "a=1&amp;b=2";</script>`)},
+			want:  `var url = "a=1&amp;b=2";`,
+		},
+		{
+			// Entity-like sequences in style content are raw text.
+			name:  "entities in style preserved as raw text",
+			chunk: &sources.Chunk{Data: []byte(`<style>body::after { content: "&amp;copy"; }</style>`)},
+			want:  `body::after { content: "&amp;copy"; }`,
+		},
 		{
 			// HTML comments are a common place for debug credentials and
 			// TODO notes with hardcoded passwords.