trufflesecurity · alafiand · Mar 25, 2026 · Apr 1, 2026 · Apr 1, 2026 · Apr 2, 2026
@@ -12,6 +12,7 @@ func DefaultDecoders() []Decoder {
 		&Base64{},
 		&UTF16{},
 		&EscapedUnicode{},
+		&HTML{},
 	}
 }
 

@@ -0,0 +1,251 @@
+package decoders
+
+import (
+	"bytes"
+	"net/url"
+	"regexp"
+	"strings"
+
+	"golang.org/x/net/html"
+
+	"github.com/trufflesecurity/trufflehog/v3/pkg/pb/detectorspb"
+	"github.com/trufflesecurity/trufflehog/v3/pkg/sources"
+)
+
+// HTML is a decoder that extracts textual content from HTML documents.
+// It produces a normalized, line-oriented view containing text nodes
+// (including script/style content), HTML comments, and the values of selected
+// attributes. Residual double-encoded entities and invisible code points are
+// cleaned from the extracted text, and attribute values are additionally
+// URL-unescaped.
+type HTML struct {
+	// Enabled controls whether the decoder is active. When nil, the decoder
+	// is always active. Inject a function that checks a feature flag to
+	// allow dynamic toggling without restarting the scanner.
+	Enabled func() bool
+}
+
+// Type returns the decoder type that identifies this decoder,
+// detectorspb.DecoderType_HTML.
+func (d *HTML) Type() detectorspb.DecoderType {
+	return detectorspb.DecoderType_HTML
+}
+
+// htmlTagPattern is the pre-filter used by looksLikeHTML. It matches a '<'
+// followed by an ASCII letter, any number of ASCII letters or digits, and then
+// whitespace, '>' or '/'. Closing tags, comments, and tag names containing
+// other characters (such as '-') do not match on their own.
+var htmlTagPattern = regexp.MustCompile(`<[a-zA-Z][a-zA-Z0-9]*[\s>/]`)
+
+// highSignalAttrs are attribute names whose values are extracted into the
+// decoded output because they commonly contain URLs, tokens, or other secrets.
+// Attributes whose name starts with "data-" are extracted as well, but they
+// are matched by prefix in emitAttributes rather than listed here.
+var highSignalAttrs = map[string]bool{
+	"href":       true,
+	"src":        true,
+	"action":     true,
+	"value":      true,
+	"content":    true,
+	"alt":        true,
+	"title":      true,
+	"xlink:href": true,
+}
+
+// syntaxHighlightPrefixes lists CSS class prefixes used by syntax highlighting
+// libraries. Elements with these classes mark logical line boundaries in code
+// blocks where the platform (e.g. Teams) strips actual newlines.
+// walkNode starts a new output line before any non-block element whose class
+// attribute contains a token beginning with one of these prefixes.
+var syntaxHighlightPrefixes = []string{"hljs-"}
+
+// residualEntityReplacer decodes common HTML entities that survive double-encoding.
+// When content is entity-encoded twice (e.g. &amp;amp;), the parser's first pass
+// leaves residual entity sequences that this replacer cleans up.
+// It is applied once to the whole extracted text in extractHTML, so it removes
+// a single extra level of encoding for the entities listed below.
+var residualEntityReplacer = strings.NewReplacer(
+	"&amp;", "&",
+	"&lt;", "<",
+	"&gt;", ">",
+	"&quot;", `"`,
+	"&#39;", "'",
+	"&apos;", "'",
+)
+
+// invisibleReplacer strips zero-width and invisible Unicode codepoints that
+// rich text editors may insert between characters, breaking detector regexes.
+// Each listed code point is replaced with the empty string; it is applied to
+// the extracted text by stripInvisible.
+var invisibleReplacer = strings.NewReplacer(
+	"\u200B", "", // zero-width space
+	"\u200C", "", // zero-width non-joiner
+	"\u200D", "", // zero-width joiner
+	"\uFEFF", "", // byte order mark / zero-width no-break space
+	"\u00AD", "", // soft hyphen
+	"\u2060", "", // word joiner
+	"\u200E", "", // left-to-right mark
+	"\u200F", "", // right-to-left mark
+)
+
+// blockElements insert newline boundaries when encountered during extraction.
+// For an element whose tag name is in this set, walkNode ensures the output is
+// at the start of a line both before the element's attributes and children
+// are written and again after them.
+var blockElements = map[string]bool{
+	"p": true, "div": true, "br": true, "hr": true,
+	"h1": true, "h2": true, "h3": true, "h4": true, "h5": true, "h6": true,
+	"li": true, "ol": true, "ul": true,
+	"tr": true, "td": true, "th": true, "table": true, "thead": true, "tbody": true, "tfoot": true,
+	"blockquote": true, "section": true, "article": true, "header": true, "footer": true,
+	"pre": true, "address": true, "figcaption": true, "figure": true,
+	"details": true, "summary": true, "main": true, "nav": true, "aside": true,
+	"form": true, "fieldset": true, "legend": true,
+	"dd": true, "dt": true, "dl": true,
+}
+
+// FromChunk extracts the textual content of an HTML chunk.
+//
+// It returns nil when the decoder is disabled via Enabled, when chunk is nil or
+// has no data, when the data does not look like HTML, or when extraction yields
+// nothing or yields bytes identical to the input. Otherwise chunk.Data is
+// overwritten with the extracted text and the same chunk is returned wrapped in
+// a DecodableChunk tagged with this decoder's type.
+func (d *HTML) FromChunk(chunk *sources.Chunk) *DecodableChunk {
+	// The feature gate is consulted first, before the chunk is inspected.
+	if gate := d.Enabled; gate != nil && !gate() {
+		return nil
+	}
+	if chunk == nil || len(chunk.Data) == 0 || !looksLikeHTML(chunk.Data) {
+		return nil
+	}
+
+	text := extractHTML(chunk.Data)
+	switch {
+	case len(text) == 0:
+		// Nothing extractable (or the parse failed).
+		return nil
+	case bytes.Equal(chunk.Data, text):
+		// Extraction changed nothing, so there is no new view to scan.
+		return nil
+	}
+
+	chunk.Data = text
+	return &DecodableChunk{Chunk: chunk, DecoderType: d.Type()}
+}
+
+// looksLikeHTML reports whether data contains at least one match of
+// htmlTagPattern, i.e. something shaped like an opening HTML tag.
+func looksLikeHTML(data []byte) bool {
+	return htmlTagPattern.Match(data)
+}
+
+func extractHTML(data []byte) []byte {
+	doc, err := html.Parse(bytes.NewReader(data))
+	if err != nil {
+		return nil
+	}
+
+	var buf bytes.Buffer
+	buf.Grow(len(data))
+
+	walkNode(&buf, doc)
+
+	result := stripInvisible(buf.Bytes())
+	result = decodeResidualEntities(result)
+	return normalizeWhitespace(result)
+}
+
+// walkNode appends the textual content of n and its descendants to buf in
+// document order.
+//
+//   - Text nodes are written verbatim.
+//   - Comment nodes are trimmed and, when non-empty, written on their own line.
+//   - Element nodes emit their extracted attribute values first and then their
+//     children. Block elements are surrounded by line breaks; non-block
+//     elements carrying a syntax-highlighting class only start a new line.
+//   - Any other node type only has its children walked.
+func walkNode(buf *bytes.Buffer, n *html.Node) {
+	// Text and comment nodes are handled here and never descend into children.
+	switch n.Type {
+	case html.TextNode:
+		if n.Data != "" {
+			buf.WriteString(n.Data)
+		}
+		return
+	case html.CommentNode:
+		comment := strings.TrimSpace(n.Data)
+		if comment == "" {
+			return
+		}
+		ensureNewline(buf)
+		buf.WriteString(comment)
+		ensureNewline(buf)
+		return
+	}
+
+	// Only elements contribute line breaks and attribute values.
+	breakAfter := false
+	if n.Type == html.ElementNode {
+		block := blockElements[n.Data]
+		breakAfter = block || n.Data == "br"
+		if block || hasSyntaxHighlightClass(n) {
+			ensureNewline(buf)
+		}
+		emitAttributes(buf, n)
+	}
+
+	for child := n.FirstChild; child != nil; child = child.NextSibling {
+		walkNode(buf, child)
+	}
+
+	if breakAfter {
+		ensureNewline(buf)
+	}
+}
+
+// hasSyntaxHighlightClass reports whether any class attribute of n contains a
+// class name that starts with one of syntaxHighlightPrefixes.
+func hasSyntaxHighlightClass(n *html.Node) bool {
+	for i := range n.Attr {
+		if n.Attr[i].Key == "class" && hasHighlightToken(n.Attr[i].Val) {
+			return true
+		}
+	}
+	return false
+}
+
+// hasHighlightToken reports whether any whitespace-separated token in classes
+// begins with one of syntaxHighlightPrefixes.
+func hasHighlightToken(classes string) bool {
+	for _, token := range strings.Fields(classes) {
+		for _, prefix := range syntaxHighlightPrefixes {
+			if strings.HasPrefix(token, prefix) {
+				return true
+			}
+		}
+	}
+	return false
+}
+
+// emitAttributes writes the values of selected attributes of n to buf, each on
+// its own line.
+//
+// An attribute is selected when its key is listed in highSignalAttrs or starts
+// with "data-". Values are trimmed of surrounding whitespace; empty values and
+// the bare "#" placeholder are skipped.
+//
+// Percent-escapes are decoded with url.PathUnescape rather than
+// url.QueryUnescape: the latter also rewrites every literal '+' as a space,
+// which corrupts base64-style tokens and the contents of non-URL attributes
+// such as value, title or data-*. Values containing malformed escapes are
+// emitted as-is.
+func emitAttributes(buf *bytes.Buffer, n *html.Node) {
+	for _, attr := range n.Attr {
+		if !highSignalAttrs[attr.Key] && !strings.HasPrefix(attr.Key, "data-") {
+			continue
+		}
+		val := strings.TrimSpace(attr.Val)
+		if val == "" || val == "#" {
+			continue
+		}
+		if decoded, err := url.PathUnescape(val); err == nil {
+			val = decoded
+		}
+		ensureNewline(buf)
+		buf.WriteString(val)
+		ensureNewline(buf)
+	}
+}
+
+// ensureNewline appends a '\n' to buf unless buf is empty or already ends with
+// one, so that whatever is written next starts on a fresh line.
+func ensureNewline(buf *bytes.Buffer) {
+	pending := buf.Bytes()
+	if len(pending) > 0 && pending[len(pending)-1] != '\n' {
+		buf.WriteByte('\n')
+	}
+}
+
+// stripInvisible returns a copy of data with every code point handled by
+// invisibleReplacer removed.
+func stripInvisible(data []byte) []byte {
+	visible := invisibleReplacer.Replace(string(data))
+	return []byte(visible)
+}
+
+// decodeResidualEntities runs residualEntityReplacer over data. When nothing
+// was replaced the input slice itself is returned; otherwise a new slice
+// holding the decoded text is returned.
+func decodeResidualEntities(data []byte) []byte {
+	before := string(data)
+	if after := residualEntityReplacer.Replace(before); after != before {
+		return []byte(after)
+	}
+	return data
+}
+
+// normalizeWhitespace tidies line-oriented text: every line is trimmed of
+// leading and trailing whitespace, blank lines at the start and end are
+// dropped, and each run of blank lines between two non-blank lines collapses
+// to a single empty line. The result has no trailing newline.
+func normalizeWhitespace(data []byte) []byte {
+	newline := []byte("\n")
+
+	var kept [][]byte
+	// gap is true once a blank line has been seen since the last kept line.
+	gap := false
+	for _, raw := range bytes.Split(data, newline) {
+		line := bytes.TrimSpace(raw)
+		switch {
+		case len(line) == 0:
+			gap = true
+		case gap && len(kept) > 0:
+			// Represent the whole blank run with one empty separator line.
+			kept = append(kept, []byte(""), line)
+			gap = false
+		default:
+			kept = append(kept, line)
+			gap = false
+		}
+	}
+	return bytes.Join(kept, newline)
+}