Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pkg/decoders/decoders.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ func DefaultDecoders() []Decoder {
&Base64{},
&UTF16{},
&EscapedUnicode{},
&HTML{},
}
}

Expand Down
251 changes: 251 additions & 0 deletions pkg/decoders/html.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,251 @@
package decoders

import (
"bytes"
"net/url"
"regexp"
"strings"

"golang.org/x/net/html"

"github.com/trufflesecurity/trufflehog/v3/pkg/pb/detectorspb"
"github.com/trufflesecurity/trufflehog/v3/pkg/sources"
)

// HTML is a decoder that extracts textual content from HTML documents.
// It produces a normalized view containing visible text, attribute values,
// script/style content, and HTML comments with entities and URL-encoding decoded.
type HTML struct {
// Enabled controls whether the decoder is active. When nil, the decoder
// is always active. Inject a function that checks a feature flag to
// allow dynamic toggling without restarting the scanner.
Enabled func() bool
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is Enabled really needed if nothing by default uses this HTML package?

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the reason to include Enabled is so we have a kill switch for EE (my understanding is that rollout will be gradual), but OSS will be able to use it out of the box.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, but for EE blocking you should only need to deal with the feature flag in thog, if the intention for OSS is to not have a flag at all.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah okay I hear you. I think the tradeoff is that checking the flag in pipeline.go only evaluates at startup, so after toggling in configcat, we would require a scanner restart for it to take effect (which may be expected behavior from customers). That would be fine for the initial release, but the Enabled callback would let us toggle per-customer via ConfigCat at runtime without restarts. I imagine something similar to this has been debated before, so it is very possible my position is known to be no good.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Confirming our offline chat, I agree with your suggestion to move this to thog.

Copy link
Copy Markdown
Contributor

@casey-tran casey-tran Apr 2, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The ConfigCat flags should be checked every 10 seconds here.

}

// Type returns the decoder type constant that identifies this decoder,
// detectorspb.DecoderType_HTML.
func (d *HTML) Type() detectorspb.DecoderType {
	return detectorspb.DecoderType_HTML
}

// htmlTagPattern matches the start of an opening HTML tag: "<", one ASCII
// letter, any further ASCII letters or digits, then whitespace, ">" or "/".
// Closing tags ("</p>") and comments ("<!--") do not match on their own.
// looksLikeHTML uses it to decide whether a chunk is parsed at all.
var htmlTagPattern = regexp.MustCompile(`<[a-zA-Z][a-zA-Z0-9]*[\s>/]`)

// highSignalAttrs are attribute names whose values are extracted into the
// decoded output because they commonly contain URLs, tokens, or other secrets.
// Lookups are by exact attribute key. Attributes whose key starts with
// "data-" are extracted as well, but that prefix check lives in
// emitAttributes rather than in this table.
var highSignalAttrs = map[string]bool{
"href": true,
"src": true,
"action": true,
"value": true,
"content": true,
"alt": true,
"title": true,
"xlink:href": true,
}

// syntaxHighlightPrefixes lists CSS class prefixes used by syntax highlighting
// libraries. Elements with these classes mark logical line boundaries in code
// blocks where the platform (e.g. Teams) strips actual newlines.
//
// "hljs-" covers the Microsoft Teams case. Append further prefixes here as
// other sources turn out to need them; hasSyntaxHighlightClass matches each
// whitespace-separated class name against every entry with strings.HasPrefix.
var syntaxHighlightPrefixes = []string{"hljs-"}
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you think it would be helpful to note that hljs- is a MS Teams specific use case? In case people need to append more to the slice over time as sources change.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree, I edited the comment to call out the Teams use case and guide future non-Teams additions.


// residualEntityReplacer decodes common HTML entities that survive double-encoding.
// When content is entity-encoded twice (e.g. &amp;amp;), the parser's first pass
// leaves residual entity sequences that this replacer cleans up.
//
// strings.Replacer performs one non-overlapping pass and does not rescan its
// own output, so each call removes exactly one further level of encoding:
// "&amp;lt;" becomes "&lt;", not "<".
var residualEntityReplacer = strings.NewReplacer(
"&amp;", "&",
"&lt;", "<",
"&gt;", ">",
"&quot;", `"`,
"&#39;", "'",
"&apos;", "'",
)

// invisibleReplacer strips zero-width and invisible Unicode codepoints that
// rich text editors may insert between characters, breaking detector regexes.
// Every listed codepoint is replaced with the empty string; stripInvisible
// applies it to the extracted text.
var invisibleReplacer = strings.NewReplacer(
"\u200B", "", // zero-width space
"\u200C", "", // zero-width non-joiner
"\u200D", "", // zero-width joiner
"\uFEFF", "", // byte order mark / zero-width no-break space
"\u00AD", "", // soft hyphen
"\u2060", "", // word joiner
"\u200E", "", // left-to-right mark
"\u200F", "", // right-to-left mark
)

// blockElements is the set of tag names treated as block-level during
// extraction: walkNode makes sure the output is at a line boundary both
// before such an element's content and after it.
var blockElements = map[string]bool{
	// Text containers and explicit breaks.
	"p": true, "div": true, "pre": true, "blockquote": true, "address": true,
	"br": true, "hr": true,

	// Headings.
	"h1": true, "h2": true, "h3": true, "h4": true, "h5": true, "h6": true,

	// Lists.
	"ul": true, "ol": true, "li": true,
	"dl": true, "dt": true, "dd": true,

	// Tables.
	"table": true, "thead": true, "tbody": true, "tfoot": true,
	"tr": true, "th": true, "td": true,

	// Sectioning content.
	"section": true, "article": true, "header": true, "footer": true,
	"main": true, "nav": true, "aside": true,

	// Figures and disclosure widgets.
	"figure": true, "figcaption": true, "details": true, "summary": true,

	// Forms.
	"form": true, "fieldset": true, "legend": true,
}

// FromChunk extracts the textual content of a chunk that contains HTML.
//
// It returns nil when there is nothing to decode: the decoder is switched off
// through Enabled, the chunk is nil or empty, the data contains no HTML-like
// tag, or extraction yields no text or text identical to the input.
// Otherwise chunk.Data is overwritten in place with the extracted text and
// that same chunk is returned wrapped in a DecodableChunk.
func (d *HTML) FromChunk(chunk *sources.Chunk) *DecodableChunk {
	switch {
	case d.Enabled != nil && !d.Enabled():
		return nil
	case chunk == nil || len(chunk.Data) == 0:
		return nil
	case !looksLikeHTML(chunk.Data):
		return nil
	}

	text := extractHTML(chunk.Data)
	if len(text) == 0 || bytes.Equal(chunk.Data, text) {
		return nil
	}

	chunk.Data = text
	return &DecodableChunk{Chunk: chunk, DecoderType: d.Type()}
}

// looksLikeHTML reports whether data contains at least one match of
// htmlTagPattern, i.e. something shaped like the start of an opening tag.
func looksLikeHTML(data []byte) bool {
	hasTag := htmlTagPattern.Match(data)
	return hasTag
}

// extractHTML parses data as an HTML document and returns the text collected
// by walkNode after post-processing. It returns nil if the parser reports an
// error.
//
// Post-processing runs in a fixed order: invisible codepoints are stripped,
// residual entities are decoded, and whitespace is normalized last.
func extractHTML(data []byte) []byte {
	root, err := html.Parse(bytes.NewReader(data))
	if err != nil {
		return nil
	}

	// Start with capacity for the whole input to limit regrowth while walking.
	out := bytes.NewBuffer(make([]byte, 0, len(data)))
	walkNode(out, root)

	cleaned := stripInvisible(out.Bytes())
	cleaned = decodeResidualEntities(cleaned)
	return normalizeWhitespace(cleaned)
}

// walkNode appends the extractable content of n and its descendants to buf,
// depth first.
//
//   - Text nodes are written verbatim.
//   - Comment nodes are trimmed and, when non-empty, written on a line of
//     their own.
//   - Element nodes emit their selected attribute values (see emitAttributes)
//     followed by their children. Block elements are fenced by line
//     boundaries on both sides; elements carrying a syntax-highlight class
//     only get a line boundary in front.
//   - Every other node type just has its children walked.
func walkNode(buf *bytes.Buffer, n *html.Node) {
	if n.Type == html.TextNode {
		buf.WriteString(n.Data)
		return
	}
	if n.Type == html.CommentNode {
		if body := strings.TrimSpace(n.Data); body != "" {
			ensureNewline(buf)
			buf.WriteString(body)
			ensureNewline(buf)
		}
		return
	}

	isElement := n.Type == html.ElementNode
	isBlock := isElement && blockElements[n.Data]

	if isElement {
		if isBlock || hasSyntaxHighlightClass(n) {
			ensureNewline(buf)
		}
		emitAttributes(buf, n)
	}

	for child := n.FirstChild; child != nil; child = child.NextSibling {
		walkNode(buf, child)
	}

	if isElement && (isBlock || n.Data == "br") {
		ensureNewline(buf)
	}
}

// hasSyntaxHighlightClass reports whether any class name in any "class"
// attribute of n starts with one of syntaxHighlightPrefixes. Class names are
// split on whitespace.
func hasSyntaxHighlightClass(n *html.Node) bool {
	for i := range n.Attr {
		if n.Attr[i].Key != "class" {
			continue
		}
		for _, name := range strings.Fields(n.Attr[i].Val) {
			for _, p := range syntaxHighlightPrefixes {
				if strings.HasPrefix(name, p) {
					return true
				}
			}
		}
	}
	return false
}

// emitAttributes writes the values of n's interesting attributes to buf, each
// on a line of its own.
//
// An attribute is emitted when its key is listed in highSignalAttrs or starts
// with "data-". Values are trimmed; empty values and the placeholder "#" are
// skipped. Percent-escapes ("%XX") are decoded when the value is a valid
// escape sequence; otherwise the value is emitted as written.
func emitAttributes(buf *bytes.Buffer, n *html.Node) {
	for _, attr := range n.Attr {
		if !highSignalAttrs[attr.Key] && !strings.HasPrefix(attr.Key, "data-") {
			continue
		}
		val := strings.TrimSpace(attr.Val)
		if val == "" || val == "#" {
			continue
		}
		// Use PathUnescape, not QueryUnescape: the latter also rewrites every
		// '+' to a space, which corrupts base64-style secrets (API keys,
		// tokens) that legitimately contain '+'. PathUnescape decodes only
		// the %XX escapes and leaves '+' alone.
		if decoded, err := url.PathUnescape(val); err == nil {
			val = decoded
		}
		ensureNewline(buf)
		buf.WriteString(val)
		ensureNewline(buf)
	}
}

// ensureNewline appends '\n' to buf unless buf is empty or already ends with
// one, so that whatever is written next starts on a fresh line without
// producing a leading newline.
func ensureNewline(buf *bytes.Buffer) {
	if pending := buf.Bytes(); len(pending) > 0 && pending[len(pending)-1] != '\n' {
		buf.WriteByte('\n')
	}
}

// stripInvisible returns a copy of data with every codepoint listed in
// invisibleReplacer removed.
func stripInvisible(data []byte) []byte {
	cleaned := invisibleReplacer.Replace(string(data))
	return []byte(cleaned)
}

// decodeResidualEntities applies residualEntityReplacer to data once. When
// nothing was replaced the input slice itself is returned; otherwise a new
// slice holding the decoded text is returned.
func decodeResidualEntities(data []byte) []byte {
	before := string(data)
	if after := residualEntityReplacer.Replace(before); after != before {
		return []byte(after)
	}
	return data
}

// normalizeWhitespace trims leading and trailing whitespace from every line,
// drops blank lines at the start and end of the text, and collapses each run
// of blank lines between two non-blank lines into a single blank line.
func normalizeWhitespace(data []byte) []byte {
	var kept [][]byte
	gapPending := false // a blank line was seen since the last kept line
	for _, raw := range bytes.Split(data, []byte("\n")) {
		line := bytes.TrimSpace(raw)
		if len(line) == 0 {
			gapPending = true
			continue
		}
		// Emit the separator lazily so that leading and trailing blank
		// lines never make it into the output.
		if gapPending && len(kept) > 0 {
			kept = append(kept, []byte(""))
		}
		kept = append(kept, line)
		gapPending = false
	}
	return bytes.Join(kept, []byte("\n"))
}
Loading
Loading