-
Notifications
You must be signed in to change notification settings - Fork 2.3k
Add HTML decoder for secret detection in HTML-formatted sources #4840
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 4 commits
cd28c03
86a9a9f
e433092
779dcef
38a1763
2a2997c
c0d437a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -12,6 +12,7 @@ func DefaultDecoders() []Decoder { | |
| &Base64{}, | ||
| &UTF16{}, | ||
| &EscapedUnicode{}, | ||
| &HTML{}, | ||
| } | ||
| } | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,250 @@ | ||
| package decoders | ||
|
|
||
| import ( | ||
| "bytes" | ||
| "net/url" | ||
| "regexp" | ||
| "strings" | ||
|
|
||
| "golang.org/x/net/html" | ||
|
|
||
| "github.com/trufflesecurity/trufflehog/v3/pkg/pb/detectorspb" | ||
| "github.com/trufflesecurity/trufflehog/v3/pkg/sources" | ||
| ) | ||
|
|
||
| // HTML is a decoder that extracts textual content from HTML documents. | ||
| // It produces a normalized view containing visible text, selected attribute | ||
| // values, script/style content, and HTML comments, with entities and | ||
| // URL-encoding decoded. | ||
| type HTML struct { | ||
| // Enabled controls whether the decoder is active. When nil, the decoder | ||
| // is always active. FromChunk consults it on every call, so injecting a | ||
| // function that checks a feature flag allows dynamic toggling without | ||
| // restarting the scanner. | ||
| Enabled func() bool | ||
| } | ||
|
|
||
| // Type returns the decoder type identifier for the HTML decoder, | ||
| // detectorspb.DecoderType_HTML. | ||
| func (d *HTML) Type() detectorspb.DecoderType { | ||
| return detectorspb.DecoderType_HTML | ||
| } | ||
|
|
||
| // htmlTagPattern matches the start of an opening tag: '<', an ASCII letter, | ||
| // any further ASCII letters or digits, then whitespace, '>' or '/'. | ||
| // looksLikeHTML uses it as a cheap pre-check before the data is parsed. | ||
| var htmlTagPattern = regexp.MustCompile(`<[a-zA-Z][a-zA-Z0-9]*[\s>/]`) | ||
|
|
||
// highSignalAttrs is the set of attribute names whose values are copied into
// the decoded output, because such attributes commonly carry URLs, tokens, or
// other secrets. Attributes prefixed with "data-" are handled separately by
// emitAttributes and do not need to be listed here.
var highSignalAttrs = map[string]bool{
	"action":  true,
	"alt":     true,
	"content": true,
	"href":    true,
	"src":     true,
	"title":   true,
	"value":   true,
}
|
|
||
// syntaxHighlightPrefixes holds the CSS class prefixes emitted by syntax
// highlighting libraries. An element carrying a class with one of these
// prefixes is treated as a logical line boundary inside a code block, which
// matters on platforms (e.g. Teams) that strip the actual newlines.
var syntaxHighlightPrefixes = []string{
	"hljs-", // highlight.js
}
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do you think it would be helpful to note that
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I agree, I edited the comment to call out the Teams use case and guide future non-Teams additions. |
||
|
|
||
// residualEntityReplacer decodes common HTML entities that survive double-encoding.
// When content is entity-encoded twice (e.g. "&amp;lt;" standing for "<"), the
// HTML parser's decoding pass removes only one layer and leaves a residual
// entity ("&lt;") in the text; this replacer turns those residues into the
// characters they represent.
//
// Each pattern must be the spelled-out entity, never the bare character it
// decodes to, otherwise the pair is a no-op. strings.Replacer works in a single
// pass and does not rescan its own output, so exactly one extra layer of
// encoding is removed per call.
var residualEntityReplacer = strings.NewReplacer(
	"&amp;", "&",
	"&lt;", "<",
	"&gt;", ">",
	"&quot;", `"`,
	"&#39;", "'",
	"&apos;", "'",
)
|
|
||
// invisibleReplacer deletes zero-width and otherwise invisible Unicode code
// points. Rich text editors may insert these between characters, which splits
// a secret so that detector regexes no longer match. Entries are listed in
// code point order.
var invisibleReplacer = strings.NewReplacer(
	"\u00AD", "", // soft hyphen
	"\u200B", "", // zero-width space
	"\u200C", "", // zero-width non-joiner
	"\u200D", "", // zero-width joiner
	"\u200E", "", // left-to-right mark
	"\u200F", "", // right-to-left mark
	"\u2060", "", // word joiner
	"\uFEFF", "", // byte order mark / zero-width no-break space
)
|
|
||
// blockElements is the set of element names that introduce a newline boundary
// in the extracted text: walkNode ensures a newline both before the element's
// content and after it.
var blockElements = map[string]bool{
	// Headings.
	"h1": true, "h2": true, "h3": true, "h4": true, "h5": true, "h6": true,
	// Paragraphs, generic containers and explicit breaks.
	"p": true, "div": true, "pre": true, "blockquote": true, "address": true,
	"br": true, "hr": true,
	// Lists.
	"ul": true, "ol": true, "li": true, "dl": true, "dt": true, "dd": true,
	// Tables.
	"table": true, "thead": true, "tbody": true, "tfoot": true,
	"tr": true, "th": true, "td": true,
	// Sectioning and landmark elements.
	"section": true, "article": true, "header": true, "footer": true,
	"main": true, "nav": true, "aside": true,
	// Figures and disclosure widgets.
	"figure": true, "figcaption": true, "details": true, "summary": true,
	// Forms.
	"form": true, "fieldset": true, "legend": true,
}
cursor[bot] marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| func (d *HTML) FromChunk(chunk *sources.Chunk) *DecodableChunk { | ||
| if d.Enabled != nil && !d.Enabled() { | ||
| return nil | ||
| } | ||
| if chunk == nil || len(chunk.Data) == 0 { | ||
| return nil | ||
| } | ||
|
|
||
| if !looksLikeHTML(chunk.Data) { | ||
| return nil | ||
| } | ||
|
|
||
| extracted := extractHTML(chunk.Data) | ||
| if len(extracted) == 0 { | ||
| return nil | ||
| } | ||
|
|
||
| if bytes.Equal(chunk.Data, extracted) { | ||
| return nil | ||
| } | ||
|
|
||
| chunk.Data = extracted | ||
| return &DecodableChunk{Chunk: chunk, DecoderType: d.Type()} | ||
| } | ||
|
|
||
| // looksLikeHTML reports whether data contains at least one match of | ||
| // htmlTagPattern, i.e. something shaped like the start of an opening tag. | ||
| // FromChunk uses it to skip parsing data that has no tag-like content. | ||
| func looksLikeHTML(data []byte) bool { | ||
| return htmlTagPattern.Match(data) | ||
| } | ||
|
|
||
| func extractHTML(data []byte) []byte { | ||
| doc, err := html.Parse(bytes.NewReader(data)) | ||
| if err != nil { | ||
| return nil | ||
| } | ||
|
|
||
| var buf bytes.Buffer | ||
| buf.Grow(len(data)) | ||
|
|
||
| walkNode(&buf, doc) | ||
|
|
||
| result := stripInvisible(buf.Bytes()) | ||
| result = decodeResidualEntities(result) | ||
| return normalizeWhitespace(result) | ||
cursor[bot] marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| } | ||
|
|
||
| func walkNode(buf *bytes.Buffer, n *html.Node) { | ||
| switch n.Type { | ||
| case html.TextNode: | ||
| text := n.Data | ||
| if text != "" { | ||
| buf.WriteString(text) | ||
| } | ||
|
|
||
| case html.CommentNode: | ||
| if content := strings.TrimSpace(n.Data); content != "" { | ||
| ensureNewline(buf) | ||
| buf.WriteString(content) | ||
| ensureNewline(buf) | ||
| } | ||
|
|
||
| case html.ElementNode: | ||
| isBlock := blockElements[n.Data] | ||
|
|
||
| if isBlock { | ||
| ensureNewline(buf) | ||
| } else if hasSyntaxHighlightClass(n) { | ||
| ensureNewline(buf) | ||
| } | ||
|
|
||
| emitAttributes(buf, n) | ||
|
|
||
| for c := n.FirstChild; c != nil; c = c.NextSibling { | ||
| walkNode(buf, c) | ||
| } | ||
|
|
||
| if isBlock || n.Data == "br" { | ||
| ensureNewline(buf) | ||
| } | ||
cursor[bot] marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| default: | ||
| for c := n.FirstChild; c != nil; c = c.NextSibling { | ||
| walkNode(buf, c) | ||
| } | ||
| } | ||
| } | ||
|
|
||
| func hasSyntaxHighlightClass(n *html.Node) bool { | ||
| for _, attr := range n.Attr { | ||
| if attr.Key != "class" { | ||
| continue | ||
| } | ||
| for _, cls := range strings.Fields(attr.Val) { | ||
| for _, prefix := range syntaxHighlightPrefixes { | ||
| if strings.HasPrefix(cls, prefix) { | ||
| return true | ||
| } | ||
| } | ||
| } | ||
| } | ||
| return false | ||
| } | ||
|
|
||
| func emitAttributes(buf *bytes.Buffer, n *html.Node) { | ||
| for _, attr := range n.Attr { | ||
| isDataAttr := strings.HasPrefix(attr.Key, "data-") | ||
| if !highSignalAttrs[attr.Key] && !isDataAttr { | ||
| continue | ||
| } | ||
| val := strings.TrimSpace(attr.Val) | ||
| if val == "" || val == "#" { | ||
| continue | ||
| } | ||
| decoded, err := url.PathUnescape(val) | ||
| if err == nil && decoded != val { | ||
| val = decoded | ||
| } | ||
cursor[bot] marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| ensureNewline(buf) | ||
| buf.WriteString(val) | ||
| ensureNewline(buf) | ||
| } | ||
| } | ||
|
|
||
// ensureNewline appends '\n' to buf unless buf is empty or already ends with
// one, so repeated calls never produce blank lines or a leading newline.
func ensureNewline(buf *bytes.Buffer) {
	if pending := buf.Bytes(); len(pending) > 0 && pending[len(pending)-1] != '\n' {
		buf.WriteByte('\n')
	}
}
|
|
||
| func stripInvisible(data []byte) []byte { | ||
| return []byte(invisibleReplacer.Replace(string(data))) | ||
| } | ||
|
|
||
| func decodeResidualEntities(data []byte) []byte { | ||
| s := string(data) | ||
| decoded := residualEntityReplacer.Replace(s) | ||
| if decoded == s { | ||
| return data | ||
| } | ||
| return []byte(decoded) | ||
| } | ||
|
|
||
// normalizeWhitespace tidies extracted text line by line: every line is
// trimmed of surrounding whitespace, each run of blank lines between two
// non-blank lines collapses to a single blank line, and blank lines at the
// start and end of the input are dropped.
func normalizeWhitespace(data []byte) []byte {
	var (
		kept       [][]byte
		pendingGap bool // a blank line was seen since the last kept line
	)
	for _, raw := range bytes.Split(data, []byte("\n")) {
		line := bytes.TrimSpace(raw)
		if len(line) == 0 {
			// Blank lines before the first kept line never produce a gap.
			pendingGap = len(kept) > 0
			continue
		}
		if pendingGap {
			kept = append(kept, nil)
		}
		kept = append(kept, line)
		pendingGap = false
	}
	return bytes.Join(kept, []byte("\n"))
}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is `Enabled` really needed if nothing by default uses this HTML package?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think the reason to include `Enabled` is so we have a kill switch for EE (my understanding is that rollout will be gradual), but OSS will be able to use it out of the box.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes but for EE blocking, you should just need to deal with the feature flag in thog. If the intention for OSS is to not have a flag at all.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ah okay I hear you. I think the tradeoff is that checking the flag in `pipeline.go` only evaluates at startup, so after toggling in ConfigCat, we would require a scanner restart for it to take effect (which may be expected behavior from customers). That would be fine for the initial release, but the `Enabled` callback would let us toggle per-customer via ConfigCat at runtime without restarts. I imagine something similar to this has been debated before, so it is very possible my position is known to be no good.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Confirming our offline chat, I agree with your suggestion to move this to thog.
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The ConfigCat flags should be checked every 10 seconds here.