-
Notifications
You must be signed in to change notification settings - Fork 2.3k
Add HTML decoder for secret detection in HTML-formatted sources #4840
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
alafiand
wants to merge
7
commits into
main
Choose a base branch
from
dl.276-new-html-decoder
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from 6 commits
Commits
Show all changes
7 commits
Select commit
Hold shift + click to select a range
cd28c03
Add HTML decoder for secret detection in HTML-formatted sources
alafiand 86a9a9f
Merge branch 'main' into dl.276-new-html-decoder
alafiand e433092
Fix dead code and plus-sign corruption in HTML decoder
alafiand 779dcef
Merge branch 'main' into dl.276-new-html-decoder
alafiand 38a1763
updated comment around syntaxHighlightPrefixes to guide future additions
alafiand 2a2997c
removed Enabled func from HTML struct to follow normal flag conventions
alafiand c0d437a
Fix script/style boundary, redundant br check, and raw-text entity co…
alafiand File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -12,6 +12,7 @@ func DefaultDecoders() []Decoder { | |
| &Base64{}, | ||
| &UTF16{}, | ||
| &EscapedUnicode{}, | ||
| &HTML{}, | ||
| } | ||
| } | ||
|
|
||
|
|
||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,247 @@ | ||
| package decoders | ||
|
|
||
| import ( | ||
| "bytes" | ||
| "net/url" | ||
| "regexp" | ||
| "strings" | ||
|
|
||
| "golang.org/x/net/html" | ||
|
|
||
| "github.com/trufflesecurity/trufflehog/v3/pkg/feature" | ||
| "github.com/trufflesecurity/trufflehog/v3/pkg/pb/detectorspb" | ||
| "github.com/trufflesecurity/trufflehog/v3/pkg/sources" | ||
| ) | ||
|
|
||
| // HTML is a decoder that extracts textual content from HTML documents. | ||
| // It produces a normalized view containing visible text, attribute values, | ||
| // script/style content, and HTML comments with entities and URL-encoding decoded. | ||
| // Gated at runtime by feature.HTMLDecoderEnabled. | ||
| type HTML struct{} | ||
|
|
||
| func (d *HTML) Type() detectorspb.DecoderType { | ||
| return detectorspb.DecoderType_HTML | ||
| } | ||
|
|
||
| var htmlTagPattern = regexp.MustCompile(`<[a-zA-Z][a-zA-Z0-9]*[\s>/]`) | ||
|
|
||
| // highSignalAttrs are attribute names whose values are extracted into the | ||
| // decoded output because they commonly contain URLs, tokens, or other secrets. | ||
| var highSignalAttrs = map[string]bool{ | ||
| "href": true, | ||
| "src": true, | ||
| "action": true, | ||
| "value": true, | ||
| "content": true, | ||
| "alt": true, | ||
| "title": true, | ||
| } | ||
|
|
||
| // syntaxHighlightPrefixes lists CSS class prefixes used by syntax highlighting | ||
| // libraries. Elements with these classes mark logical line boundaries in code | ||
| // blocks where the platform (e.g. Teams) strips actual newlines. | ||
| var syntaxHighlightPrefixes = []string{"hljs-"} | ||
|
|
||
// residualEntityReplacer decodes common HTML entities that survive double-encoding.
// When content is entity-encoded twice (e.g. "&amp;amp;"), the HTML parser's
// decoding pass leaves a residual entity sequence ("&amp;") in the text, which
// this replacer cleans up.
//
// strings.Replacer scans its input once and never re-examines text it has
// already produced, so exactly one extra level of encoding is removed:
// "&amp;lt;" becomes "&lt;", not "<".
var residualEntityReplacer = strings.NewReplacer(
	"&amp;", "&",
	"&lt;", "<",
	"&gt;", ">",
	"&quot;", `"`,
	"&#39;", "'",
	"&apos;", "'",
)
|
|
||
| // invisibleReplacer strips zero-width and invisible Unicode codepoints that | ||
| // rich text editors may insert between characters, breaking detector regexes. | ||
| var invisibleReplacer = strings.NewReplacer( | ||
| "\u200B", "", // zero-width space | ||
| "\u200C", "", // zero-width non-joiner | ||
| "\u200D", "", // zero-width joiner | ||
| "\uFEFF", "", // byte order mark / zero-width no-break space | ||
| "\u00AD", "", // soft hyphen | ||
| "\u2060", "", // word joiner | ||
| "\u200E", "", // left-to-right mark | ||
| "\u200F", "", // right-to-left mark | ||
| ) | ||
|
|
||
| // blockElements insert newline boundaries when encountered during extraction. | ||
| var blockElements = map[string]bool{ | ||
| "p": true, "div": true, "br": true, "hr": true, | ||
| "h1": true, "h2": true, "h3": true, "h4": true, "h5": true, "h6": true, | ||
| "li": true, "ol": true, "ul": true, | ||
| "tr": true, "td": true, "th": true, "table": true, "thead": true, "tbody": true, "tfoot": true, | ||
| "blockquote": true, "section": true, "article": true, "header": true, "footer": true, | ||
| "pre": true, "address": true, "figcaption": true, "figure": true, | ||
| "details": true, "summary": true, "main": true, "nav": true, "aside": true, | ||
| "form": true, "fieldset": true, "legend": true, | ||
| "dd": true, "dt": true, "dl": true, | ||
| } | ||
cursor[bot] marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| func (d *HTML) FromChunk(chunk *sources.Chunk) *DecodableChunk { | ||
| if !feature.HTMLDecoderEnabled.Load() { | ||
| return nil | ||
| } | ||
| if chunk == nil || len(chunk.Data) == 0 { | ||
| return nil | ||
| } | ||
|
|
||
| if !looksLikeHTML(chunk.Data) { | ||
| return nil | ||
| } | ||
|
|
||
| extracted := extractHTML(chunk.Data) | ||
| if len(extracted) == 0 { | ||
| return nil | ||
| } | ||
|
|
||
| if bytes.Equal(chunk.Data, extracted) { | ||
| return nil | ||
| } | ||
|
|
||
| chunk.Data = extracted | ||
| return &DecodableChunk{Chunk: chunk, DecoderType: d.Type()} | ||
| } | ||
|
|
||
| func looksLikeHTML(data []byte) bool { | ||
| return htmlTagPattern.Match(data) | ||
| } | ||
|
|
||
| func extractHTML(data []byte) []byte { | ||
| doc, err := html.Parse(bytes.NewReader(data)) | ||
| if err != nil { | ||
| return nil | ||
| } | ||
|
|
||
| var buf bytes.Buffer | ||
| buf.Grow(len(data)) | ||
|
|
||
| walkNode(&buf, doc) | ||
|
|
||
| result := stripInvisible(buf.Bytes()) | ||
| result = decodeResidualEntities(result) | ||
| return normalizeWhitespace(result) | ||
cursor[bot] marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| } | ||
|
|
||
| func walkNode(buf *bytes.Buffer, n *html.Node) { | ||
| switch n.Type { | ||
| case html.TextNode: | ||
| text := n.Data | ||
| if text != "" { | ||
| buf.WriteString(text) | ||
| } | ||
|
|
||
| case html.CommentNode: | ||
| if content := strings.TrimSpace(n.Data); content != "" { | ||
| ensureNewline(buf) | ||
| buf.WriteString(content) | ||
| ensureNewline(buf) | ||
| } | ||
|
|
||
| case html.ElementNode: | ||
| isBlock := blockElements[n.Data] | ||
|
|
||
| if isBlock { | ||
| ensureNewline(buf) | ||
| } else if hasSyntaxHighlightClass(n) { | ||
| ensureNewline(buf) | ||
| } | ||
|
|
||
| emitAttributes(buf, n) | ||
|
|
||
| for c := n.FirstChild; c != nil; c = c.NextSibling { | ||
| walkNode(buf, c) | ||
| } | ||
|
|
||
| if isBlock || n.Data == "br" { | ||
| ensureNewline(buf) | ||
| } | ||
cursor[bot] marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| default: | ||
| for c := n.FirstChild; c != nil; c = c.NextSibling { | ||
| walkNode(buf, c) | ||
| } | ||
| } | ||
| } | ||
|
|
||
| func hasSyntaxHighlightClass(n *html.Node) bool { | ||
| for _, attr := range n.Attr { | ||
| if attr.Key != "class" { | ||
| continue | ||
| } | ||
| for _, cls := range strings.Fields(attr.Val) { | ||
| for _, prefix := range syntaxHighlightPrefixes { | ||
| if strings.HasPrefix(cls, prefix) { | ||
| return true | ||
| } | ||
| } | ||
| } | ||
| } | ||
| return false | ||
| } | ||
|
|
||
| func emitAttributes(buf *bytes.Buffer, n *html.Node) { | ||
| for _, attr := range n.Attr { | ||
| isDataAttr := strings.HasPrefix(attr.Key, "data-") | ||
| if !highSignalAttrs[attr.Key] && !isDataAttr { | ||
| continue | ||
| } | ||
| val := strings.TrimSpace(attr.Val) | ||
| if val == "" || val == "#" { | ||
| continue | ||
| } | ||
| decoded, err := url.PathUnescape(val) | ||
| if err == nil && decoded != val { | ||
| val = decoded | ||
| } | ||
cursor[bot] marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| ensureNewline(buf) | ||
| buf.WriteString(val) | ||
| ensureNewline(buf) | ||
| } | ||
| } | ||
|
|
||
// ensureNewline makes a non-empty buf end with '\n', appending one only when
// the last byte is something else. An empty buffer is left untouched.
func ensureNewline(buf *bytes.Buffer) {
	written := buf.Bytes()
	if last := len(written) - 1; last >= 0 && written[last] != '\n' {
		buf.WriteByte('\n')
	}
}
|
|
||
| func stripInvisible(data []byte) []byte { | ||
| return []byte(invisibleReplacer.Replace(string(data))) | ||
| } | ||
|
|
||
| func decodeResidualEntities(data []byte) []byte { | ||
| s := string(data) | ||
| decoded := residualEntityReplacer.Replace(s) | ||
| if decoded == s { | ||
| return data | ||
| } | ||
| return []byte(decoded) | ||
| } | ||
|
|
||
// normalizeWhitespace trims surrounding whitespace from every line, drops
// leading and trailing blank lines, and collapses each interior run of blank
// lines into a single empty line.
func normalizeWhitespace(data []byte) []byte {
	var kept [][]byte
	gap := false // a blank line was seen since the last kept line
	for _, raw := range bytes.Split(data, []byte("\n")) {
		line := bytes.TrimSpace(raw)
		if len(line) == 0 {
			gap = true
			continue
		}
		// Emit the separator lazily so blank runs at either end never appear.
		if gap && len(kept) > 0 {
			kept = append(kept, []byte{})
		}
		kept = append(kept, line)
		gap = false
	}
	return bytes.Join(kept, []byte("\n"))
}
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do you think it would be helpful to note that
`hljs-` is an MS Teams-specific use case? In case people need to append more to the slice over time as sources change.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I agree, I edited the comment to call out the Teams use case and guide future non-Teams additions.