Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
163 changes: 100 additions & 63 deletions modules/highlight/highlight.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,9 @@
package highlight

import (
"bufio"
"bytes"
"fmt"
gohtml "html"
"html/template"
"io"
"strings"
"slices"
"sync"

"code.gitea.io/gitea/modules/log"
Expand All @@ -23,12 +19,14 @@ import (
"github.com/alecthomas/chroma/v2/styles"
)

// don't index files larger than this many bytes for performance purposes
// don't highlight files larger than this many bytes for performance purposes
const sizeLimit = 1024 * 1024

// globalVarsType groups the package-wide highlight state that globalVars() populates.
type globalVarsType struct {
// highlightMapping is the mapping returned by setting.GetHighlightMapping().
highlightMapping map[string]string
// githubStyles is the chroma "github" style; it is passed to the formatter.
githubStyles *chroma.Style
// escapeFull is a 256-entry table indexed by byte value: the control-character
// replacements of escCtrlCharsMap plus the HTML entities for & ' < > ".
// An empty entry means the byte is emitted unchanged.
escapeFull []template.HTML
// escCtrlCharsMap is a 256-entry table indexed by byte value: ASCII control
// characters (0x00-0x1F and 0x7F, except \t, \n and \r) map to a
// "broken-code-point" span; every other entry is empty (no replacement).
escCtrlCharsMap []template.HTML
}

var (
Expand All @@ -44,10 +42,69 @@ func globalVars() *globalVarsType {
globalVarsPtr = &globalVarsType{}
globalVarsPtr.githubStyles = styles.Get("github")
globalVarsPtr.highlightMapping = setting.GetHighlightMapping()
globalVarsPtr.escCtrlCharsMap = make([]template.HTML, 256)
// ASCII Table 0x00 - 0x1F
controlCharNames := []string{
"NUL", "SOH", "STX", "ETX", "EOT", "ENQ", "ACK", "BEL",
"BS", "HT", "LF", "VT", "FF", "CR", "SO", "SI",
"DLE", "DC1", "DC2", "DC3", "DC4", "NAK", "SYN", "ETB",
"CAN", "EM", "SUB", "ESC", "FS", "GS", "RS", "US",
}
// Uncomment this line if you'd like to debug the layout without creating a special file; Space (0x20) will then also be escaped.
// Don't worry, even if you forget to comment it out and push it to git repo, the CI tests will catch it and fail.
// controlCharNames = append(controlCharNames, "SP")
for i, s := range controlCharNames {
globalVarsPtr.escCtrlCharsMap[i] = template.HTML(`<span class="broken-code-point" data-escaped="` + s + `"><span class="char">` + string(byte(i)) + `</span></span>`)
}
globalVarsPtr.escCtrlCharsMap[0x7f] = template.HTML(`<span class="broken-code-point" data-escaped="DEL"><span class="char">` + string(byte(0x7f)) + `</span></span>`)
globalVarsPtr.escCtrlCharsMap['\t'] = ""
globalVarsPtr.escCtrlCharsMap['\n'] = ""
globalVarsPtr.escCtrlCharsMap['\r'] = ""

globalVarsPtr.escapeFull = slices.Clone(globalVarsPtr.escCtrlCharsMap)
// exactly the same as Golang's html.EscapeString
globalVarsPtr.escapeFull['&'] = "&amp;"
globalVarsPtr.escapeFull['\''] = "&#39;"
globalVarsPtr.escapeFull['<'] = "&lt;"
globalVarsPtr.escapeFull['>'] = "&gt;"
globalVarsPtr.escapeFull['"'] = "&#34;"
}
return globalVarsPtr
}

// escapeByMap returns code with every byte that has a non-empty entry in escapeMap
// replaced by that entry; all other bytes are copied through unchanged.
//
// escapeMap is indexed directly by byte value, so it must have an entry for every
// byte that can occur in code (the tables built in this package have 256 entries).
//
// When no byte needs a replacement, the input is converted to a string via
// util.UnsafeBytesToString without building a new buffer
// (NOTE(review): presumably zero-copy, so the caller must not modify code afterwards - confirm).
func escapeByMap(code []byte, escapeMap []template.HTML) template.HTML {
	// Skip the leading run of bytes that need no replacement.
	start := 0
	for start < len(code) && escapeMap[code[start]] == "" {
		start++
	}
	if start == len(code) {
		// Nothing to escape (this also covers empty input).
		return template.HTML(util.UnsafeBytesToString(code))
	}

	// Reserve twice the input size up front; append grows the buffer if the
	// replacements need more than that.
	out := make([]byte, 0, len(code)*2)
	out = append(out, code[:start]...)
	for _, b := range code[start:] {
		replacement := escapeMap[b]
		if replacement == "" {
			out = append(out, b)
			continue
		}
		out = append(out, replacement...)
	}
	return template.HTML(util.UnsafeBytesToString(out))
}

func escapeFullString(code string) template.HTML {
return escapeByMap(util.UnsafeStringToBytes(code), globalVars().escapeFull)
}

func escapeControlChars(code []byte) template.HTML {
return escapeByMap(code, globalVars().escCtrlCharsMap)
}

// UnsafeSplitHighlightedLines splits highlighted code into lines preserving HTML tags
// It always includes '\n', '\n' can appear at the end of each line or in the middle of HTML tags
// The '\n' is necessary for copying code from web UI to preserve original code lines
Expand Down Expand Up @@ -90,7 +147,7 @@ func RenderCodeSlowGuess(fileName, language, code string) (output template.HTML,
}

if len(code) > sizeLimit {
return template.HTML(template.HTMLEscapeString(code)), nil, ""
return escapeFullString(code), nil, ""
}

lexer = detectChromaLexerWithAnalyze(fileName, language, util.UnsafeStringToBytes(code)) // it is also slow
Expand All @@ -104,86 +161,66 @@ func RenderCodeByLexer(lexer chroma.Lexer, code string) template.HTML {
html.PreventSurroundingPre(true),
)

htmlbuf := bytes.Buffer{}
htmlw := bufio.NewWriter(&htmlbuf)

iterator, err := lexer.Tokenise(nil, code)
if err != nil {
log.Error("Can't tokenize code: %v", err)
return template.HTML(template.HTMLEscapeString(code))
return escapeFullString(code)
}

htmlBuf := &bytes.Buffer{}
// style not used for live site but need to pass something
err = formatter.Format(htmlw, globalVars().githubStyles, iterator)
err = formatter.Format(htmlBuf, globalVars().githubStyles, iterator)
if err != nil {
log.Error("Can't format code: %v", err)
return template.HTML(template.HTMLEscapeString(code))
return escapeFullString(code)
}

_ = htmlw.Flush()
// Chroma will add newlines for certain lexers in order to highlight them properly
// Once highlighted, strip them here, so they don't cause copy/paste trouble in HTML output
return template.HTML(strings.TrimSuffix(htmlbuf.String(), "\n"))
// At the moment, we do not escape control chars here (unlike RenderFullFile which escapes control chars).
// The reason is: it is a very rare case that a text file contains control chars.
// This function is mostly used for diff and blame highlighting, and it is unclear whether escaping here would have side effects.
// If users report problems with this, the various edge cases can be reconsidered.
return template.HTML(htmlBuf.String())
}

// RenderFullFile returns a slice of chroma syntax highlighted HTML lines of code and the matched lexer name
func RenderFullFile(fileName, language string, code []byte) ([]template.HTML, string, error) {
if len(code) > sizeLimit {
return RenderPlainText(code), "", nil
func RenderFullFile(fileName, language string, code []byte) ([]template.HTML, string) {
if language == LanguagePlaintext || len(code) > sizeLimit {
return renderPlainText(code), formatLexerName(LanguagePlaintext)
}

formatter := html.New(html.WithClasses(true),
html.WithLineNumbers(false),
html.PreventSurroundingPre(true),
)

lexer := detectChromaLexerWithAnalyze(fileName, language, code)
lexerName := formatLexerName(lexer.Config().Name)

iterator, err := lexer.Tokenise(nil, string(code))
if err != nil {
return nil, "", fmt.Errorf("can't tokenize code: %w", err)
rendered := RenderCodeByLexer(lexer, util.UnsafeBytesToString(code))
unsafeLines := UnsafeSplitHighlightedLines(rendered)
lines := make([]template.HTML, 0, len(unsafeLines))
for _, lineBytes := range unsafeLines {
line := escapeControlChars(lineBytes)
lines = append(lines, line)
}

tokensLines := chroma.SplitTokensIntoLines(iterator.Tokens())
htmlBuf := &bytes.Buffer{}

lines := make([]template.HTML, 0, len(tokensLines))
for _, tokens := range tokensLines {
iterator = chroma.Literator(tokens...)
err = formatter.Format(htmlBuf, globalVars().githubStyles, iterator)
if err != nil {
return nil, "", fmt.Errorf("can't format code: %w", err)
}
lines = append(lines, template.HTML(htmlBuf.String()))
htmlBuf.Reset()
}

return lines, lexerName, nil
return lines, lexerName
}

// RenderPlainText returns non-highlighted HTML for code
func RenderPlainText(code []byte) []template.HTML {
r := bufio.NewReader(bytes.NewReader(code))
m := make([]template.HTML, 0, bytes.Count(code, []byte{'\n'})+1)
for {
content, err := r.ReadString('\n')
if err != nil && err != io.EOF {
log.Error("failed to read string from buffer: %v", err)
break
}
if content == "" && err == io.EOF {
break
// renderPlainText returns non-highlighted HTML for code
func renderPlainText(code []byte) []template.HTML {
lines := make([]template.HTML, 0, bytes.Count(code, []byte{'\n'})+1)
pos := 0
for pos < len(code) {
var content []byte
nextPos := bytes.IndexByte(code[pos:], '\n')
if nextPos == -1 {
content = code[pos:]
pos = len(code)
} else {
content = code[pos : pos+nextPos+1]
pos += nextPos + 1
}
s := template.HTML(gohtml.EscapeString(content))
m = append(m, s)
lines = append(lines, escapeFullString(util.UnsafeBytesToString(content)))
}
return m
return lines
}

func formatLexerName(name string) string {
if name == "fallback" {
if name == LanguagePlaintext || name == chromaLexerFallback {
return "Plaintext"
}

return util.ToTitleCaseNoLower(name)
}
16 changes: 13 additions & 3 deletions modules/highlight/highlight_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -118,8 +118,7 @@ c=2

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
out, lexerName, err := RenderFullFile(tt.name, "", []byte(tt.code))
assert.NoError(t, err)
out, lexerName := RenderFullFile(tt.name, "", []byte(tt.code))
assert.Equal(t, tt.want, out)
assert.Equal(t, tt.lexerName, lexerName)
})
Expand Down Expand Up @@ -182,7 +181,7 @@ c=2`),

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
out := RenderPlainText([]byte(tt.code))
out := renderPlainText([]byte(tt.code))
assert.Equal(t, tt.want, out)
})
}
Expand All @@ -205,3 +204,14 @@ func TestUnsafeSplitHighlightedLines(t *testing.T) {
assert.Equal(t, "<span>a</span>\n", string(ret[0]))
assert.Equal(t, "<span>b\n</span>", string(ret[1]))
}

func TestEscape(t *testing.T) {
assert.Equal(t, template.HTML("\t\r\n<span class=\"broken-code-point\" data-escaped=\"NUL\"><span class=\"char\">\x00</span></span><span class=\"broken-code-point\" data-escaped=\"US\"><span class=\"char\">\x1f</span></span>&'\"<>"), escapeControlChars([]byte("\t\r\n\x00\x1f&'\"<>")))
assert.Equal(t, template.HTML("<span class=\"broken-code-point\" data-escaped=\"NUL\"><span class=\"char\">\x00</span></span><span class=\"broken-code-point\" data-escaped=\"US\"><span class=\"char\">\x1f</span></span>&amp;&#39;&#34;&lt;&gt;\t\r\n"), escapeFullString("\x00\x1f&'\"<>\t\r\n"))

out, _ := RenderFullFile("a.py", "", []byte("# \x7f<>"))
assert.Equal(t, template.HTML(`<span class="c1"># <span class="broken-code-point" data-escaped="DEL"><span class="char">`+string(byte(0x7f))+`</span></span>&lt;&gt;</span>`), out[0])

out = renderPlainText([]byte("# \x7f<>"))
assert.Equal(t, template.HTML(`# <span class="broken-code-point" data-escaped="DEL"><span class="char">`+string(byte(0x7f))+`</span></span>&lt;&gt;`), out[0])
}
6 changes: 5 additions & 1 deletion modules/highlight/lexerdetect.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,11 @@ import (
"github.com/go-enry/go-enry/v2"
)

const mapKeyLowerPrefix = "lower/"
const (
mapKeyLowerPrefix = "lower/"
LanguagePlaintext = "plaintext"
chromaLexerFallback = "fallback"
)

// chromaLexers is fully managed by us to do fast lookup for chroma lexers by file name or language name
// Don't use lexers.Get because it is very slow in many cases (iterate all rules, filepath glob match, etc.)
Expand Down
6 changes: 1 addition & 5 deletions routers/web/repo/view_file.go
Original file line number Diff line number Diff line change
Expand Up @@ -119,12 +119,8 @@ func handleFileViewRenderSource(ctx *context.Context, attrs *attribute.Attribute
}

language := attrs.GetLanguage().Value()
fileContent, lexerName, err := highlight.RenderFullFile(filename, language, buf)
fileContent, lexerName := highlight.RenderFullFile(filename, language, buf)
ctx.Data["LexerName"] = lexerName
if err != nil {
log.Error("highlight.RenderFullFile failed, fallback to plain text: %v", err)
fileContent = highlight.RenderPlainText(buf)
}
status := &charset.EscapeStatus{}
statuses := make([]*charset.EscapeStatus, len(fileContent))
for i, line := range fileContent {
Expand Down
2 changes: 1 addition & 1 deletion services/gitdiff/gitdiff_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1140,7 +1140,7 @@ func TestHighlightCodeLines(t *testing.T) {
ret := highlightCodeLinesForDiffFile(diffFile, true, []byte("a\nb\n"))
assert.Equal(t, map[int]template.HTML{
0: `<span class="n">a</span>` + nl,
1: `<span class="n">b</span>`,
1: `<span class="n">b</span>` + nl,
}, ret)
})
}
Expand Down
1 change: 1 addition & 0 deletions web_src/css/index.css
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
@import "./modules/flexcontainer.css";
@import "./modules/codeeditor.css";
@import "./modules/chroma.css";
@import "./modules/charescape.css";

@import "./shared/flex-list.css";
@import "./shared/milestone.css";
Expand Down
48 changes: 48 additions & 0 deletions web_src/css/modules/charescape.css
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
/*
Show the escaped and hide the real char:
<span class="broken-code-point" data-escaped="DEL"><span class="char">{real-char}</span></span>
Only show the real-char:
<span class="broken-code-point">{real-char}</span>
*/
/* Shared "pill" look (rounded corners, horizontal padding, theme colors) for
   - a broken code point shown as the real char (no data-escaped attribute), and
   - the generated ::before label of a broken code point that has data-escaped. */
.broken-code-point:not([data-escaped]),
.broken-code-point[data-escaped]::before {
border-radius: 4px;
padding: 0 2px;
color: var(--color-body);
background: var(--color-text-light-1);
}

/* The visible label is generated from the data-escaped attribute value (e.g. "DEL"). */
.broken-code-point[data-escaped]::before {
visibility: visible;
content: attr(data-escaped);
}
/* The real character stays in the DOM but is taken out of the layout flow,
   made fully transparent, and ignores pointer events. */
.broken-code-point[data-escaped] .char {
/* make it copyable by selecting the text (AI suggestion, no other solution) */
position: absolute;
opacity: 0;
pointer-events: none;
}

/*
Show the escaped and hide the real-char:
<span class="unicode-escaped">
<span class="escaped-code-point" data-escaped="U+1F600"><span class="char">{real-char}</span></span>
</span>
Hide the escaped and show the real-char:
<span>
<span class="escaped-code-point" data-escaped="U+1F600"><span class="char">{real-char}</span></span>
</span>
*/
/* Inside a .unicode-escaped container, show the data-escaped value (e.g. "U+1F600")
   as a red generated label in front of the code point. */
.unicode-escaped .escaped-code-point[data-escaped]::before {
visibility: visible;
content: attr(data-escaped);
color: var(--color-red);
}

/* Inside a .unicode-escaped container the real character is not rendered at all;
   only the generated label above is shown. */
.unicode-escaped .escaped-code-point .char {
display: none;
}

/* Inside a .unicode-escaped container, outline ambiguous code points with a thin yellow border. */
.unicode-escaped .ambiguous-code-point {
border: 1px var(--color-yellow) solid;
}
20 changes: 0 additions & 20 deletions web_src/css/repo.css
Original file line number Diff line number Diff line change
Expand Up @@ -8,26 +8,6 @@
min-width: 40% !important;
}

.repository .unicode-escaped .escaped-code-point[data-escaped]::before {
visibility: visible;
content: attr(data-escaped);
font-family: var(--fonts-monospace);
color: var(--color-red);
}

.repository .unicode-escaped .escaped-code-point .char {
display: none;
}

.repository .broken-code-point {
font-family: var(--fonts-monospace);
color: var(--color-blue);
}

.repository .unicode-escaped .ambiguous-code-point {
border: 1px var(--color-yellow) solid;
}

.issue-content {
display: flex;
align-items: flex-start;
Expand Down