Skip to content

Commit 6eed75a

Browse files
authored
Refactor code render and render control chars (#37078)
Fix #37057
1 parent 7b17234 commit 6eed75a

File tree

8 files changed

+169
-93
lines changed

8 files changed

+169
-93
lines changed

modules/highlight/highlight.go

Lines changed: 100 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,9 @@
55
package highlight
66

77
import (
8-
"bufio"
98
"bytes"
10-
"fmt"
11-
gohtml "html"
129
"html/template"
13-
"io"
14-
"strings"
10+
"slices"
1511
"sync"
1612

1713
"code.gitea.io/gitea/modules/log"
@@ -23,12 +19,14 @@ import (
2319
"github.com/alecthomas/chroma/v2/styles"
2420
)
2521

26-
// don't index files larger than this many bytes for performance purposes
22+
// don't highlight files larger than this many bytes for performance purposes
2723
const sizeLimit = 1024 * 1024
2824

2925
type globalVarsType struct {
3026
highlightMapping map[string]string
3127
githubStyles *chroma.Style
28+
escapeFull []template.HTML
29+
escCtrlCharsMap []template.HTML
3230
}
3331

3432
var (
@@ -44,10 +42,69 @@ func globalVars() *globalVarsType {
4442
globalVarsPtr = &globalVarsType{}
4543
globalVarsPtr.githubStyles = styles.Get("github")
4644
globalVarsPtr.highlightMapping = setting.GetHighlightMapping()
45+
globalVarsPtr.escCtrlCharsMap = make([]template.HTML, 256)
46+
// ASCII Table 0x00 - 0x1F
47+
controlCharNames := []string{
48+
"NUL", "SOH", "STX", "ETX", "EOT", "ENQ", "ACK", "BEL",
49+
"BS", "HT", "LF", "VT", "FF", "CR", "SO", "SI",
50+
"DLE", "DC1", "DC2", "DC3", "DC4", "NAK", "SYN", "ETB",
51+
"CAN", "EM", "SUB", "ESC", "FS", "GS", "RS", "US",
52+
}
53+
// Uncomment this line if you'd like to debug the layout without creating a special file; Space (0x20) will then also be escaped.
54+
// Don't worry: even if you forget to comment it out again and push it to the git repo, the CI tests will catch it and fail.
55+
// controlCharNames = append(controlCharNames, "SP")
56+
for i, s := range controlCharNames {
57+
globalVarsPtr.escCtrlCharsMap[i] = template.HTML(`<span class="broken-code-point" data-escaped="` + s + `"><span class="char">` + string(byte(i)) + `</span></span>`)
58+
}
59+
globalVarsPtr.escCtrlCharsMap[0x7f] = template.HTML(`<span class="broken-code-point" data-escaped="DEL"><span class="char">` + string(byte(0x7f)) + `</span></span>`)
60+
globalVarsPtr.escCtrlCharsMap['\t'] = ""
61+
globalVarsPtr.escCtrlCharsMap['\n'] = ""
62+
globalVarsPtr.escCtrlCharsMap['\r'] = ""
63+
64+
globalVarsPtr.escapeFull = slices.Clone(globalVarsPtr.escCtrlCharsMap)
65+
// exactly the same as Golang's html.EscapeString
66+
globalVarsPtr.escapeFull['&'] = "&amp;"
67+
globalVarsPtr.escapeFull['\''] = "&#39;"
68+
globalVarsPtr.escapeFull['<'] = "&lt;"
69+
globalVarsPtr.escapeFull['>'] = "&gt;"
70+
globalVarsPtr.escapeFull['"'] = "&#34;"
4771
}
4872
return globalVarsPtr
4973
}
5074

75+
func escapeByMap(code []byte, escapeMap []template.HTML) template.HTML {
76+
firstEscapePos := -1
77+
for i, c := range code {
78+
if escapeMap[c] != "" {
79+
firstEscapePos = i
80+
break
81+
}
82+
}
83+
if firstEscapePos == -1 {
84+
return template.HTML(util.UnsafeBytesToString(code))
85+
}
86+
87+
buf := make([]byte, firstEscapePos, len(code)*2)
88+
copy(buf[:firstEscapePos], code[:firstEscapePos])
89+
for i := firstEscapePos; i < len(code); i++ {
90+
c := code[i]
91+
if esc := escapeMap[c]; esc != "" {
92+
buf = append(buf, esc...)
93+
} else {
94+
buf = append(buf, c)
95+
}
96+
}
97+
return template.HTML(util.UnsafeBytesToString(buf))
98+
}
99+
100+
func escapeFullString(code string) template.HTML {
101+
return escapeByMap(util.UnsafeStringToBytes(code), globalVars().escapeFull)
102+
}
103+
104+
func escapeControlChars(code []byte) template.HTML {
105+
return escapeByMap(code, globalVars().escCtrlCharsMap)
106+
}
107+
51108
// UnsafeSplitHighlightedLines splits highlighted code into lines preserving HTML tags
52109
// It always includes '\n'; the '\n' can appear at the end of each line or in the middle of HTML tags
53110
// The '\n' is necessary for copying code from web UI to preserve original code lines
@@ -90,7 +147,7 @@ func RenderCodeSlowGuess(fileName, language, code string) (output template.HTML,
90147
}
91148

92149
if len(code) > sizeLimit {
93-
return template.HTML(template.HTMLEscapeString(code)), nil, ""
150+
return escapeFullString(code), nil, ""
94151
}
95152

96153
lexer = detectChromaLexerWithAnalyze(fileName, language, util.UnsafeStringToBytes(code)) // it is also slow
@@ -104,86 +161,66 @@ func RenderCodeByLexer(lexer chroma.Lexer, code string) template.HTML {
104161
html.PreventSurroundingPre(true),
105162
)
106163

107-
htmlbuf := bytes.Buffer{}
108-
htmlw := bufio.NewWriter(&htmlbuf)
109-
110164
iterator, err := lexer.Tokenise(nil, code)
111165
if err != nil {
112166
log.Error("Can't tokenize code: %v", err)
113-
return template.HTML(template.HTMLEscapeString(code))
167+
return escapeFullString(code)
114168
}
169+
170+
htmlBuf := &bytes.Buffer{}
115171
// style not used for live site but need to pass something
116-
err = formatter.Format(htmlw, globalVars().githubStyles, iterator)
172+
err = formatter.Format(htmlBuf, globalVars().githubStyles, iterator)
117173
if err != nil {
118174
log.Error("Can't format code: %v", err)
119-
return template.HTML(template.HTMLEscapeString(code))
175+
return escapeFullString(code)
120176
}
121177

122-
_ = htmlw.Flush()
123-
// Chroma will add newlines for certain lexers in order to highlight them properly
124-
// Once highlighted, strip them here, so they don't cause copy/paste trouble in HTML output
125-
return template.HTML(strings.TrimSuffix(htmlbuf.String(), "\n"))
178+
// At the moment, we do not escape control chars here (unlike RenderFullFile which escapes control chars).
179+
// The reason is: it is a very rare case that a text file contains control chars.
180+
// This function is usually used for highlighting diff and blame, and we are not quite sure whether escaping here would have side effects.
181+
// If there is new user feedback about this, we can reconsider the various edge cases.
182+
return template.HTML(htmlBuf.String())
126183
}
127184

128185
// RenderFullFile returns a slice of chroma syntax highlighted HTML lines of code and the matched lexer name
129-
func RenderFullFile(fileName, language string, code []byte) ([]template.HTML, string, error) {
130-
if len(code) > sizeLimit {
131-
return RenderPlainText(code), "", nil
186+
func RenderFullFile(fileName, language string, code []byte) ([]template.HTML, string) {
187+
if language == LanguagePlaintext || len(code) > sizeLimit {
188+
return renderPlainText(code), formatLexerName(LanguagePlaintext)
132189
}
133-
134-
formatter := html.New(html.WithClasses(true),
135-
html.WithLineNumbers(false),
136-
html.PreventSurroundingPre(true),
137-
)
138-
139190
lexer := detectChromaLexerWithAnalyze(fileName, language, code)
140191
lexerName := formatLexerName(lexer.Config().Name)
141-
142-
iterator, err := lexer.Tokenise(nil, string(code))
143-
if err != nil {
144-
return nil, "", fmt.Errorf("can't tokenize code: %w", err)
192+
rendered := RenderCodeByLexer(lexer, util.UnsafeBytesToString(code))
193+
unsafeLines := UnsafeSplitHighlightedLines(rendered)
194+
lines := make([]template.HTML, 0, len(unsafeLines))
195+
for _, lineBytes := range unsafeLines {
196+
line := escapeControlChars(lineBytes)
197+
lines = append(lines, line)
145198
}
146-
147-
tokensLines := chroma.SplitTokensIntoLines(iterator.Tokens())
148-
htmlBuf := &bytes.Buffer{}
149-
150-
lines := make([]template.HTML, 0, len(tokensLines))
151-
for _, tokens := range tokensLines {
152-
iterator = chroma.Literator(tokens...)
153-
err = formatter.Format(htmlBuf, globalVars().githubStyles, iterator)
154-
if err != nil {
155-
return nil, "", fmt.Errorf("can't format code: %w", err)
156-
}
157-
lines = append(lines, template.HTML(htmlBuf.String()))
158-
htmlBuf.Reset()
159-
}
160-
161-
return lines, lexerName, nil
199+
return lines, lexerName
162200
}
163201

164-
// RenderPlainText returns non-highlighted HTML for code
165-
func RenderPlainText(code []byte) []template.HTML {
166-
r := bufio.NewReader(bytes.NewReader(code))
167-
m := make([]template.HTML, 0, bytes.Count(code, []byte{'\n'})+1)
168-
for {
169-
content, err := r.ReadString('\n')
170-
if err != nil && err != io.EOF {
171-
log.Error("failed to read string from buffer: %v", err)
172-
break
173-
}
174-
if content == "" && err == io.EOF {
175-
break
202+
// renderPlainText returns non-highlighted HTML for code
203+
func renderPlainText(code []byte) []template.HTML {
204+
lines := make([]template.HTML, 0, bytes.Count(code, []byte{'\n'})+1)
205+
pos := 0
206+
for pos < len(code) {
207+
var content []byte
208+
nextPos := bytes.IndexByte(code[pos:], '\n')
209+
if nextPos == -1 {
210+
content = code[pos:]
211+
pos = len(code)
212+
} else {
213+
content = code[pos : pos+nextPos+1]
214+
pos += nextPos + 1
176215
}
177-
s := template.HTML(gohtml.EscapeString(content))
178-
m = append(m, s)
216+
lines = append(lines, escapeFullString(util.UnsafeBytesToString(content)))
179217
}
180-
return m
218+
return lines
181219
}
182220

183221
func formatLexerName(name string) string {
184-
if name == "fallback" {
222+
if name == LanguagePlaintext || name == chromaLexerFallback {
185223
return "Plaintext"
186224
}
187-
188225
return util.ToTitleCaseNoLower(name)
189226
}

modules/highlight/highlight_test.go

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -118,8 +118,7 @@ c=2
118118

119119
for _, tt := range tests {
120120
t.Run(tt.name, func(t *testing.T) {
121-
out, lexerName, err := RenderFullFile(tt.name, "", []byte(tt.code))
122-
assert.NoError(t, err)
121+
out, lexerName := RenderFullFile(tt.name, "", []byte(tt.code))
123122
assert.Equal(t, tt.want, out)
124123
assert.Equal(t, tt.lexerName, lexerName)
125124
})
@@ -182,7 +181,7 @@ c=2`),
182181

183182
for _, tt := range tests {
184183
t.Run(tt.name, func(t *testing.T) {
185-
out := RenderPlainText([]byte(tt.code))
184+
out := renderPlainText([]byte(tt.code))
186185
assert.Equal(t, tt.want, out)
187186
})
188187
}
@@ -205,3 +204,14 @@ func TestUnsafeSplitHighlightedLines(t *testing.T) {
205204
assert.Equal(t, "<span>a</span>\n", string(ret[0]))
206205
assert.Equal(t, "<span>b\n</span>", string(ret[1]))
207206
}
207+
208+
func TestEscape(t *testing.T) {
209+
assert.Equal(t, template.HTML("\t\r\n<span class=\"broken-code-point\" data-escaped=\"NUL\"><span class=\"char\">\x00</span></span><span class=\"broken-code-point\" data-escaped=\"US\"><span class=\"char\">\x1f</span></span>&'\"<>"), escapeControlChars([]byte("\t\r\n\x00\x1f&'\"<>")))
210+
assert.Equal(t, template.HTML("<span class=\"broken-code-point\" data-escaped=\"NUL\"><span class=\"char\">\x00</span></span><span class=\"broken-code-point\" data-escaped=\"US\"><span class=\"char\">\x1f</span></span>&amp;&#39;&#34;&lt;&gt;\t\r\n"), escapeFullString("\x00\x1f&'\"<>\t\r\n"))
211+
212+
out, _ := RenderFullFile("a.py", "", []byte("# \x7f<>"))
213+
assert.Equal(t, template.HTML(`<span class="c1"># <span class="broken-code-point" data-escaped="DEL"><span class="char">`+string(byte(0x7f))+`</span></span>&lt;&gt;</span>`), out[0])
214+
215+
out = renderPlainText([]byte("# \x7f<>"))
216+
assert.Equal(t, template.HTML(`# <span class="broken-code-point" data-escaped="DEL"><span class="char">`+string(byte(0x7f))+`</span></span>&lt;&gt;`), out[0])
217+
}

modules/highlight/lexerdetect.go

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,11 @@ import (
1616
"github.com/go-enry/go-enry/v2"
1717
)
1818

19-
const mapKeyLowerPrefix = "lower/"
19+
const (
20+
mapKeyLowerPrefix = "lower/"
21+
LanguagePlaintext = "plaintext"
22+
chromaLexerFallback = "fallback"
23+
)
2024

2125
// chromaLexers is fully managed by us to do fast lookup for chroma lexers by file name or language name
2226
// Don't use lexers.Get because it is very slow in many cases (iterate all rules, filepath glob match, etc.)

routers/web/repo/view_file.go

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -119,12 +119,8 @@ func handleFileViewRenderSource(ctx *context.Context, attrs *attribute.Attribute
119119
}
120120

121121
language := attrs.GetLanguage().Value()
122-
fileContent, lexerName, err := highlight.RenderFullFile(filename, language, buf)
122+
fileContent, lexerName := highlight.RenderFullFile(filename, language, buf)
123123
ctx.Data["LexerName"] = lexerName
124-
if err != nil {
125-
log.Error("highlight.RenderFullFile failed, fallback to plain text: %v", err)
126-
fileContent = highlight.RenderPlainText(buf)
127-
}
128124
status := &charset.EscapeStatus{}
129125
statuses := make([]*charset.EscapeStatus, len(fileContent))
130126
for i, line := range fileContent {

services/gitdiff/gitdiff_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1140,7 +1140,7 @@ func TestHighlightCodeLines(t *testing.T) {
11401140
ret := highlightCodeLinesForDiffFile(diffFile, true, []byte("a\nb\n"))
11411141
assert.Equal(t, map[int]template.HTML{
11421142
0: `<span class="n">a</span>` + nl,
1143-
1: `<span class="n">b</span>`,
1143+
1: `<span class="n">b</span>` + nl,
11441144
}, ret)
11451145
})
11461146
}

web_src/css/index.css

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
@import "./modules/flexcontainer.css";
3434
@import "./modules/codeeditor.css";
3535
@import "./modules/chroma.css";
36+
@import "./modules/charescape.css";
3637

3738
@import "./shared/flex-list.css";
3839
@import "./shared/milestone.css";

web_src/css/modules/charescape.css

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
/*
2+
Show the escaped and hide the real char:
3+
<span class="broken-code-point" data-escaped="DEL"><span class="char">{real-char}</span></span>
4+
Only show the real-char:
5+
<span class="broken-code-point">{real-char}</span>
6+
*/
7+
.broken-code-point:not([data-escaped]),
8+
.broken-code-point[data-escaped]::before {
9+
border-radius: 4px;
10+
padding: 0 2px;
11+
color: var(--color-body);
12+
background: var(--color-text-light-1);
13+
}
14+
15+
.broken-code-point[data-escaped]::before {
16+
visibility: visible;
17+
content: attr(data-escaped);
18+
}
19+
.broken-code-point[data-escaped] .char {
20+
/* make it copyable by selecting the text (AI suggestion, no other solution) */
21+
position: absolute;
22+
opacity: 0;
23+
pointer-events: none;
24+
}
25+
26+
/*
27+
Show the escaped and hide the real-char:
28+
<span class="unicode-escaped">
29+
<span class="escaped-code-point" data-escaped="U+1F600"><span class="char">{real-char}</span></span>
30+
</span>
31+
Hide the escaped and show the real-char:
32+
<span>
33+
<span class="escaped-code-point" data-escaped="U+1F600"><span class="char">{real-char}</span></span>
34+
</span>
35+
*/
36+
.unicode-escaped .escaped-code-point[data-escaped]::before {
37+
visibility: visible;
38+
content: attr(data-escaped);
39+
color: var(--color-red);
40+
}
41+
42+
.unicode-escaped .escaped-code-point .char {
43+
display: none;
44+
}
45+
46+
.unicode-escaped .ambiguous-code-point {
47+
border: 1px var(--color-yellow) solid;
48+
}

web_src/css/repo.css

Lines changed: 0 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -8,26 +8,6 @@
88
min-width: 40% !important;
99
}
1010

11-
.repository .unicode-escaped .escaped-code-point[data-escaped]::before {
12-
visibility: visible;
13-
content: attr(data-escaped);
14-
font-family: var(--fonts-monospace);
15-
color: var(--color-red);
16-
}
17-
18-
.repository .unicode-escaped .escaped-code-point .char {
19-
display: none;
20-
}
21-
22-
.repository .broken-code-point {
23-
font-family: var(--fonts-monospace);
24-
color: var(--color-blue);
25-
}
26-
27-
.repository .unicode-escaped .ambiguous-code-point {
28-
border: 1px var(--color-yellow) solid;
29-
}
30-
3111
.issue-content {
3212
display: flex;
3313
align-items: flex-start;

0 commit comments

Comments
 (0)