Skip to content

Commit 4431b1f

Browse files
committed
add more tests
1 parent 1eb893a commit 4431b1f

File tree

2 files changed

+73
-11
lines changed

2 files changed

+73
-11
lines changed

modules/charset/escape_stream.go

Lines changed: 25 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -16,17 +16,21 @@ import (
1616
"golang.org/x/net/html"
1717
)
1818

19+
type htmlChunkReader struct {
20+
in io.Reader
21+
readErr error
22+
readBuf []byte
23+
curInTag bool
24+
}
25+
1926
type escapeStreamer struct {
27+
htmlChunkReader
28+
2029
escaped *EscapeStatus
2130
locale translation.Locale
2231
ambiguousTables []*AmbiguousTable
2332
allowed map[rune]bool
2433

25-
in io.Reader
26-
readErr error
27-
readBuf []byte
28-
curInTag bool
29-
3034
out io.Writer
3135
}
3236

@@ -35,10 +39,10 @@ func escapeStream(locale translation.Locale, in io.Reader, out io.Writer, opts .
3539
escaped: &EscapeStatus{},
3640
locale: locale,
3741
ambiguousTables: AmbiguousTablesForLocale(locale),
38-
39-
in: in,
40-
readBuf: make([]byte, 0, 32*1024),
41-
42+
htmlChunkReader: htmlChunkReader{
43+
in: in,
44+
readBuf: make([]byte, 0, 32*1024),
45+
},
4246
out: out,
4347
}
4448

@@ -134,7 +138,7 @@ func (e *escapeStreamer) detectAndWriteRunes(part []byte) error {
134138
return e.writeDetectResults(part, results)
135139
}
136140

137-
func (e *escapeStreamer) readRunes() (parts [][]byte, partInTag []bool, _ error) {
141+
func (e *htmlChunkReader) readRunes() (parts [][]byte, partInTag []bool, _ error) {
138142
// we have read everything, eof
139143
if e.readErr != nil && len(e.readBuf) == 0 {
140144
return nil, nil, e.readErr
@@ -195,7 +199,7 @@ func (e *escapeStreamer) readRunes() (parts [][]byte, partInTag []bool, _ error)
195199
// now we get the curPart bytes, but we can't directly use it, the last rune in it might have been cut
196200
// try to decode the last rune, if it's invalid, then we cut the last byte and try again until we get a valid rune or no byte left
197201
for i := curPartLen - 1; i >= 0; i-- {
198-
last, lastSize := utf8.DecodeLastRune(curPart[i:])
202+
last, lastSize := utf8.DecodeRune(curPart[i:])
199203
if last == utf8.RuneError && lastSize == 1 {
200204
curPartLen--
201205
} else {
@@ -210,13 +214,23 @@ func (e *escapeStreamer) readRunes() (parts [][]byte, partInTag []bool, _ error)
210214
// * at least consume 1 byte to avoid infinite loop
211215
curPartLen = max(len(curPart)-utf8.UTFMax, 1)
212216
}
217+
218+
// if curPartLen is not the same as len(curPart), it means we have cut some trailing bytes,
219+
// need to wait for more data if not eof
220+
trailingCorrupted := curPartLen != len(curPart)
221+
213222
// finally, we get the real part we need
214223
curPart = curPart[:curPartLen]
215224
parts = append(parts, curPart)
216225
partInTag = append(partInTag, e.curInTag)
217226

218227
pos += curPartLen
219228
e.curInTag = nextInTag
229+
230+
if trailingCorrupted && e.readErr == nil {
231+
// if the last part is corrupted and we haven't reached EOF, then we need to wait for more data to get the complete part
232+
break
233+
}
220234
}
221235

222236
copy(e.readBuf, e.readBuf[pos:])

modules/charset/escape_test.go

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ import (
1313
"code.gitea.io/gitea/modules/translation"
1414

1515
"github.com/stretchr/testify/assert"
16+
"github.com/stretchr/testify/require"
1617
)
1718

1819
type escapeControlTest struct {
@@ -165,3 +166,50 @@ func TestSettingAmbiguousUnicodeDetection(t *testing.T) {
165166
_, out = EscapeControlHTML("a test", &translation.MockLocale{})
166167
assert.EqualValues(t, `a test`, out)
167168
}
169+
170+
func TestHTMLChunkReader(t *testing.T) {
171+
type textPart struct {
172+
text string
173+
isTag bool
174+
}
175+
testReadChunks := func(t *testing.T, chunkSize int, input string, expected []textPart) {
176+
r := &htmlChunkReader{in: strings.NewReader(input), readBuf: make([]byte, 0, chunkSize)}
177+
var results []textPart
178+
for {
179+
parts, partIsTag, err := r.readRunes()
180+
if err != nil {
181+
break
182+
}
183+
for i, part := range parts {
184+
results = append(results, textPart{string(part), partIsTag[i]})
185+
}
186+
}
187+
assert.Equal(t, expected, results, "chunk size: %d, input: %s", chunkSize, input)
188+
}
189+
190+
testReadChunks(t, 10, "abc<def>ghi", []textPart{
191+
{text: "abc", isTag: false},
192+
{text: "<def>", isTag: true},
193+
{text: "gh", isTag: false},
194+
// -- chunk
195+
{text: "i", isTag: false},
196+
})
197+
198+
testReadChunks(t, 10, "<abc><def>ghi", []textPart{
199+
{text: "<abc>", isTag: true},
200+
{text: "<def>", isTag: true},
201+
// -- chunk
202+
{text: "ghi", isTag: false},
203+
})
204+
205+
rune1, rune2, rune3, rune4 := "A", "é", "啊", "🌞"
206+
require.Len(t, rune1, 1)
207+
require.Len(t, rune2, 2)
208+
require.Len(t, rune3, 3)
209+
require.Len(t, rune4, 4)
210+
input := "<" + rune1 + rune2 + rune3 + rune4 + ">" + rune1 + rune2 + rune3 + rune4
211+
testReadChunks(t, 4, input, []textPart{{"<Aé", true}, {"啊", true}, {"🌞", true}, {">", true}, {"Aé", false}, {"啊", false}, {"🌞", false}})
212+
testReadChunks(t, 5, input, []textPart{{"<Aé", true}, {"啊", true}, {"🌞>", true}, {"Aé", false}, {"啊", false}, {"🌞", false}})
213+
testReadChunks(t, 6, input, []textPart{{"<Aé", true}, {"啊", true}, {"🌞>", true}, {"A", false}, {"é啊", false}, {"🌞", false}})
214+
testReadChunks(t, 7, input, []textPart{{"<Aé啊", true}, {"🌞>", true}, {"A", false}, {"é啊", false}, {"🌞", false}})
215+
}

0 commit comments

Comments
 (0)