Skip to content

Commit 0260ba5

Browse files
dzfrias and stevengj authored
Fix attempting to combine Hangul Jamo 0x11a7 (#317)
* Fix attempting to combine Hangul Jamo 0x11a7 0x11a7 is not a valid Hangul T syllable despite being equal to T_BASE. This is because, per the Unicode spec: TCount is set to one more than the number of trailing consonants relevant to the decomposition algorithm: (0x11C2 - 0x11A8 + 1) + 1 So the first valid Hangul T syllable is 0x11a8. Also see https://www.unicode.org/versions/Unicode17.0.0/core-spec/chapter-3/#G59434 for where the spec describes the usage of 0x11a8, not 0x11a7, during composition. * document that utf8proc_map simply wraps utf8proc_decompose and utf8proc_reencode (#312) * test code refactoring (#318) * Write regression test for #317 --------- Co-authored-by: Steven G. Johnson <stevenj@alum.mit.edu>
1 parent 3460568 commit 0260ba5

2 files changed

Lines changed: 21 additions & 1 deletion

File tree

test/misc.c

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,10 +25,30 @@ static void issue102(void) /* #102 */
2525
check_compare("NFKC_Casefold", input, correct, utf8proc_NFKC_Casefold(input), 1);
2626
}
2727

28+
static void issue317(void) /* #317 */
29+
{
30+
utf8proc_uint8_t input[] = {0xec, 0xa3, 0xa0, 0xe1, 0x86, 0xa7, 0x00}; /* "\uc8e0\u11a7" */
31+
utf8proc_uint8_t combined[] = {0xec, 0xa3, 0xa1, 0x00}; /* "\uc8e1" */
32+
utf8proc_int32_t codepoint;
33+
34+
/* inputs that should *not* be combined* */
35+
check_compare("NFC", input, input, utf8proc_NFC(input), 1);
36+
utf8proc_encode_char(0x11c3, input+3);
37+
check_compare("NFC", input, input, utf8proc_NFC(input), 1);
38+
39+
/* inputs that *should* be combined (TCOUNT-1 chars starting at TBASE+1) */
40+
for (codepoint = 0x11a8; codepoint < 0x11c3; ++codepoint) {
41+
utf8proc_encode_char(codepoint, input+3);
42+
utf8proc_encode_char(0xc8e0 + (codepoint - 0x11a7), combined);
43+
check_compare("NFC", input, combined, utf8proc_NFC(input), 1);
44+
}
45+
}
46+
2847
int main(void)
2948
{
3049
issue128();
3150
issue102();
51+
issue317();
3252
#ifdef UNICODE_VERSION
3353
printf("Unicode version: Makefile has %s, has API %s\n", UNICODE_VERSION, utf8proc_unicode_version());
3454
check(!strcmp(UNICODE_VERSION, utf8proc_unicode_version()), "utf8proc_unicode_version mismatch");

utf8proc.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -684,7 +684,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_normalize_utf32(utf8proc_int32_t *b
684684
(hangul_sindex % UTF8PROC_HANGUL_TCOUNT) == 0) {
685685
utf8proc_int32_t hangul_tindex;
686686
hangul_tindex = current_char - UTF8PROC_HANGUL_TBASE;
687-
if (hangul_tindex >= 0 && hangul_tindex < UTF8PROC_HANGUL_TCOUNT) {
687+
if (hangul_tindex > 0 && hangul_tindex < UTF8PROC_HANGUL_TCOUNT) {
688688
*starter += hangul_tindex;
689689
starter_property = NULL;
690690
continue;

0 commit comments

Comments
 (0)